Python multiprocessing pool.map for multiple arguments
In the Python multiprocessing library, is there a variant of pool.map that supports multiple arguments?
```python
text = "test"

def harvester(text, case):
    X = case[0]
    text + str(X)

if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=6)
    case = RAW_DATASET
    pool.map(harvester(text, case), case, 1)
    pool.close()
    pool.join()
```
Is there a variant of pool.map which supports multiple arguments?
Python 3.3 includes the `pool.starmap()` method:
```python
#!/usr/bin/env python3
from functools import partial
from itertools import repeat
from multiprocessing import Pool, freeze_support

def func(a, b):
    return a + b

def main():
    a_args = [1, 2, 3]
    second_arg = 1
    with Pool() as pool:
        L = pool.starmap(func, [(1, 1), (2, 1), (3, 1)])
        M = pool.starmap(func, zip(a_args, repeat(second_arg)))
        N = pool.map(partial(func, b=second_arg), a_args)
        assert L == M == N

if __name__ == "__main__":
    freeze_support()
    main()
```
For older versions:
```python
#!/usr/bin/env python2
import itertools
from multiprocessing import Pool, freeze_support

def func(a, b):
    print a, b

def func_star(a_b):
    """Convert `f([1,2])` to `f(1,2)` call."""
    return func(*a_b)

def main():
    pool = Pool()
    a_args = [1, 2, 3]
    second_arg = 1
    pool.map(func_star, itertools.izip(a_args, itertools.repeat(second_arg)))

if __name__ == "__main__":
    freeze_support()
    main()
```
Output:
```
1 1
2 1
3 1
```
Note how `itertools.izip()` and `itertools.repeat()` are used here.
Due to the bug mentioned by @unutbu, you can't use `functools.partial()` or similar capabilities on Python 2.6, so the simple wrapper function `func_star()` has to be defined explicitly.
The answer is version- and situation-dependent. The most general answer for recent versions of Python (since 3.3) was first described by J. F. Sebastian.¹ It uses the `Pool.starmap` method, which accepts a sequence of argument tuples, automatically unpacks each tuple, and passes the arguments to the given function:
```python
import multiprocessing
from itertools import product

def merge_names(a, b):
    return '{} & {}'.format(a, b)

if __name__ == '__main__':
    names = ['Brown', 'Wilson', 'Bartlett', 'Rivera', 'Molloy', 'Opie']
    with multiprocessing.Pool(processes=3) as pool:
        results = pool.starmap(merge_names, product(names, repeat=2))
    print(results)
    # Output: ['Brown & Brown', 'Brown & Wilson', 'Brown & Bartlett', ...
```
For earlier versions of Python, you'll need to write a helper function to unpack the arguments explicitly. If you want to use `with`, you'll also need to write a wrapper to turn `Pool` into a context manager:
```python
import multiprocessing
from itertools import product
from contextlib import contextmanager

def merge_names(a, b):
    return '{} & {}'.format(a, b)

def merge_names_unpack(args):
    return merge_names(*args)

@contextmanager
def poolcontext(*args, **kwargs):
    pool = multiprocessing.Pool(*args, **kwargs)
    yield pool
    pool.terminate()

if __name__ == '__main__':
    names = ['Brown', 'Wilson', 'Bartlett', 'Rivera', 'Molloy', 'Opie']
    with poolcontext(processes=3) as pool:
        results = pool.map(merge_names_unpack, product(names, repeat=2))
    print(results)
    # Output: ['Brown & Brown', 'Brown & Wilson', 'Brown & Bartlett', ...
```
In simpler cases, with a fixed second argument, you can also use `partial`, but only in Python 2.7+:
```python
import multiprocessing
from functools import partial
from contextlib import contextmanager

@contextmanager
def poolcontext(*args, **kwargs):
    pool = multiprocessing.Pool(*args, **kwargs)
    yield pool
    pool.terminate()

def merge_names(a, b):
    return '{} & {}'.format(a, b)

if __name__ == '__main__':
    names = ['Brown', 'Wilson', 'Bartlett', 'Rivera', 'Molloy', 'Opie']
    with poolcontext(processes=3) as pool:
        results = pool.map(partial(merge_names, b='Sons'), names)
    print(results)
    # Output: ['Brown & Sons', 'Wilson & Sons', 'Bartlett & Sons', ...
```
1. Much of this was inspired by his answer, which should probably have been accepted instead. But since this one is stuck at the top, it seemed best to improve it for future readers.
I think the following will be better:
```python
def multi_run_wrapper(args):
    return add(*args)

def add(x, y):
    return x + y

if __name__ == "__main__":
    from multiprocessing import Pool
    pool = Pool(4)
    results = pool.map(multi_run_wrapper, [(1, 2), (2, 3), (3, 4)])
    print results
```
Output:
```
[3, 5, 7]
```
Using Python 3.3+ with `pool.starmap()`:
```python
from multiprocessing.dummy import Pool as ThreadPool

def write(i, x):
    print(i, "---", x)

a = ["1", "2", "3"]
b = ["4", "5", "6"]

pool = ThreadPool(2)
pool.starmap(write, zip(a, b))
pool.close()
pool.join()
```
Result:
```
1 --- 4
2 --- 5
3 --- 6
```
You can also `zip()` more arguments if you like: `zip(a, b, c)`.
If you want a constant value passed as an argument, you have to use `itertools.repeat()`, as sketched below.
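For instance, here is a minimal sketch (extending the `write` example above with a hypothetical third parameter) that combines `zip()` with `itertools.repeat()` to pass the same constant to every call:

```python
from itertools import repeat
from multiprocessing.dummy import Pool as ThreadPool

def write(i, x, constant):
    # The third argument is the same for every call
    print(i, "---", x, "---", constant)

a = ["1", "2", "3"]
b = ["4", "5", "6"]

pool = ThreadPool(2)
# zip() stops at the shortest iterable, so the infinite repeat() is safe here
pool.starmap(write, zip(a, b, repeat("fixed")))
pool.close()
pool.join()
```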
After learning about itertools in J. F. Sebastian's answer, I decided to take it a step further and write a parmap package that takes care of parallelization, offering `map` and `starmap` functions that can take any number of positional arguments.
Installation:
```
pip install parmap
```
How to parallelize:
```python
import parmap

# If you want to do:
y = [myfunction(x, argument1, argument2) for x in mylist]
# In parallel:
y = parmap.map(myfunction, mylist, argument1, argument2)

# If you want to do:
z = [myfunction(x, y, argument1, argument2) for (x, y) in mylist]
# In parallel:
z = parmap.starmap(myfunction, mylist, argument1, argument2)

# If you want to do:
listx = [1, 2, 3, 4, 5, 6]
listy = [2, 3, 4, 5, 6, 7]
param1 = 3.14
param2 = 42
listz = []
for (x, y) in zip(listx, listy):
    listz.append(myfunction(x, y, param1, param2))
# In parallel:
listz = parmap.starmap(myfunction, zip(listx, listy), param1, param2)
```
I have uploaded parmap to PyPI and to a GitHub repository.
As an example, the question can be answered as follows:
```python
import parmap

def harvester(case, text):
    X = case[0]
    text + str(X)

if __name__ == "__main__":
    case = RAW_DATASET  # assuming this is an iterable
    parmap.map(harvester, case, "test", chunksize=1)
```
Another option is pathos, a fork of multiprocessing whose pool `map` mirrors the API of Python's built-in `map` and therefore can take multiple argument sequences directly:

```python
Python 2.7.5 (default, Sep 30 2013, 20:15:49)
[GCC 4.2.1 (Apple Inc. build 5566)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> def func(a, b):
...     print a, b
...
>>>
>>> from pathos.multiprocessing import ProcessingPool
>>> pool = ProcessingPool(nodes=4)
>>> pool.map(func, [1, 2, 3], [1, 1, 1])
1 1
2 1
3 1
[None, None, None]
>>>
>>> # also can pickle stuff like lambdas
>>> result = pool.map(lambda x: x**2, range(10))
>>> result
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
>>>
>>> # also does asynchronous map
>>> result = pool.amap(pow, [1, 2, 3], [4, 5, 6])
>>> result.get()
[1, 32, 729]
>>>
>>> # or can return a map iterator
>>> result = pool.imap(pow, [1, 2, 3], [4, 5, 6])
>>> result
<processing.pool.IMapIterator object at 0x110c2ffd0>
>>> list(result)
[1, 32, 729]
```
You can use the following two functions to avoid writing a wrapper for each new function:
```python
import itertools
from multiprocessing import Pool

def universal_worker(input_pair):
    function, args = input_pair
    return function(*args)

def pool_args(function, *args):
    return zip(itertools.repeat(function), zip(*args))
```
Use the function `function` with the argument lists `arg_0`, `arg_1`, and `arg_2` as follows:
```python
pool = Pool(n_core)
list_model = pool.map(universal_worker, pool_args(function, arg_0, arg_1, arg_2))
pool.close()
pool.join()
```
A better approach is to use a decorator instead of writing a wrapper function by hand. Especially when you have a lot of functions to map, a decorator saves time by avoiding a wrapper for every function. Usually a decorated function is not picklable, but we can use `functools` to get around that.
Here is an example:
```python
def unpack_args(func):
    from functools import wraps
    @wraps(func)
    def wrapper(args):
        if isinstance(args, dict):
            return func(**args)
        else:
            return func(*args)
    return wrapper

@unpack_args
def func(x, y):
    return x + y
```
Then you can map it with zipped arguments:
```python
from multiprocessing import Pool

np, xlist, ylist = 2, range(10), range(10)
pool = Pool(np)
res = pool.map(func, zip(xlist, ylist))
pool.close()
pool.join()
```
Of course, you can always use `Pool.starmap` in Python 3 (>= 3.3) as mentioned in other answers.
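For comparison, a minimal sketch of the same zipped mapping with `Pool.starmap`, where no decorator is needed because each tuple is unpacked automatically:

```python
from multiprocessing import Pool

def func(x, y):
    return x + y

if __name__ == '__main__':
    with Pool(2) as pool:
        # Each (x, y) tuple is unpacked into func's two arguments
        res = pool.starmap(func, zip(range(10), range(10)))
    print(res)  # [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
```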
Another simple alternative is to wrap your function parameters in a tuple, and then wrap the parameters that should be passed in tuples as well. This may not be ideal when dealing with large pieces of data; I believe it makes a copy for each tuple.
```python
from multiprocessing import Pool

def f((a, b, c, d)):
    print a, b, c, d
    return a + b + c + d

if __name__ == '__main__':
    p = Pool(10)
    data = [(i+0, i+1, i+2, i+3) for i in xrange(10)]
    print(p.map(f, data))
    p.close()
    p.join()
```
This gives the output in some random order:
```
0 1 2 3
1 2 3 4
2 3 4 5
3 4 5 6
4 5 6 7
5 6 7 8
7 8 9 10
6 7 8 9
8 9 10 11
9 10 11 12
[6, 10, 14, 18, 22, 26, 30, 34, 38, 42]
```
A better way to do it for Python 2:
```python
from multiprocessing import Pool

def func((i, (a, b))):
    print i, a, b
    return a + b

pool = Pool(3)
pool.map(func, [(0, (1, 2)), (1, (2, 3)), (2, (3, 4))])
```
```
2 3 3
1 2 2
0 1 1

Out[]: [3, 5, 7]
```
#"如何接受多个参数"。
```python
def f1(args):
    a, b, c = args[0], args[1], args[2]
    return a + b + c

if __name__ == "__main__":
    import multiprocessing
    pool = multiprocessing.Pool(4)
    result1 = pool.map(f1, [[1, 2, 3]])
    print(result1)
```
Another way is to pass a list of lists to a one-argument routine:
```python
import os
from multiprocessing import Pool

def task(args):
    print "PID =", os.getpid(), ", arg1 =", args[0], ", arg2 =", args[1]

pool = Pool()
pool.map(task, [[1, 2], [3, 4], [5, 6], [7, 8]])
```
We can then construct the list of arguments in whatever way we like; a sketch follows.
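For instance, a small sketch (reusing `task` and `pool` from the example above) that builds the same argument list from two separate sequences:

```python
arg1_values = [1, 3, 5, 7]
arg2_values = [2, 4, 6, 8]
# Build [[1, 2], [3, 4], [5, 6], [7, 8]] by pairing the sequences
args = [list(pair) for pair in zip(arg1_values, arg2_values)]
pool.map(task, args)
```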
Starting with Python 3.4.4, you can use `multiprocessing.get_context()` to obtain a context object that supports multiple start methods:
```python
import multiprocessing as mp

def foo(q, h, w):
    q.put(h + ' ' + w)
    print(h + ' ' + w)

if __name__ == '__main__':
    ctx = mp.get_context('spawn')
    q = ctx.Queue()
    p = ctx.Process(target=foo, args=(q, 'hello', 'world'))
    p.start()
    print(q.get())
    p.join()
```
Or you can simply replace
```python
pool.map(harvester(text, case), case, 1)
```
with:
```python
pool.apply_async(harvester(text, case), case, 1)
```
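Note, however, that `apply_async` expects the callable and its arguments separately; as written, `harvester(text, case)` is still evaluated in the parent process. A minimal corrected sketch (assuming `case` is an iterable of work items) would submit each item like this:

```python
# Submit one task per item and collect the results afterwards
results = [pool.apply_async(harvester, (text, c)) for c in case]
output = [r.get() for r in results]
```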
There are many answers here, but none seem to provide Python 2/3-compatible code that works on either version. If you just want your code to work, this will do for either Python version:
```python
import sys
import multiprocessing

# For python 2/3 compatibility, define pool context manager
# to support the 'with' statement in Python 2
if sys.version_info[0] == 2:
    from contextlib import contextmanager

    @contextmanager
    def multiprocessing_context(*args, **kwargs):
        pool = multiprocessing.Pool(*args, **kwargs)
        yield pool
        pool.terminate()
else:
    multiprocessing_context = multiprocessing.Pool
```
After that, you can use multiprocessing the regular Python 3 way, however you like. For example:
```python
def _function_to_run_for_each(x):
    return x.lower()

with multiprocessing_context(processes=3) as pool:
    results = pool.map(_function_to_run_for_each, ['Bob', 'Sue', 'Tim'])
print(results)
```
will work in either Python 2 or Python 3.
The official documentation states that it supports only one iterable argument. I like to use apply_async in such cases. In your case, I would do:
```python
from multiprocessing import Process, Pool, Manager

text = "test"

def harvester(text, case, q=None):
    X = case[0]
    res = text + str(X)
    if q:
        q.put(res)
    return res

def block_until(q, results_queue, until_counter=0):
    i = 0
    while i < until_counter:
        results_queue.put(q.get())
        i += 1

if __name__ == '__main__':
    pool = Pool(processes=6)
    case = RAW_DATASET
    m = Manager()
    q = m.Queue()
    results_queue = m.Queue()  # when it completes, the results will reside in this queue
    blocking_process = Process(target=block_until, args=(q, results_queue, len(case)))
    blocking_process.start()
    for c in case:
        try:
            res = pool.apply_async(harvester, (text, c, q))
            res.get(timeout=0.1)
        except:
            pass
    blocking_process.join()
```
Another generic approach is an `unpack` helper that calls the first element of each tuple with the remaining elements as its arguments:

```python
import multiprocessing

text = "test"

def unpack(args):
    return args[0](*args[1:])

def harvester(text, case):
    X = case[0]
    text + str(X)

if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=6)
    case = RAW_DATASET
    # args is a list of tuples
    # with the function to execute as the first item in each tuple
    args = [(harvester, text, c) for c in case]
    # doing it this way, we can pass any function
    # and we don't need to define a wrapper for each different function
    # if we need to use more than one
    pool.map(unpack, args)
    pool.close()
    pool.join()
```
This is an example of the routine I use to pass multiple arguments to a one-argument function used in a pool.imap fork:
```python
from multiprocessing import Pool

# Wrapper of the function to map:
class makefun:
    def __init__(self, var2):
        self.var2 = var2
    def fun(self, i):
        var2 = self.var2
        return var1[i] + var2

# Couple of variables for the example:
var1 = [1, 2, 3, 5, 6, 7, 8]
var2 = [9, 10, 11, 12]

# Open the pool:
pool = Pool(processes=2)

# Wrapper loop
for j in range(len(var2)):
    # Obtain the function to map
    pool_fun = makefun(var2[j]).fun
    # Fork loop
    for i, value in enumerate(pool.imap(pool_fun, range(len(var1))), 0):
        print(var1[i], '+', var2[j], '=', value)

# Close the pool
pool.close()
```
For Python 2, you can use this trick:
```python
import multiprocessing

def fun(a, b):
    return a + b

pool = multiprocessing.Pool(processes=6)
b = 233
pool.map(lambda x: fun(x, b), range(1000))
```
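A caveat: the standard `multiprocessing.Pool` pickles the mapped callable, and lambdas are not picklable, so this trick only works with pools that don't pickle, such as the thread-based `multiprocessing.dummy` pool shown earlier (or pathos). A minimal sketch with the thread pool:

```python
from multiprocessing.dummy import Pool  # thread pool; the callable is not pickled

def fun(a, b):
    return a + b

pool = Pool(processes=6)
b = 233
print(pool.map(lambda x: fun(x, b), range(10)))
```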