Create dictionary of value frequencies from numpy array
我需要通过循环遍历一个两列的数字数组来创建一个字典。下面是该数组的一小部分:
1 2 3 4 5 | array([[ 0, 1], [ 1, 0], [ 1, 2], [ 2, 3], [ 2, 1]]) |
我想创建一个字典,它将第一列的唯一数字作为键(例如本例中的0,1,2),将第二列的相应数字作为值。
对于本例,字典如下所示:
1 | dict = {0:[1], 1:[0,2], 2:[3,1]} |
我的数组非常长(370000 x 2),所以我想用一种高效的方式(而不是低效的循环)来完成这项工作。任何建议都将不胜感激!
您可以使用 `collections.defaultdict`:
1 2 3 4 5 | from collections import defaultdict a = np.array([[ 0, 1],[ 1, 0],[ 1, 2],[ 2, 3], [ 2, 1]]) d = defaultdict(list) for x,y in a: d[x].append(y) |
纯 NumPy 的解决方案:
1 2 3 4 | b=a[np.lexsort(a.T[::-1])] # if necessary. keys,values=b.T uniq,steps=np.unique(keys,return_index =True) bins=np.split(values,steps[1:]) |
如果键恰好是 0 到 n-1 的连续整数(即 `uniq` 等于 `range(len(uniq))`),可以直接用 `bins[k]` 按键索引,无需字典;否则:
1 2 | d=dict(zip(uniq,bins)) #{0: array([1]), 1: array([0, 2]), 2: array([1, 3])} |
这样即可构建出您的字典。
如果第一列是"带重复的范围"(即已排序,且从 0 到最大值之间的每个整数都至少出现一次):
1 2 | steps_at = np.searchsorted(a[:,0], np.arange(a[-1,0]+1)) result = {k:v for k,v in zip(a[steps_at,0], np.split(a[:,1], steps_at[1:]))} |
如果第一列中相等的项彼此相邻(已分组),但各组之间未排序:
1 2 | steps_at = np.where(np.diff(np.r_[np.nan, a[:,0]]))[0] return {k:v for k,v in zip(a[steps_at,0], np.split(a[:,1], steps_at[1:]))} |
。
一般情况
1 2 3 4 | ind = np.argsort(a[:, 0], kind='mergesort') aa = a[ind, 0] steps_at = np.where(np.diff(np.r_[np.nan, aa]))[0] return {k:v for k,v in zip(aa[steps_at], np.split(a[ind,1], steps_at[1:]))} |
性能对比(基准测试)结果:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | (19, 2) correctness Psidom {0: [0, 28, 38, 97, 99, 65, 73], 1: [64, 91, 70, 40, 9], 2: [94, 96, 69, 46], 3: [85, 15, 65]} Daniel_Jimenez defaultdict(<class 'list'>, {0: [0, 28, 38, 97, 99, 65, 73], 1: [64, 91, 70, 40, 9], 2: [94, 96, 69, 46], 3: [85, 15, 65]}) Jean_Francois_Fabre {0: [0, 28, 38, 97, 99, 65, 73], 1: [64, 91, 70, 40, 9], 2: [94, 96, 69, 46], 3: [85, 15, 65]} Alexandre_Kempf {0: array([ 0, 28, 38, 97, 99, 65, 73]), 1: array([64, 91, 70, 40, 9]), 2: array([94, 96, 69, 46]), 3: array([85, 15, 65])} Or_Duan {0: [0, 28, 38, 97, 99, 65, 73], 1: [64, 91, 70, 40, 9], 2: [94, 96, 69, 46], 3: [85, 15, 65]} Paul_Panzer_sorted {0: array([ 0, 28, 38, 97, 99, 65, 73]), 1: array([64, 91, 70, 40, 9]), 2: array([94, 96, 69, 46]), 3: array([85, 15, 65])} Paul_Panzer_grouped {0: array([ 0, 28, 38, 97, 99, 65, 73]), 1: array([64, 91, 70, 40, 9]), 2: array([94, 96, 69, 46]), 3: array([85, 15, 65])} Paul_Panzer_general {0: array([ 0, 28, 38, 97, 99, 65, 73]), 1: array([64, 91, 70, 40, 9]), 2: array([94, 96, 69, 46]), 3: array([85, 15, 65])} B_M_sorted {0: array([ 0, 28, 38, 97, 99, 65, 73]), 1: array([64, 91, 70, 40, 9]), 2: array([94, 96, 69, 46]), 3: array([85, 15, 65])} B_M_general {0: array([ 0, 28, 38, 65, 73, 97, 99]), 1: array([ 9, 40, 64, 70, 91]), 2: array([46, 69, 94, 96]), 3: array([15, 65, 85])} (40194, 2) speed (seconds used for 10 repeats) Psidom 0.4336233548820019 Daniel_Jimenez 0.3609276609495282 Jean_Francois_Fabre 0.17962428089231253 Alexandre_Kempf 3.5392782238777727 Or_Duan 0.1873011060524732 Paul_Panzer_sorted 0.08001555898226798 Paul_Panzer_grouped 0.08144942414946854 Paul_Panzer_general 0.10183193604461849 B_M_sorted 0.09192353091202676 B_M_general 0.16612185980193317 (400771, 2) speed (seconds used for 10 repeats) Psidom 3.968917251098901 Daniel_Jimenez 3.619185874937102 Jean_Francois_Fabre 1.7871235068887472 Or_Duan 1.9176530800759792 
Paul_Panzer_sorted 0.8291062880307436 Paul_Panzer_grouped 0.8662846579682082 Paul_Panzer_general 1.0812653130851686 B_M_sorted 1.031000167131424 B_M_general 2.16174431797117 Alexandre_Kempf 513.2718367418274 |
。
代码:
"""Benchmark several ways of grouping a two-column array into a dict.

Each contestant takes a 2-D array whose first column holds keys and whose
second column holds values, and returns a mapping ``key -> values``.
The contestants' algorithms are kept as originally posted so the timings
stay comparable; only the layout and the driver code were cleaned up.
"""
from collections import defaultdict
from itertools import groupby
import timeit

import numpy as np


# ``Psidom`` and ``Jean_Francois_Fabre`` were lambdas bound to names; they are
# plain functions now so that ``__name__`` is usable when printing results.
def Psidom(a):
    """Group consecutive rows by first column (input must be grouped by key)."""
    return {k: [v for _, v in g] for k, g in groupby(a, lambda x: x[0])}


def Daniel_Jimenez(a):
    """Append every value to a ``defaultdict(list)`` keyed by first column."""
    d = defaultdict(list)
    for x, y in a:
        d[x].append(y)
    return d


def Jean_Francois_Fabre(a):
    """Group consecutive rows by first column (input must be grouped by key)."""
    return {k: [i[1] for i in v] for k, v in groupby(a, lambda x: x[0])}


def Alexandre_Kempf(a):
    """Select the values of each unique key with a boolean mask.

    One full-array comparison is made per unique key, which is why this
    contestant is by far the slowest in the recorded timings.
    """
    keys = a[:, 0]
    items = a[:, 1]
    uniqkey = np.unique(keys)
    prelist = [items[keys == i] for i in uniqkey]
    dico = {}
    for i in np.arange(len(uniqkey)):
        dico[uniqkey[i]] = prelist[i]
    return dico


def Or_Duan(a):
    """Build a plain dict of lists, creating each list on first ``KeyError``."""
    default = {}
    for elm in a:
        try:
            default[elm[0]].append(elm[1])
        except KeyError:
            default[elm[0]] = [elm[1]]
    return default


def Paul_Panzer_sorted(a):
    """Split the value column at key boundaries found with ``searchsorted``.

    Looks up every integer in ``0 .. a[-1, 0]`` in the first column, so the
    first column is expected to be sorted and to contain each of them.
    """
    steps_at = np.searchsorted(a[:, 0], np.arange(a[-1, 0] + 1))
    return {k: v for k, v in zip(a[steps_at, 0],
                                 np.split(a[:, 1], steps_at[1:]))}


def Paul_Panzer_grouped(a):
    """Split the value column wherever the key changes between rows.

    Equal keys must be adjacent, but the groups need not be sorted.
    """
    # Prepending NaN makes the first difference NaN (non-zero), so row 0 is
    # always reported as the start of a group.
    steps_at = np.where(np.diff(np.r_[np.nan, a[:, 0]]))[0]
    return {k: v for k, v in zip(a[steps_at, 0],
                                 np.split(a[:, 1], steps_at[1:]))}


def Paul_Panzer_general(a):
    """Sort by key with ``kind='mergesort'``, then split at key changes."""
    ind = np.argsort(a[:, 0], kind='mergesort')
    aa = a[ind, 0]
    steps_at = np.where(np.diff(np.r_[np.nan, aa]))[0]
    return {k: v for k, v in zip(aa[steps_at],
                                 np.split(a[ind, 1], steps_at[1:]))}


def B_M_sorted(b):
    """Split values at the first index of each unique key (sorted input)."""
    keys, values = b.T
    uniq, steps = np.unique(keys, return_index=True)
    bins = np.split(values, steps[1:])
    return dict(zip(uniq, bins))


def B_M_general(a):
    """Lexsort rows by (key, value), then split as in ``B_M_sorted``.

    Because rows are ordered by value within a key, the values of each key
    come out sorted rather than in their original order.
    """
    b = a[np.lexsort(a.T[::-1])]
    keys, values = b.T
    uniq, steps = np.unique(keys, return_index=True)
    bins = np.split(values, steps[1:])
    return dict(zip(uniq, bins))


def _make_data(n_keys):
    """Return an (N, 2) array: keys 0..n_keys-1 each repeated 1-9 times, random values."""
    keys = np.arange(n_keys).repeat(np.random.randint(1, 10, n_keys))
    values = np.random.randint(100, size=keys.shape)
    return np.c_[keys, values]


def _time_all(funcs, data):
    """Print the time each function needs for 10 calls on ``data``."""
    for f in funcs:
        elapsed = timeit.timeit("f(a)", number=10, globals={'f': f, 'a': data})
        print(f"{f.__name__:<20}", elapsed)


def main():
    """Print each contestant's result on a tiny input, then time them."""
    fast = (Psidom, Daniel_Jimenez, Jean_Francois_Fabre, Or_Duan,
            Paul_Panzer_sorted, Paul_Panzer_grouped, Paul_Panzer_general,
            B_M_sorted, B_M_general)
    in_order = fast[:3] + (Alexandre_Kempf,) + fast[3:]

    t = _make_data(4)
    a = _make_data(8000)
    b = _make_data(80000)

    print(t.shape, 'correctness ')
    for f in in_order:
        print(f"{f.__name__:<20}", f(t))

    print(' ', a.shape, 'speed (seconds used for 10 repeats) ')
    _time_all(in_order, a)

    print(' ', b.shape, 'speed (seconds used for 10 repeats) ')
    # The slowest contestant goes last on the large input so the other
    # results are available without waiting for it.
    _time_all(fast + (Alexandre_Kempf,), b)


# Guarded so that importing this module does not start the (slow) benchmark.
if __name__ == "__main__":
    main()
一个简洁的单行写法:
1 2 3 4 5 6 7 8 9 | import itertools array = [[ 0, 1], [ 1, 0], [ 1, 2], [ 2, 3], [ 2, 1]] d = {k:[i[1] for i in v] for k,v in itertools.groupby(sorted(array),lambda x : x[0])} |
结果:
1 | {0: [1], 1: [0, 2], 2: [1, 3]} |
。
- 先对列表排序(如果尚未排序),再按每个元素的第一项分组
- 用字典推导式创建字典,只取每组元素的第二项来构建值列表
假设数组已按第一列排序,则可以使用groupby:
1 2 3 | from itertools import groupby {k: [v for _, v in g] for k, g in groupby(arr, lambda x: x[0])} # {0: [1], 1: [0, 2], 2: [3, 1]} |
1 2 3 4 5 | #arr = np.array([[ 0, 1], # [ 1, 0], # [ 1, 2], # [ 2, 3], # [ 2, 1]]) |
不依赖任何导入、使用 `try`/`except KeyError` 的纯 Python 写法:
1 2 3 4 5 6 7 8 9 10 11 | array = [[ 0, 1], [ 1, 0], [ 1, 2], [ 2, 3], [ 2, 1]] default = {} for elm in array: try: default[elm[0]].append(elm[1]) except KeyError: default[elm[0]] = [elm[1]] |
。
一种解决方案是这样做(如果a是您的数组):
1 2 3 4 5 6 7 | keys = a[:,0] items = a[:,1] uniqkey = np.unique(keys) prelist = [items[keys==i] for i in uniqkey] dico = {} for i in np.arange(len(uniqkey)): dico[uniqkey[i]] = prelist[i] |
。