Get values from corresponding row CSV Python
我有多个这样的csv文件:
CSV1:
h1,h2,h3
aa,34,bd9
bb,459,jg0
CSV2:
h1,h5,h2
aa,rg,87
aa,gru,90
bb,sf,459
对于标题为h1的列0中的每个值,我想从文件夹中的所有csv文件中获取其相应的h2值。样本输出可以是
csv1: (aa,34),(bb,459)
csv2: (aa,87,90),(bb,459)
我有点不知道该怎么做。
我不想用 pandas。
pps-我可以通过硬编码列0的值来实现,但我不想这样做,因为有数百行。
这是我试过的一小段代码。它把"aa"的h2值打印在不同的行上,而我想把它们打印在同一行。
import csv

# Print the h1 and h2 values of every data row, one row per output line.
with open("test1/sample.csv") as sample_file:
    for record in csv.DictReader(sample_file, delimiter=","):
        print(record["h1"], record["h2"])
import csv
import glob
import os
from collections import defaultdict


def collect_h1_h2_pairs(path):
    """Collect the (h1, h2) value pairs of every CSV file in a folder.

    Args:
        path: Folder that is searched (non-recursively) for ``*.csv`` files.

    Returns:
        A ``defaultdict(list)`` mapping each file name (without the folder
        part) to a list of ``[h1_value, h2_value]`` pairs, one per data row,
        in file order. Files whose header lacks ``h1`` or ``h2`` (or that are
        empty) are skipped and get no entry.
    """
    pairs = defaultdict(list)
    # Glob inside ``path`` so the names we find are the files we open.
    for csv_path in glob.glob(os.path.join(path, "*.csv")):
        # newline="" is what the csv module expects for file objects.
        with open(csv_path, newline="") as f:
            reader = csv.reader(f, delimiter=",")
            # Parse the header with the csv reader too, so quoting rules
            # are applied consistently to header and data rows.
            header = next(reader, None)
            # Make sure we have both columns before going further.
            if header is None or "h1" not in header or "h2" not in header:
                continue
            h1 = header.index("h1")
            h2 = header.index("h2")
            name = os.path.basename(csv_path)
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to read.
                    continue
                pairs[name].append([row[h1], row[h2]])
    return pairs


path = "path_to_folder"
d = collect_h1_h2_pairs(path)
print(d)
# Output for the two sample files:
# defaultdict(<class 'list'>, {'csv1.csv': [['aa', '34'], ['bb', '459']], 'csv2.csv': [['aa', '87'], ['aa', '90'], ['bb', '459']]})
这是一个快速草稿,它假定所有文件都以逗号(`,`)分隔。
要获取一组唯一值,我们可以使用set和set.update:
def collect_unique_values(path):
    """Collect the distinct h1 and h2 values of every CSV file in a folder.

    Args:
        path: Folder that is searched (non-recursively) for ``*.csv`` files.

    Returns:
        A ``defaultdict(set)`` mapping each file name (without the folder
        part) to the set of all values found in its ``h1`` and ``h2``
        columns. Files whose header lacks ``h1`` or ``h2`` (or that are
        empty) are skipped and get no entry.
    """
    unique = defaultdict(set)  # change to set
    # Glob inside ``path`` so the names we find are the files we open.
    for csv_path in glob.glob(os.path.join(path, "*.csv")):
        with open(csv_path, newline="") as f:
            reader = csv.reader(f, delimiter=",")
            header = next(reader, None)
            if header is None or "h1" not in header or "h2" not in header:
                continue
            h1 = header.index("h1")
            h2 = header.index("h2")
            name = os.path.basename(csv_path)
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to read.
                    continue
                unique[name].update([row[h1], row[h2]])  # set.update
    return unique


# Uses the same imports (csv, glob, os, defaultdict) as the list-based version.
path = "path_to_folder"
d = collect_unique_values(path)
print(d)
# Output for the two sample files (set ordering may differ):
# defaultdict(<class 'set'>, {'csv1.csv': {'459', '34', 'bb', 'aa'}, 'csv2.csv': {'459', '90', '87', 'bb', 'aa'}})
如果您确定始终有h1和h2,可以将代码简化为:
def collect_unique_values_strict(path):
    """Collect distinct h1/h2 values, assuming every file has both columns.

    Args:
        path: Folder that is searched (non-recursively) for ``*.csv`` files.

    Returns:
        A ``defaultdict(set)`` mapping each file name (without the folder
        part) to the set of all values found in its ``h1`` and ``h2``
        columns.

    Raises:
        ValueError: If a file's header does not contain ``h1`` or ``h2``
            (an empty file is treated as having an empty header).
    """
    unique = defaultdict(set)
    # Glob inside ``path`` so the names we find are the files we open.
    for csv_path in glob.glob(os.path.join(path, "*.csv")):
        with open(csv_path, newline="") as f:
            reader = csv.reader(f, delimiter=",")
            # An empty file yields an empty header, so the lookups below
            # fail with ValueError instead of a bare StopIteration.
            header = next(reader, [])
            h1 = header.index("h1")
            h2 = header.index("h2")
            name = os.path.basename(csv_path)
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to read.
                    continue
                unique[name].update([row[h1], row[h2]])
    return unique


path = "path/"
d = collect_unique_values_strict(path)
最后,如果要保持元素被找到的顺序,我们不能使用集合,因为它们是无序的,因此我们需要检查是否其中一个元素已经在列表中:
def collect_ordered_unique_values(path):
    """Collect distinct h1/h2 values per CSV file in first-seen order.

    Args:
        path: Folder that is searched (non-recursively) for ``*.csv`` files.

    Returns:
        A ``defaultdict(list)`` mapping each file name (without the folder
        part) to its distinct ``h1`` and ``h2`` values, ordered by first
        appearance (within a row, ``h1`` comes before ``h2``).

    Raises:
        ValueError: If a file's header does not contain ``h1`` or ``h2``
            (an empty file is treated as having an empty header).
    """
    ordered = defaultdict(list)
    # Glob inside ``path`` so the names we find are the files we open.
    for csv_path in glob.glob(os.path.join(path, "*.csv")):
        name = os.path.basename(csv_path)
        # Side set gives O(1) "already seen?" checks; the list keeps order.
        seen = set()
        with open(csv_path, newline="") as f:
            reader = csv.reader(f, delimiter=",")
            header = next(reader, [])
            h1 = header.index("h1")
            h2 = header.index("h2")
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to read.
                    continue
                for value in (row[h1], row[h2]):
                    if value not in seen:
                        seen.add(value)
                        ordered[name].append(value)
    return ordered


# Uses the same imports (csv, glob, os, defaultdict) as the earlier versions.
path = "path/"
d = collect_ordered_unique_values(path)  # values are lists again, not sets
print(d)
# Output for the two sample files:
# defaultdict(<class 'list'>, {'csv2.csv': ['aa', '87', '90', 'bb', '459'], 'csv1.csv': ['aa', '34', 'bb', '459']})