Python MemoryError - Is there a more efficient way of working with my huge CSV file?

[Using Python 3.3] I have a huge CSV file containing XX million rows and a handful of columns. I want to read the file, add a couple of calculated columns, and spit out several "segmented" CSV files. I tried the code below on a smaller test file and it does exactly what I want. But now that I'm loading the original CSV file (roughly 3.2 GB), I get a MemoryError. Is there a more memory-efficient way to write the code below?

Please note that I'm new to Python, so there is probably a lot I'm not aware of.

Example input data:

email               cc  nr_of_transactions  last_transaction_date   timebucket  total_basket
email1@email.com    us  2                   datetime value          1           20.29
email2@email.com    gb  3                   datetime value          2           50.84
email3@email.com    ca  5                   datetime value          3           119.12
...                 ... ...                 ...                     ...         ...

Here is my code:

import csv
import scipy.stats as stats
import itertools
from operator import itemgetter


def add_rankperc(filename):
    '''
    Function that calculates percentile rank of total basket value of a user (i.e. email) within a country. Next, it assigns the user to a rankbucket based on its percentile rank, using the following rules:
     Percentage rank between 75 and 100 -> top25
     Percentage rank between 25 and 74  -> mid50
     Percentage rank between 0 and 24   -> bottom25
    '''


    # Defining headers for ease of use/DictReader
    headers = ['email', 'cc', 'nr_transactions', 'last_transaction_date', 'timebucket', 'total_basket']
    groups = []

    with open(filename, encoding='utf-8', mode='r') as f_in:
        # Input file is tab-separated, hence dialect='excel-tab'
        r = csv.DictReader(f_in, dialect='excel-tab', fieldnames=headers)
        # DictReader reads all dict values as strings, converting total_basket to a float
        dict_list = []
        for row in r:
            row['total_basket'] = float(row['total_basket'])
            # Append row to a list (of dictionaries) for further processing
            dict_list.append(row)

    # Groupby function on cc and total_basket
    for key, group in itertools.groupby(sorted(dict_list, key=itemgetter('cc', 'total_basket')), key=itemgetter('cc')):
        rows = list(group)
        for row in rows:
            # Calculates the percentile rank for each value for each country
            row['rankperc'] = stats.percentileofscore([row['total_basket'] for row in rows], row['total_basket'])
            # Percentage rank between 75 and 100 -> top25
            if 75 <= row['rankperc'] <= 100:
                row['rankbucket'] = 'top25'
            # Percentage rank between 25 and 74 -> mid50
            elif 25 <= row['rankperc'] < 75:
                row['rankbucket'] = 'mid50'
            # Percentage rank between 0 and 24 -> bottom25
            else:
                row['rankbucket'] = 'bottom25'
            # Appending all rows to a list to be able to return it and use it in another function
            groups.append(row)
    return groups


def filter_n_write(data):
    '''
    Function takes input data, groups by specified keys and outputs only the e-mail addresses to csv files as per the respective grouping.
    '''


    # Creating group iterator based on keys
    for key, group in itertools.groupby(sorted(data, key=itemgetter('timebucket', 'rankbucket')), key=itemgetter('timebucket', 'rankbucket')):
        # List comprehension to create a list of lists of email addresses. One row corresponds to the respective combination of grouping keys.
        emails = list([row['email'] for row in group])
        # Dynamically naming output file based on grouping keys
        f_out = 'output-{}-{}.csv'.format(key[0], key[1])
        with open(f_out, encoding='utf-8', mode='w') as fout:
            w = csv.writer(fout, dialect='excel', lineterminator='\n')
            # Writerows using list comprehension to write each email in emails iterator (i.e. one address per row). Wrapping email in brackets to write full address in one cell.
            w.writerows([email] for email in emails)

filter_n_write(add_rankperc('infile.tsv'))

Thanks in advance!


The pandas library (http://pandas.pydata.org/) has very nice, fast CSV reading capabilities (http://pandas.pydata.org/pandas-docs/stable/io.html#io-read-csv-table). As an added bonus, you get your data in numpy arrays, which makes computing percentiles very easy.
This question discusses reading a large CSV file in chunks with pandas.
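
For illustration, a rough sketch of what that could look like follows, assuming the tab-separated layout from the question and that the columns actually needed for the calculation fit in memory once the rest are dropped. The chunk size, the column selection, and the use of rank(pct=True) in place of scipy's percentileofscore are my own assumptions, not part of this answer:

import pandas as pd

# Column names from the question; the input is tab-separated with no header row.
headers = ['email', 'cc', 'nr_transactions', 'last_transaction_date',
           'timebucket', 'total_basket']

# Read in chunks so the 3.2 GB file never has to be parsed in one go, and keep
# only the columns the calculation actually needs.
chunks = pd.read_csv('infile.tsv', sep='\t', names=headers,
                     usecols=['email', 'cc', 'timebucket', 'total_basket'],
                     chunksize=500000)
df = pd.concat(chunks, ignore_index=True)

# Percentile rank of total_basket within each country, scaled to 0-100.
df['rankperc'] = df.groupby('cc')['total_basket'].rank(pct=True) * 100

# Bucket the ranks with the same thresholds as in the question.
df['rankbucket'] = pd.cut(df['rankperc'], bins=[0, 25, 75, 100],
                          labels=['bottom25', 'mid50', 'top25'],
                          include_lowest=True).astype(str)

# One output file of e-mail addresses per (timebucket, rankbucket) combination.
for (tb, rb), grp in df.groupby(['timebucket', 'rankbucket']):
    grp[['email']].to_csv('output-{}-{}.csv'.format(tb, rb),
                          index=False, header=False)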


I agree with Inbar Rose that using database functionality would be a better way to attack this problem. Assuming we do need to answer the question as you posed it, though - I think we can, at the cost of some speed.

You're probably running out of memory while building the list of all the rows' dictionaries. We can work around this by only considering a subset of the rows at a time.

Here is the code for the first step - roughly your add_rankperc function:

import csv
from scipy.stats import percentileofscore
from operator import itemgetter

# Run through the whole file once, saving each row to a file corresponding to
# its 'cc' column
cc_dict = {}
with open(input_path, encoding="utf-8", mode='r') as infile:
  csv_reader = csv.reader(infile, dialect="excel-tab")
  for row in csv_reader:
    cc = row[1]
    if cc not in cc_dict:
      intermediate_path ="intermediate_cc_{}.txt".format(cc)
      outfile = open(intermediate_path, mode='w', newline='')
      csv_writer = csv.writer(outfile)
      cc_dict[cc] = (intermediate_path, outfile, csv_writer)
    _ = cc_dict[cc][2].writerow(row)

# Close the output files
for cc in cc_dict.keys():
  cc_dict[cc][1].close()

# Run through the whole file once for each 'cc' value
for cc in cc_dict.keys():
  intermediate_path = cc_dict[cc][0]
  with open(intermediate_path, mode='r', newline='') as infile:
    csv_reader = csv.reader(infile)
    # Pick out all of the rows with the 'cc' value under consideration
    group = [row for row in csv_reader if row[1] == cc]
    # Get the 'total_basket' values for the group
    A_scores = [float(row[5]) for row in group]
    for row in group:
      # Compute this row's 'total_basket' score based on the rest of the
      # group's
      p = percentileofscore(A_scores, float(row[5]))
      row.append(p)
      # Categorize the score
      bucket = ("bottom25" if p < 25 else ("mid50" if p < 75 else"top100"))
      row.append(bucket)
  # Save the augmented rows to an intermediate file
  with open(output_path, mode='a', newline='') as outfile:
    csv_writer = csv.writer(outfile)
    csv_writer.writerows(group)

46 million rows is a lot, so this will probably be slow. I avoided the csv module's DictReader functionality and just index into the rows directly, to avoid that overhead. I also compute the first argument to percentileofscore only once per group, rather than once for every row in the group.

If this works, then I think you can apply the same idea to the filter_n_write function - run through the resulting intermediate file once, picking out the (timebucket, rank) pairs that occur. Then go through the intermediate file again, once for each pair.
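
For what it's worth, a sketch of that second step might look like the following. The file name and the column positions (e-mail in column 0, timebucket in column 4, the appended rankbucket in column 7) are assumptions based on the code above, not something spelled out here:

import csv

# Path of the augmented file written by the code above; the name is assumed.
output_path = 'augmented.csv'

# Pass 1: collect the distinct (timebucket, rankbucket) pairs that occur.
pairs = set()
with open(output_path, mode='r', newline='') as infile:
    for row in csv.reader(infile):
        pairs.add((row[4], row[7]))

# Pass 2: re-read the augmented file once per pair, writing only the e-mails.
for timebucket, rankbucket in pairs:
    out_path = 'output-{}-{}.csv'.format(timebucket, rankbucket)
    with open(out_path, mode='w', newline='') as outfile:
        writer = csv.writer(outfile)
        with open(output_path, mode='r', newline='') as infile:
            for row in csv.reader(infile):
                if row[4] == timebucket and row[7] == rankbucket:
                    # The e-mail address is still the first column of each row.
                    writer.writerow([row[0]])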