关于性能：在c ++中读取由文本文件中的行分隔的数百万个整数的最有效方法是什么

What is the best efficient way to read millions of integers separated by lines from text file in c++

在我的文本文件中，有大约2500万个整数由行分隔。我的第一个任务是获取这些整数并对它们进行排序。实际上，我已经实现了读取整数并将其放入数组中(因为我的排序函数将未排序的数组作为参数)。然而，从文件中读取整数是一个非常长且昂贵的过程。我已经搜索了许多其他的解决方案来获得更便宜和更有效的方法，但是我找不到一个能够解决这种规模问题的解决方案。因此，您的建议是从这个巨大的(大约260MB)文本文件中读取整数。以及如何有效地获得相同问题的行数。

1
2
3
4
5
6
7
8
9
10

// Open the input file for formatted (text) extraction.
ifstream myFile("input.txt");

int currentNumber;
// The element count is hard-coded; the array below has exactly this many slots.
int nItems = 25000000;
// NOTE(review): the malloc result is not checked before it is used.
int *arr = (int*) malloc(nItems*sizeof(*arr));
int i = 0;
// Extract one integer per iteration until extraction fails.
// NOTE(review): i is never compared against nItems, so an input with more
// than nItems values would write past the end of arr.
while (myFile >> currentNumber)
{
arr[i++] = currentNumber;
}

这就是我从文本文件中获取整数的方法。这没那么复杂。我假设行数是固定的(实际上是固定的)

顺便说一下，当然不会太慢。它使用2.2Ghz i7处理器在OS X中完成大约9秒的读取。但我觉得会好很多。

相关讨论

最有可能的是，对这一点的任何优化都可能产生相当小的影响。在我的机器上，读取大文件的限制因素是磁盘传输速度。是的，提高阅读速度可以稍微提高一点，但很可能，你不会从中得到太多。

我在以前的一个测试中发现[我会看看是否能在其中找到答案-我在"experiment code for so"目录中找不到源]，最快的方法是使用mmap加载文件。但是它只比使用ifstream稍微快一点。

编辑：我自制的以几种不同方式读取文件的基准。读取文件时获取行，而不是读取整个文件，然后根据换行符拆分

与往常一样，基准测试只能衡量它实际测试的内容；对环境或代码编写方式的微小更改有时会产生很大的差异。

编辑：以下是"从文件中读取数字并将其存储在矢量中"的一些实现：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>

using namespace std;

// Path of the input file that every benchmark function below opens.
const char *file_name ="lots_of_numbers.txt";

// Benchmark 1 (baseline): extract integers with ifstream operator>> and
// append them to a vector that grows on demand.
// Prints the number of values read.
void func1()
{
    vector<int> v;
    int num;
    ifstream fin(file_name);
    while (fin >> num)
    {
        v.push_back(num);
    }
    // Trailing space in the label so the count is not glued to the text.
    cout << "Number of values read " << v.size() << endl;
}

// Benchmark 2: same as the operator>> + push_back approach, but the vector's
// capacity is reserved up front so push_back never has to reallocate for an
// input of up to 42336000 values.
// Prints the number of values read.
void func2()
{
    vector<int> v;
    v.reserve(42336000);
    int num;

    ifstream fin(file_name);
    while (fin >> num)
    {
        v.push_back(num);
    }
    // Trailing space in the label so the count is not glued to the text.
    cout << "Number of values read " << v.size() << endl;
}

// Benchmark 3: extract integers with ifstream operator>> into a raw,
// fixed-size heap array (kept raw on purpose: the benchmark compares it
// against the vector-based variants).
// Prints the number of values read; aborts if the input holds more values
// than the array can store.
void func3()
{
    const int capacity = 42336000;
    int *v = new int[capacity];
    int num;

    ifstream fin(file_name);
    int i = 0;
    while (fin >> num)
    {
        // Guard the fixed-size array against inputs larger than expected.
        if (i == capacity)
        {
            cout << "Error: input holds more than " << capacity << " values" << endl;
            exit(1);
        }
        v[i++] = num;
    }
    cout << "Number of values read " << i << endl;
    delete [] v;
}

// Benchmark 4: parse integers with fscanf("%d") into a raw, fixed-size heap
// array.
// Prints the number of values read. Reports an error and returns if the file
// cannot be opened; aborts if the input holds more values than the array can
// store.
void func4()
{
    const int capacity = 42336000;

    FILE *f = fopen(file_name, "r");
    if (f == NULL)
    {
        // Passing a null FILE* to fscanf is undefined behaviour.
        perror(file_name);
        return;
    }

    int *v = new int[capacity];
    int num;
    int i = 0;
    while (fscanf(f, "%d", &num) == 1)
    {
        // Guard the fixed-size array against inputs larger than expected.
        if (i == capacity)
        {
            cout << "Error: input holds more than " << capacity << " values" << endl;
            exit(1);
        }
        v[i++] = num;
    }
    cout << "Number of values read " << i << endl;
    fclose(f);
    delete [] v;
}

// Benchmark 5: read the file in 8 KiB chunks with ifstream::read and parse
// non-negative decimal integers by hand. Values are separated by '\n' or ' '.
// Prints the number of values read; any other character, or more values than
// the array can store, aborts the program.
void func5()
{
    const int capacity = 42336000;
    int *v = new int[capacity];

    ifstream fin(file_name);
    char buffer[8192];
    int i = 0;       // number of values stored so far
    int num = 0;     // value of the number currently being parsed
    int hasnum = 0;  // non-zero once at least one digit of num has been seen

    for (;;)
    {
        // A short final chunk sets failbit, so the stream state cannot be used
        // as the loop condition; gcount() says how many bytes are valid.
        fin.read(buffer, sizeof(buffer));
        const streamsize bytes = fin.gcount();
        if (bytes <= 0)
            break;

        for (const char *p = buffer; p != buffer + bytes; ++p)
        {
            if (*p == '\n' || *p == ' ')
            {
                if (hasnum)
                    v[i++] = num;
                num = 0;
                hasnum = 0;
            }
            else if (*p >= '0' && *p <= '9')
            {
                // Check capacity once, when a new number starts, so the stores
                // above and below are always in bounds.
                if (!hasnum && i == capacity)
                {
                    cout << "Error: input holds more than " << capacity << " values" << endl;
                    exit(1);
                }
                hasnum = 1;
                num = num * 10 + (*p - '0');
            }
            else
            {
                cout << "Error..." << endl;
                exit(1);
            }
        }
    }

    // A last number that is not followed by a separator still counts.
    if (hasnum)
        v[i++] = num;

    cout << "Number of values read " << i << endl;
    delete [] v;
}

// Benchmark 6: read the file in 8 KiB chunks with fread and parse
// non-negative decimal integers by hand. Values are separated by '\n' or ' '.
// Prints the number of values read. Reports an error and returns if the file
// cannot be opened; any unexpected character, or more values than the array
// can store, aborts the program.
void func6()
{
    const int capacity = 42336000;

    FILE *f = fopen(file_name, "r");
    if (f == NULL)
    {
        perror(file_name);
        return;
    }

    int *v = new int[capacity];
    char buffer[8192];
    int i = 0;       // number of values stored so far
    int num = 0;     // value of the number currently being parsed
    int hasnum = 0;  // non-zero once at least one digit of num has been seen

    // fread returns how many bytes were actually read; only those are scanned.
    size_t bytes;
    while ((bytes = fread(buffer, 1, sizeof(buffer), f)) > 0)
    {
        for (const char *p = buffer; p != buffer + bytes; ++p)
        {
            if (*p == '\n' || *p == ' ')
            {
                if (hasnum)
                    v[i++] = num;
                num = 0;
                hasnum = 0;
            }
            else if (*p >= '0' && *p <= '9')
            {
                // Check capacity once, when a new number starts, so the stores
                // above and below are always in bounds.
                if (!hasnum && i == capacity)
                {
                    cout << "Error: input holds more than " << capacity << " values" << endl;
                    exit(1);
                }
                hasnum = 1;
                num = num * 10 + (*p - '0');
            }
            else
            {
                cout << "Error..." << endl;
                exit(1);
            }
        }
    }

    // A last number that is not followed by a separator still counts.
    if (hasnum)
        v[i++] = num;

    fclose(f);
    cout << "Number of values read " << i << endl;
    delete [] v;
}

// Benchmark 7: read the file one character at a time with fgetc and parse
// non-negative decimal integers by hand. Values are separated by '\n' or ' '.
// Prints the number of values read. Reports an error and returns if the file
// cannot be opened; any unexpected character, or more values than the array
// can store, aborts the program.
void func7()
{
    const int capacity = 42336000;

    FILE *f = fopen(file_name, "r");
    if (f == NULL)
    {
        perror(file_name);
        return;
    }

    int *v = new int[capacity];
    int ch;
    int i = 0;       // number of values stored so far
    int num = 0;     // value of the number currently being parsed
    int hasnum = 0;  // non-zero once at least one digit of num has been seen

    while ((ch = fgetc(f)) != EOF)
    {
        if (ch == '\n' || ch == ' ')
        {
            if (hasnum)
                v[i++] = num;
            num = 0;
            hasnum = 0;
        }
        else if (ch >= '0' && ch <= '9')
        {
            // Check capacity once, when a new number starts, so the stores
            // above and below are always in bounds.
            if (!hasnum && i == capacity)
            {
                cout << "Error: input holds more than " << capacity << " values" << endl;
                exit(1);
            }
            hasnum = 1;
            num = num * 10 + (ch - '0');
        }
        else
        {
            cout << "Error..." << endl;
            exit(1);
        }
    }

    // A last number that is not followed by a separator still counts.
    if (hasnum)
        v[i++] = num;

    fclose(f);
    cout << "Number of values read " << i << endl;
    delete [] v;
}

// Benchmark 8: map the whole file into memory with mmap and parse
// non-negative decimal integers by hand. Values are separated by '\n' or ' '.
// Prints the number of values read. Reports an error and returns if the file
// cannot be opened, sized or mapped; any unexpected character, or more values
// than the array can store, aborts the program.
void func8()
{
    const int capacity = 42336000;

    const int fd = open(file_name, O_RDONLY);
    if (fd < 0)
    {
        perror(file_name);
        return;
    }

    // Seeking to the end yields the file size in bytes.
    const off_t size = lseek(fd, 0, SEEK_END);
    if (size < 0)
    {
        perror("lseek");
        close(fd);
        return;
    }

    // mmap rejects a zero length, so an empty file is simply not mapped.
    void *mapping = NULL;
    if (size > 0)
    {
        mapping = mmap(NULL, static_cast<size_t>(size), PROT_READ, MAP_PRIVATE, fd, 0);
        if (mapping == MAP_FAILED)
        {
            perror("mmap");
            close(fd);
            return;
        }
    }
    // Per POSIX, closing the descriptor does not unmap the region.
    close(fd);

    int *v = new int[capacity];
    int i = 0;       // number of values stored so far
    int num = 0;     // value of the number currently being parsed
    int hasnum = 0;  // non-zero once at least one digit of num has been seen

    const char *data = static_cast<const char *>(mapping);
    // off_t index: an int byte counter would truncate for files over 2 GiB.
    for (off_t k = 0; k < size; ++k)
    {
        const char c = data[k];
        if (c == '\n' || c == ' ')
        {
            if (hasnum)
                v[i++] = num;
            num = 0;
            hasnum = 0;
        }
        else if (c >= '0' && c <= '9')
        {
            // Check capacity once, when a new number starts, so the stores
            // above and below are always in bounds.
            if (!hasnum && i == capacity)
            {
                cout << "Error: input holds more than " << capacity << " values" << endl;
                exit(1);
            }
            hasnum = 1;
            num = num * 10 + (c - '0');
        }
        else
        {
            cout << "Error..." << endl;
            exit(1);
        }
    }

    // A last number that is not followed by a separator still counts.
    if (hasnum)
        v[i++] = num;

    if (mapping != NULL)
        munmap(mapping, static_cast<size_t>(size));
    cout << "Number of values read " << i << endl;
    delete [] v;
}

// One benchmark entry: the function to run and the label printed with its
// timing. Member order matters because entries are brace-initialized
// positionally (function first, then name).
struct bm
{
    // Signature shared by all benchmark functions: no arguments, no result.
    typedef void (*func_ptr)();

    func_ptr f;
    const char *name;
};

// Expands to one bm initializer: the function itself followed by its name
// turned into a string literal by the # operator.
#define BM(f) { f, #f }

// Table of the benchmarks to run; main() walks it by index, so the order here
// is the execution order of the forward pass.
bm b[] =
{
BM(func1),
BM(func2),
BM(func3),
BM(func4),
BM(func5),
BM(func6),
BM(func7),
BM(func8),
};

// Converts a timeval (seconds + microseconds) into a single value expressed
// in milliseconds.
double time_to_double(timeval *t)
{
    const double whole_seconds = t->tv_sec;
    const double fractional_seconds = t->tv_usec / 1000000.0;
    return (whole_seconds + fractional_seconds) * 1000.0;
}

// Returns the time elapsed from *t1 to *t2 in milliseconds (positive when t2
// is the later timestamp).
double time_diff(timeval *t1, timeval *t2)
{
    const double start_ms = time_to_double(t1);
    const double end_ms = time_to_double(t2);
    return end_ms - start_ms;
}

int main()
{
for(int i = 0; i < sizeof(b) / sizeof(b[0]); i++)
{
timeval t1, t2;
gettimeofday(&t1, NULL);
b[i].f();
gettimeofday(&t2, NULL);
cout << b[i].name <<":" << time_diff(&t1, &t2) <<"ms" << endl;
}
for(int i = sizeof(b) / sizeof(b[0])-1; i >= 0; i--)
{
timeval t1, t2;
gettimeofday(&t1, NULL);
b[i].f();
gettimeofday(&t2, NULL);
cout << b[i].name <<":" << time_diff(&t1, &t2) <<"ms" << endl;
}
}

结果(连续两次运行，向前和向后，以避免文件缓存的好处)：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

Number of values read 42336000
func1: 6068.53ms
Number of values read 42336000
func2: 6421.47ms
Number of values read 42336000
func3: 5756.63ms
Number of values read 42336000
func4: 6947.56ms
Number of values read 42336000
func5: 941.081ms
Number of values read 42336000
func6: 962.831ms
Number of values read 42336000
func7: 2572.4ms
Number of values read 42336000
func8: 816.59ms
Number of values read 42336000
func8: 815.528ms
Number of values read 42336000
func7: 2578.6ms
Number of values read 42336000
func6: 948.185ms
Number of values read 42336000
func5: 932.139ms
Number of values read 42336000
func4: 6988.8ms
Number of values read 42336000
func3: 5750.03ms
Number of values read 42336000
func2: 6380.36ms
Number of values read 42336000
func1: 6050.45ms

总之，正如有人在评论中指出的那样，整数的实际解析在总耗时中占了相当大的一部分，因此读取文件本身并不像我最初认为的那样关键。即使是一种非常朴素的文件读取方式(使用fgetc())，也比使用ifstream operator>>读取整数更快。

如图所示，使用mmap加载文件比通过fstream读取文件略快，但仅略快。

相关讨论

我不同意，这里有一些数字github.com/gizmoogwai/performance
我投了反对票，因为这类主张必须有参考资料。请提供，我将投票。
"读取大型文件的限制因素是磁盘传输速度"——这就是为什么谷歌通常处理分布在集群上的不超过64MB的文件。
我添加了一个链接，指向以前对类似问题的回答，这是我上述陈述的基础。
@Matspetersson在您站点的基准测试中，将数据分解成行；他正在解析整数，这需要更多的CPU。我仍然认为限制因素将是磁盘带宽，但我只是猜测。(我的猜测是基于这样一个事实，即系统不可能缓存文件的重要部分，每次访问大约需要10毫秒，而且您可以在10毫秒内进行大量解析。)
@我看到你的号码了。同意你的意见。我试过你用getline和op(1)的方法，你用>>的方法快了两倍，但他们都"用ifstream"。因此，使用ifstream有不同的方法，这确实会有所不同。
好的，我在做一个小的基准测试，我有一个包含4200万个整数的文件，我只是为这个目的创建的。我将用code+结果发回。
@Matspetersson谢谢！我也会试试，大概一小时后公布结果。
@Matspeterson结果太棒了！再次感谢！以下是我的文本文件中函数1和函数8的结果：Number of values read 25000000 func8: 1232.18ms Number of values read 25000000 func1: 10449.7ms。
@Matspetersson：我推测解析比您预期的要大得多的原因是操作系统在内存中有用于所有测试的整个文件。实际上，操作系统正在为您"缓存"整个文件，因此您不会一直到磁盘。我敢打赌，如果你写一个尽可能多地写入内存的应用程序，并在测试之间运行它(以刷新OS缓存的文件)，那么磁盘IO会使这些都慢得多。
是的，我同意，如果文件是千兆字节的话，那就需要更多的时间。但我还是很惊讶要花多少时间来读取文件本身。我知道C++流不是超级优化的，但是这些数字让我很惊讶。

您可以使用外部排序对文件中的值进行排序，而不必将它们全部加载到内存中。排序速度将受到硬盘功能的限制，但您将能够处理真正巨大的文件。这是实现。

使用qt非常简单：

1
2
3
4
5
6
7
8
9
10
11
12
13

// Open the input file read-only.
// NOTE(review): the return value of open() is not checked here.
QFile file("h:/1.txt");
file.open(QIODevice::ReadOnly);
// NOTE(review): QDataStream is Qt's binary serialization stream; confirm that
// it really parses decimal text lines as intended here. QTextStream is the
// text-oriented counterpart.
QDataStream in(&file);

QVector<int> ints;
// Reserve capacity for the expected 25 million values up front.
ints.reserve(25000000);

// Each iteration extracts one int and one 8-bit value, and stores the int.
while (!in.atEnd()) {
int integer;
qint8 line;
in >> integer >> line; // read an int into integer, a char into line
ints.append(integer); // append the integer to the vector
}

最后，您得到了可以轻松排序的QVector ints。行数与矢量大小相同，前提是文件格式正确。

在我的机器上，I7 [email protected] GHz，读取2500万个整数并将其放入矢量大约需要490毫秒。从普通机械硬盘读取，而不是从固态硬盘读取。

将整个文件缓冲到内存中并没有太大帮助，时间下降到420毫秒。

相关讨论

我会这样做：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

#include <fstream>
#include <iostream>
#include <string>

using namespace std;

// Reads "myFile.txt" line by line, converts each line to an int with stoi,
// prints every converted value, reports lines that fail to convert, and
// finally prints the total number of lines read.
int main() {
    int lineCount = 0;  // lines consumed so far, including ones that fail to convert
    fstream file;

    try {
        file.open("myFile.txt", ios_base::in); // Open to read

        for (string line; getline(file, line); ) {
            ++lineCount;
            try {
                const int intValue = stoi(line);
                // Do something with your value
                cout <<"Value for line" << lineCount <<" :" << intValue << endl;
            } catch (const exception& e) {
                // stoi signals a failed conversion by throwing; report it and move on.
                cerr <<"Failed to convert line" << lineCount <<" to an int :" << e.what() << endl;
            }
        }
    } catch (const exception& e) {
        cerr << e.what() << endl;
        if (file.is_open()) {
            file.close();
        }
    }

    cout <<"Line count :" << lineCount << endl;

    system("PAUSE");
}

相关讨论

你没有说明你是如何读取这些值的，所以很难下结论。不过，实际上只有两种解决方案：`someIStream >> anInt` 和 `fscanf( someFd, "%d", &anInt )`。从逻辑上讲，它们的性能应该相近，但各个实现有所不同；两种都值得试一试并实际测量。
另一件要检查的是你是如何存储它们的。如果你知道大约有2500万个值，在读取之前对std::vector调用reserve可能会有所帮助。先构造一个包含3000万个元素的vector，然后在读到结尾时把它裁剪到实际大小，也可能比使用push_back更便宜。
最后，您可以考虑编写一个immapstreambuf，用它对输入做mmap，并直接从映射的内存中读取。或者甚至手动遍历映射的内存并调用strtol(但这要多做很多工作)；所有基于流的解决方案最终可能都会调用strtol或类似的函数，只是在调用之前先做了大量额外的工作。
编辑：
我在家里的机器(一台较新的、运行Linux的联想电脑)上做了一些很快的测试，结果让我吃惊：

作为参考，我做了一个简单、朴素(naïve)的实现，使用std::cin >> tmp和v.push_back( tmp );，没有做任何优化。在我的系统中，它的运行时间略低于10秒。

简单的优化，比如在向量上使用reserve，或者一开始就创建大小为25000000的向量，并没有带来太大变化——时间仍超过9秒。

使用一个非常简单的mmapstreambuf，时间下降到大约3秒，最简单的循环，没有reserve，等。

使用fscanf，时间下降到3秒以下。我怀疑Linux上FILE*的实现也使用了mmap(而std::filebuf没有)。

最后，使用一个mmap缓冲区，用两个char*进行迭代，并使用strtol进行转换，时间降到了一秒以下。

这些测试很快就完成了(不到一个小时的时间并运行所有这些程序)，而且远远不够严格(当然，不要告诉你任何关于其他环境的事情)，但是分歧让我吃惊。我没想到会有这么大的不同。

260MB没那么大。您应该能够将整个内容加载到内存中，然后对其进行解析。一旦进入，就可以使用嵌套循环读取行尾之间的整数，并使用常规函数进行转换。在开始之前，我会尝试为整数数组预先分配足够的内存。
哦，您可能会发现，对于类似的事情，原始的C样式文件访问功能是更快的选择。

一种可能的解决方案是将大文件分割成较小的块。分别对每个块进行排序，然后逐个合并所有已排序的块。
编辑：显然，这是一种成熟的方法。参见http://en.wikipedia.org/wiki/external_sorting上的"外部合并排序"

相关讨论

这几乎肯定会导致程序需要更多的总体时间。

尝试读取整数块并解析这些块，而不是逐行读取。

相关讨论

C++库将一次读取4K或更多的块。