Parsing huge logfiles in Node.js - read in line-by-line
I need to do some parsing of large (5-10 GB) logfiles in Javascript/Node.js (I'm using Cube).
The log lines look something like:

```
10:00:43.343423 I'm a friendly log message. There are 5 cats, and 7 dogs. We are in state "SUCCESS".
```

We need to read each line, do some parsing (e.g. strip out values like the `5`, the `7`, and the `"SUCCESS"` state in the example above), and then send the resulting data to Cube.
Firstly, what is the canonical way to read in a file line-by-line in Node?
It seems to be a fairly common question online:
- http://www.quora.com/What-is-the-best-way-to-read-a-file-line-by-line-in-node-js
- Read a file one line at a time in node.js?
Many of the answers seem to point to a bunch of third-party modules:
- https://github.com/nickewing/line-reader
- https://github.com/jahewson/node-byline
- https://github.com/pkrumins/node-lazy
- https://github.com/Gagle/Node-BufferedReader
However, this seems like a fairly basic task - surely there's a simple way within the stdlib to read in a text file line-by-line?
Secondly, I then need to process each line (e.g. convert the timestamp into a Date object, and extract useful fields).
What's the best way to do this, maximizing throughput? Is there some way that won't block on either reading in each line, or on sending it to Cube?
Thirdly - I'm guessing using string splits, and the JS equivalent of contains (indexOf != -1?) will be a lot faster than regexes? Has anybody had much experience parsing massive amounts of text data in Node.js?
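To illustrate what I mean, here's a rough sketch of the two approaches against the example line above (the field positions are just assumptions from that one sample line):

```js
// Sketch only: two ways to pull fields out of the sample log line above.
var line = '10:00:43.343423 I\'m a friendly log message. There are 5 cats, and 7 dogs. We are in state "SUCCESS".';

// 1) string split / indexOf
var timestamp = line.split(' ')[0];                                     // "10:00:43.343423"
var isSuccess = line.indexOf('"SUCCESS"') !== -1;                       // true
var catCount  = parseInt(line.split(' cats')[0].split(' ').pop(), 10);  // 5

// 2) regex
var m = /^(\S+) .*There are (\d+) cats, and (\d+) dogs\. We are in state "(\w+)"/.exec(line);
if (m) {
    console.log(m[1], m[2], m[3], m[4]); // 10:00:43.343423 5 7 SUCCESS
}
```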
Cheers,
Victor
I searched for a solution to parse very large files (GBs) line by line using a stream. All the third-party libraries and examples did not suit my needs, since they either processed the file not line by line (like 1, 2, 3, 4 ...) or read the entire file into memory.
The following solution can parse a very large file, line by line, using stream and pipe. For testing I used a 2.1 GB file with 17,000,000 records. RAM usage did not exceed 60 MB.
```js
var fs = require('fs')
    , es = require('event-stream');

var lineNr = 0;

var s = fs.createReadStream('very-large-file.csv')
    .pipe(es.split())
    .pipe(es.mapSync(function(line){

        // pause the readstream
        s.pause();

        lineNr += 1;

        // process line here and call s.resume() when rdy
        // function below was for logging memory usage
        logMemoryUsage(lineNr);

        // resume the readstream, possibly from a callback
        s.resume();
    })
    .on('error', function(err){
        console.log('Error while reading file.', err);
    })
    .on('end', function(){
        console.log('Read entire file.')
    })
);
```
Please let me know how it goes!
You can use the built-in readline module:
```js
var fs = require('fs'),
    readline = require('readline'),
    stream = require('stream');

var instream = fs.createReadStream('/path/to/file');
var outstream = new stream;
outstream.readable = true;
outstream.writable = true;

var rl = readline.createInterface({
    input: instream,
    output: outstream,
    terminal: false
});

rl.on('line', function(line) {
    console.log(line);
    //Do your stuff ...
    //Then write to outstream
    rl.write(cubestuff);
});
```
Large files will take some time to process. Do tell if it works.
I really liked @gerard's answer, which actually deserves to be the accepted answer here. I made some improvements:
- The code is in a class (modular)
- Parsing is included
- The ability to resume is exposed to the outside, in case there is an asynchronous job chained to reading the CSV, like inserting into a DB or making an HTTP request
- Reading in chunk/batch sizes that the user can declare. I also took care of encoding in the stream, in case you have files in a different encoding.
Here is the code:
```js
'use strict'

const fs = require('fs'),
    util = require('util'),
    stream = require('stream'),
    es = require('event-stream'),
    parse = require("csv-parse"),
    iconv = require('iconv-lite');

class CSVReader {
    constructor(filename, batchSize, columns) {
        this.reader = fs.createReadStream(filename).pipe(iconv.decodeStream('utf8'))
        this.batchSize = batchSize || 1000
        this.lineNumber = 0
        this.data = []
        this.parseOptions = {delimiter: '\t', columns: true, escape: '/', relax: true}
    }

    read(callback) {
        this.reader
            .pipe(es.split())
            .pipe(es.mapSync(line => {
                ++this.lineNumber

                parse(line, this.parseOptions, (err, d) => {
                    this.data.push(d[0])
                })

                if (this.lineNumber % this.batchSize === 0) {
                    callback(this.data)
                }
            })
            .on('error', function(){
                console.log('Error while reading file.')
            })
            .on('end', function(){
                console.log('Read entire file.')
            }))
    }

    continue () {
        this.data = []
        this.reader.resume()
    }
}

module.exports = CSVReader
```
So basically, here is how you would use it:
```js
let reader = new CSVReader('path_to_file.csv')
reader.read(() => reader.continue())
```
I tested this with a 35 GB CSV file and it worked for me, which is why I chose to build on @gerard's answer. Feedback is welcome.
I used https://www.npmjs.com/package/line-by-line for reading more than 1,000,000 lines from a text file. In this case, RAM usage was about 50-60 MB.
```js
const LineByLineReader = require('line-by-line'),
    lr = new LineByLineReader('big_file.txt');

lr.on('error', function (err) {
    // 'err' contains error object
});

lr.on('line', function (line) {
    // pause emitting of lines...
    lr.pause();

    // ...do your asynchronous line processing..
    setTimeout(function () {
        // ...and continue emitting lines.
        lr.resume();
    }, 100);
});

lr.on('end', function () {
    // All lines are read, file is closed now.
});
```
Apart from reading the big file line by line, you can also read it chunk by chunk. For more, refer to this article.
```js
var fs = require('fs');

var offset = 0;
var chunkSize = 2048;
var chunkBuffer = Buffer.alloc(chunkSize);
var fp = fs.openSync('filepath', 'r');
var bytesRead = 0;
var lines = [];
while ((bytesRead = fs.readSync(fp, chunkBuffer, 0, chunkSize, offset))) {
    offset += bytesRead;
    var str = chunkBuffer.slice(0, bytesRead).toString();
    var arr = str.split('\n');

    if (bytesRead === chunkSize) {
        // the last item of arr may not be a full line, leave it to the next chunk
        offset -= arr.pop().length;
    }
    lines.push(arr);
}
console.log(lines);
```
I had the same problem. After comparing several modules that seemed to have this feature, I decided to do it myself; it is simpler than I thought.
gist: https://gist.github.com/deemstone/8279565
```js
var fetchBlock = lineByline(filepath, onEnd);
fetchBlock(function(lines, start){ ... });  // lines{array} start{int} lines[0] No.
```
It keeps the file opened inside a closure; each call to the returned fetchBlock() fetches a block from the file and splits it into an array of lines.
I set the block size to 1024 for every read operation. This may have bugs, but the code logic is obvious, so try it yourself.
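To give an idea of the approach, here is a rough sketch of what such a closure-based reader might look like. This is not the gist's actual code, just an illustration; the names mirror the usage snippet above:

```js
var fs = require('fs');

// Illustrative sketch only; see the gist above for the real implementation.
function lineByline(filepath, onEnd) {
    var fd = fs.openSync(filepath, 'r');
    var blockSize = 1024;
    var buffer = Buffer.alloc(blockSize);
    var leftOver = '';
    var lineNo = 0;

    // The returned function reads the next block and hands back the complete lines in it.
    return function fetchBlock(callback) {
        var bytesRead = fs.readSync(fd, buffer, 0, blockSize, null);
        if (bytesRead === 0) {                  // end of file
            fs.closeSync(fd);
            if (leftOver) callback([leftOver], lineNo);
            return onEnd && onEnd();
        }
        leftOver += buffer.toString('utf8', 0, bytesRead);
        var parts = leftOver.split('\n');
        leftOver = parts.pop();                 // keep the trailing partial line for the next block
        var start = lineNo;
        lineNo += parts.length;
        callback(parts, start);                 // lines{array}, start{int}
    };
}
```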
Based on the answers to this question, I implemented a class you can use to read a file synchronously line by line with fs.readSync(). You can make it "pause" and "resume" by using a Q promise:
```js
var fs = require('fs');
var Q = require('q');

var lr = new LineReader(filenameToLoad);
lr.open();

var promise;
workOnLine = function () {
    var line = lr.readNextLine();
    promise = complexLineTransformation(line).then(
        function() {console.log('ok');workOnLine();},
        function() {console.log('error');}
    );
}
workOnLine();

complexLineTransformation = function (line) {
    var deferred = Q.defer();
    // ... async call goes here, in callback: deferred.resolve('done ok'); or deferred.reject(new Error(error));
    return deferred.promise;
}

function LineReader (filename) {
    this.moreLinesAvailable = true;
    this.fd = undefined;
    this.bufferSize = 1024 * 1024;
    this.buffer = new Buffer(this.bufferSize);
    this.leftOver = '';
    this.read = undefined;
    this.idxStart = undefined;
    this.idx = undefined;
    this.lineNumber = 0;

    this._bundleOfLines = [];

    this.open = function() {
        this.fd = fs.openSync(filename, 'r');
    };

    this.readNextLine = function () {
        if (this._bundleOfLines.length === 0) {
            this._readNextBundleOfLines();
        }
        this.lineNumber++;
        var lineToReturn = this._bundleOfLines[0];
        this._bundleOfLines.splice(0, 1); // remove first element (pos, howmany)
        return lineToReturn;
    };

    this.getLineNumber = function() {
        return this.lineNumber;
    };

    this._readNextBundleOfLines = function() {
        var line = "";
        while ((this.read = fs.readSync(this.fd, this.buffer, 0, this.bufferSize, null)) !== 0) { // read next bytes until end of file
            this.leftOver += this.buffer.toString('utf8', 0, this.read); // append to leftOver
            this.idxStart = 0;
            while ((this.idx = this.leftOver.indexOf("\n", this.idxStart)) !== -1) { // as long as there is a newline-char in leftOver
                line = this.leftOver.substring(this.idxStart, this.idx);
                this._bundleOfLines.push(line);
                this.idxStart = this.idx + 1;
            }
            this.leftOver = this.leftOver.substring(this.idxStart);
            if (line !== "") {
                break;
            }
        }
    };
}
```
node-byline uses streams, so I would prefer that one for your huge files.
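Something along these lines should work (a minimal sketch based on byline's README, untested here; the file name is a placeholder):

```js
var fs = require('fs');
var byline = require('byline');

// Wrap a read stream so that 'data' events emit one line at a time.
var stream = byline(fs.createReadStream('very-large-file.log', { encoding: 'utf8' }));

stream.on('data', function (line) {
    // parse the line and send it to Cube here
    console.log(line);
});

stream.on('end', function () {
    console.log('Done.');
});
```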
For your date conversion I would use moment.js.
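For instance, the timestamp from the question could be parsed roughly like this (the format string "HH:mm:ss.SSSSSS" is just my assumption about your log format):

```js
var moment = require('moment');

// Parse the time-only timestamp; moment fills in today's date by default.
var ts = moment('10:00:43.343423', 'HH:mm:ss.SSSSSS');
var date = ts.toDate();   // plain JavaScript Date object
console.log(date);
```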
To maximize your throughput you could think about using a software cluster. There are some nice modules which wrap the node-native cluster module quite well. I like cluster-master from isaacs. E.g. you could create a cluster of x workers which all compute a file.
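If you'd rather not pull in cluster-master, a bare-bones sketch with the node-native cluster module could look like this (the file list and the one-file-per-worker split are assumptions for illustration):

```js
var cluster = require('cluster');

if (cluster.isMaster) {
    // Hypothetical list of files: one worker per file.
    var files = ['log1.txt', 'log2.txt', 'log3.txt'];
    files.forEach(function (file) {
        cluster.fork({ FILE_TO_PARSE: file });
    });
    cluster.on('exit', function (worker) {
        console.log('worker ' + worker.process.pid + ' finished');
    });
} else {
    // Each worker parses its own file, e.g. with one of the
    // line-by-line approaches from the other answers.
    console.log('worker parsing ' + process.env.FILE_TO_PARSE);
    process.exit(0);
}
```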
For benchmarking splits vs regexes, use benchmark.js. I haven't tested it until now. benchmark.js is available as a node module.
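An untested sketch of such a benchmark.js suite, using a line based on the question's example (the patterns are assumptions):

```js
var Benchmark = require('benchmark');
var suite = new Benchmark.Suite();

var line = '10:00:43.343423 There are 5 cats, and 7 dogs. We are in state "SUCCESS".';

suite
    .add('split/indexOf', function () {
        var ts = line.split(' ')[0];
        var ok = line.indexOf('"SUCCESS"') !== -1;
    })
    .add('regex', function () {
        var m = /^(\S+) .*state "(\w+)"/.exec(line);
    })
    .on('cycle', function (event) {
        console.log(String(event.target));
    })
    .on('complete', function () {
        console.log('Fastest is ' + this.filter('fastest').map('name'));
    })
    .run({ async: true });
```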
```typescript
import * as csv from 'fast-csv';
import * as fs from 'fs';

interface Row {
    [s: string]: string;
}

type RowCallBack = (data: Row, index: number) => object;

export class CSVReader {
    protected file: string;
    protected csvOptions = {
        delimiter: ',',
        headers: true,
        ignoreEmpty: true,
        trim: true
    };

    constructor(file: string, csvOptions = {}) {
        if (!fs.existsSync(file)) {
            throw new Error(`File ${file} not found.`);
        }
        this.file = file;
        this.csvOptions = Object.assign({}, this.csvOptions, csvOptions);
    }

    public read(callback: RowCallBack): Promise<Array<object>> {
        return new Promise<Array<object>>(resolve => {
            const readStream = fs.createReadStream(this.file);
            const results: Array<any> = [];
            let index = 0;
            const csvStream = csv.parse(this.csvOptions).on('data', async (data: Row) => {
                index++;
                results.push(await callback(data, index));
            }).on('error', (err: Error) => {
                console.error(err.message);
                throw err;
            }).on('end', () => {
                resolve(results);
            });
            readStream.pipe(csvStream);
        });
    }
}
```
```typescript
import { CSVReader } from '../src/helpers/CSVReader';

(async () => {
    const reader = new CSVReader('./database/migrations/csv/users.csv');
    const users = await reader.read(async data => {
        return {
            username: data.username,
            name: data.name,
            email: data.email,
            cellPhone: data.cell_phone,
            homePhone: data.home_phone,
            roleId: data.role_id,
            description: data.description,
            state: data.state,
        };
    });
    console.log(users);
})();
```
I have created a node module to read large files asynchronously, text or JSON.
Tested on large files.
```js
var fs = require('fs')
    , util = require('util')
    , stream = require('stream')
    , es = require('event-stream');

module.exports = FileReader;

function FileReader(){

}

FileReader.prototype.read = function(pathToFile, callback){
    var returnTxt = '';
    var s = fs.createReadStream(pathToFile)
        .pipe(es.split())
        .pipe(es.mapSync(function(line){

            // pause the readstream
            s.pause();

            //console.log('reading line: '+line);
            returnTxt += line;

            // resume the readstream, possibly from a callback
            s.resume();
        })
        .on('error', function(){
            console.log('Error while reading file.');
        })
        .on('end', function(){
            console.log('Read entire file.');
            callback(returnTxt);
        })
    );
};

FileReader.prototype.readJSON = function(pathToFile, callback){
    try{
        this.read(pathToFile, function(txt){callback(JSON.parse(txt));});
    }
    catch(err){
        throw new Error('json file is not valid! '+err.stack);
    }
};
```
Just save the file as file-reader.js, and use it like this:
```js
var FileReader = require('./file-reader');
var fileReader = new FileReader();
fileReader.readJSON(__dirname + '/largeFile.json', function(jsonObj){ /* callback logic here */ });
```