I tried to use Node.js to process a 500MB Apache log file, converting its syntax from
ip.ip.ip.ip - - [02/Aug/2012:05:01:17 -0600] "GET /path/of/access/ HTTP/1.1" 302 26
to
ip.ip.ip.ip - - 02/Aug/2012:05:01:17 GET /path/of/access/ HTTP/1.1 302 26
, then write to another text file.
For better memory control and performance, I used fs.createReadStream and fs.createWriteStream, but only managed to write the first line into output.txt, because the script ends with an error:
{ [Error: EBADF, write] errno: 9, code: 'EBADF' }
Here I posted some info that may help debug.
Head of input.txt:
ip.ip.ip.ip - - [02/Aug/2012:05:01:17 -0600] "GET /path/of/access/ HTTP/1.1" 302 26
ip.ip.ip.ip - - [02/Aug/2012:05:01:17 -0600] "GET /path/of/access/ HTTP/1.1" 302 26
ip.ip.ip.ip - - [02/Aug/2012:05:01:17 -0600] "GET /path/of/access/ HTTP/1.1" 302 26
ip.ip.ip.ip - - [02/Aug/2012:05:01:17 -0600] "GET /path/of/access/ HTTP/1.1" 302 26
ip.ip.ip.ip - - [02/Aug/2012:05:01:17 -0600] "GET /path/of/access/ HTTP/1.1" 302 26
ip.ip.ip.ip - - [02/Aug/2012:05:01:17 -0600] "GET /path/of/access/ HTTP/1.1" 302 26
ip.ip.ip.ip - - [02/Aug/2012:05:01:18 -0600] "GET /path/of/access/ HTTP/1.1" 302 26
Content of output.txt:
ip.ip.ip.ip - - [02/Aug/2012:05:01:17 -0600] "GET /path/of/access/ HTTP/1.1" 302 26
The whole script:
var fs = require('fs');
var data ='';
var n=0; //For line control
var r = fs.createReadStream('./input.txt',{
encoding: 'ascii',
start:0,
// end: 100000,
});
var w = fs.createWriteStream('./output.txt',{
encoding:'ascii'
});
function put(line){ //write into w;
++n;
w.write(line+'\n');
}
function end(){
r.destroy();
w.destroy();
}
function onData(chunk){
var hasNewline = chunk.indexOf('\n')!==-1;
if(hasNewline){
var arr = chunk.split('\n');
var first = arr.shift();
var last = arr.pop();
data+=first;
put(data); //write a complete line
arr.forEach(function(line){
put(line); //write a complete line
});
data=last;
}else{
data+=chunk;
}
if(n>100){
end();
}
}
function onErr(e){
console.log(e);
}
r.addListener( "data", onData);
r.addListener( "end", end);
r.addListener('error',onErr);
w.addListener('error',onErr);
You’ve got two issues that I can see.
The first is that your
endfunction callsdestroyon the ReadStream, but in the general case this is triggered from theendevent, which means that the stream is already closing, and it is going to calldestroyautomatically. That means thatr.destroyis going to be called twice, triggering an error. This is the cause of the error you are seeing printed.The second issue is that you are calling
destroyon the WriteStream. I suggest you go read the docs for that: http://nodejs.org/api/stream.html#stream_stream_destroy_1Specifically
Any queued write data will not be sent, which is why you are missing some of your output.Basically, you should ONLY call
destroyon the ReadStream if you want it to close early, like in yourn > 100case. Then you want to use WriteStream’sendinstead, so the stream has time to write all of the buffered data.Here is a simplified version, which I think should work the same. I’d also not bother binding
errorsince errors are automatically printed to the console anyway.