diff --git a/README.md b/README.md index ae779b4..a9c1886 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ This class is the main interface for reading data from dBase files. It extends ` The support options are: * encoding `String` The character encoding to use (default = `utf-8`) +* encoder `Function` The encoder function for field values (default: `buffer.toString(encoding)` is used) Creates a new Parser and attaches it to the specified filename. @@ -117,6 +118,23 @@ The following code example illustrates a very simple usage for this module: }); parser.parse(); + +How to use encodings not supported by the Node.js Buffer: + + var Parser = require('node-dbf'); + var iconv = require('iconv-lite'); //npm install iconv-lite + + var encodingFunction = function (buffer, encoding) { + return iconv.decode(buffer, 'CP866').trim(); //CP1252.... + }; + + var parser = new Parser('/path/to/my/dbase/file.dbf', {encoder:encodingFunction}); + + parser.on('record', function(record) { + console.log('Name: ' + record.firstName + ' ' + record.lastName); // Name: Jü Smith + }); + + parser.parse(); #Command-Line Interface (CLI) diff --git a/src/parser.coffee b/src/parser.coffee index 4614cc8..4588d75 100644 --- a/src/parser.coffee +++ b/src/parser.coffee @@ -6,6 +6,8 @@ class Parser extends EventEmitter constructor: (@filename, @options = {}) -> @encoding = @options?.encoding || 'utf-8' + @encoder = @options?.encoder || @getValueString + @readStreamOptions = @options?.readStreamOptions parse: => @emit 'start', @ @@ -16,13 +18,12 @@ class Parser extends EventEmitter @emit 'header', @header sequenceNumber = 0 - - loc = @header.start + loc = 0 bufLoc = @header.start overflow = null @paused = false - stream = fs.createReadStream @filename + stream = fs.createReadStream @filename, @readStreamOptions @readBuf = => @@ -31,18 +32,24 @@ class Parser extends EventEmitter return while buffer = stream.read() - if bufLoc isnt @header.start then bufLoc = 0 - if overflow isnt null then buffer = overflow + buffer 
- + if overflow isnt null then buffer = Buffer.concat [overflow, buffer] + while loc < (@header.start + @header.numberOfRecords * @header.recordLength) && (bufLoc + @header.recordLength) <= buffer.length @emit 'record', @parseRecord ++sequenceNumber, buffer.slice bufLoc, bufLoc += @header.recordLength - - loc += bufLoc - if bufLoc < buffer.length then overflow = buffer.slice bufLoc, buffer.length else overflow = null + + if bufLoc < buffer.length + overflow = buffer.slice bufLoc, buffer.length + loc += bufLoc + bufLoc = 0 + else + overflow = null + bufLoc -= buffer.length + loc += buffer.length + return @ - stream.on 'readable',@readBuf + stream.on 'readable',@readBuf stream.on 'end', () => @emit 'end' @@ -68,9 +75,12 @@ class Parser extends EventEmitter record[field.name] = @parseField field, buffer.slice loc, loc += field.length return record + + getValueString: (buffer, encoding) => + return (buffer.toString encoding).trim() parseField: (field, buffer) => - value = (buffer.toString @encoding).trim() + value = @encoder buffer, @encoding if field.type is 'N' value = parseInt value, 10 diff --git a/test/parser.js b/test/parser.js index ff88231..ca0e97c 100644 --- a/test/parser.js +++ b/test/parser.js @@ -210,7 +210,72 @@ describe('Parser', function() { expect(events.end).to.be.above(events.record); }); }); - + + describe('Parsing the SF zip codes with small chunksize and own encoder', function() { + var encoder_func_called = false; + var own_encoder = function (buffer, encoding) { encoder_func_called = true; return (buffer.toString(encoding)).trim();}; + + var parser = new Parser(__dirname + '/fixtures/bayarea_zipcodes.dbf', { encoder: own_encoder, readStreamOptions:{ highWaterMark:50}}); //read by 50 bytes chunk (below header size and below recordsize) + var header, records = [], events, header; + + + before(function(done) { + events = {start: undefined, header: undefined, record: undefined, end: undefined}; + + parser.on('start', function() { + events.start = 
process.hrtime()[1]; + }); + + parser.on('header', function(h) { + header = h; + events.header = process.hrtime()[1]; + }); + + parser.on('record', function(record) { + records.push(record); + events.record = process.hrtime()[1]; + }); + + parser.on('end', function() { + events.end = process.hrtime()[1]; + done(); + }); + + parser.parse(); + }); + + describe('the records', function() { + it('there are 187', function() { + expect(records).to.have.lengthOf(187); + }); + + it('the header says there are 187', function() { + expect(header.numberOfRecords).to.equal(187); + }); + + it('contain the 94111 zip code', function() { + var area = records.filter(function(v) { return '94111' === v.ZIP; }); + + expect(area).to.be.an('Array'); + expect(area).to.have.lengthOf(1); + area = area[0]; + + expect(area['@sequenceNumber']).to.be.a('Number'); + expect(area['@deleted']).to.equal(false); + + expect(area.ZIP).to.equal('94111'); + expect(area.PO_NAME).to.equal('SAN FRANCISCO'); + expect(area.STATE).to.equal('CA'); + expect(area.Area__).to.be.a('Number').within(0, Number.MAX_VALUE); + expect(area.Length__).to.be.a('Number').within(0, Number.MAX_VALUE); + }); + + it ('call encoder func', function () { + expect(encoder_func_called).to.be.true; + }); + }); + }) + // check a select number of them // check floats // check character encoding ???