1- import { statSync } from "fs" ;
1+ import { stat } from "fs/promises " ;
22import { createReadStream } from "fs-extra" ;
3- import { createInterface } from "readline" ;
3+
4+ const doubleLineBreakRegexp = / \n \r ? \n / ;
45
56/**
67 * Read a file consisting of multiple JSON objects. Each object is separated from the previous one
@@ -14,53 +15,40 @@ export async function readJsonlFile<T>(
1415 handler : ( value : T ) => Promise < void > ,
1516 logger ?: { log : ( message : string ) => void } ,
1617) : Promise < void > {
17- function parseJsonFromCurrentLines ( ) {
18- try {
19- return JSON . parse ( currentLineSequence . join ( "\n" ) ) as T ;
20- } catch ( e ) {
21- void logger ?. log (
22- // eslint-disable-next-line @typescript-eslint/no-explicit-any
23- `Error: Failed to parse at line ${ lineCount } of ${ path } as JSON: ${ ( e as any ) ?. message ?? "UNKNOWN REASON" } . Problematic line below:\n${ JSON . stringify ( currentLineSequence , null , 2 ) } ` ,
24- ) ;
25- throw e ;
26- }
27- }
28-
29- function logProgress ( ) {
30- void logger ?. log (
31- `Processed ${ lineCount } lines with ${ parseCounts } parses...` ,
32- ) ;
33- }
34-
3518 void logger ?. log (
36- `Parsing ${ path } (${ statSync ( path ) . size / 1024 / 1024 } MB)...` ,
19+ `Parsing ${ path } (${ ( await stat ( path ) ) . size / 1024 / 1024 } MB)...` ,
3720 ) ;
38- const fileStream = createReadStream ( path , "utf8" ) ;
39- const rl = createInterface ( {
40- input : fileStream ,
41- crlfDelay : Infinity ,
21+ return new Promise ( ( resolve , reject ) => {
22+ const stream = createReadStream ( path , { encoding : "utf8" } ) ;
23+ let buffer = "" ;
24+ stream . on ( "data" , async ( chunk : string ) => {
25+ const parts = ( buffer + chunk ) . split ( doubleLineBreakRegexp ) ;
26+ buffer = parts . pop ( ) ! ;
27+ if ( parts . length > 0 ) {
28+ try {
29+ stream . pause ( ) ;
30+ for ( const part of parts ) {
31+ await handler ( JSON . parse ( part ) ) ;
32+ }
33+ stream . resume ( ) ;
34+ } catch ( e ) {
35+ stream . destroy ( ) ;
36+ reject ( e ) ;
37+ }
38+ }
39+ } ) ;
40+ stream . on ( "end" , async ( ) => {
41+ if ( buffer . trim ( ) . length > 0 ) {
42+ try {
43+ await handler ( JSON . parse ( buffer ) ) ;
44+ } catch ( e ) {
45+ reject ( e ) ;
46+ return ;
47+ }
48+ }
49+ void logger ?. log ( `Finishing parsing ${ path } ` ) ;
50+ resolve ( ) ;
51+ } ) ;
52+ stream . on ( "error" , reject ) ;
4253 } ) ;
43-
44- let lineCount = 0 ;
45- let parseCounts = 0 ;
46- let currentLineSequence : string [ ] = [ ] ;
47- for await ( const line of rl ) {
48- if ( line === "" ) {
49- // as mentioned above: a double newline sequence indicates the end of the current JSON object, so we parse it and pass it to the handler
50- await handler ( parseJsonFromCurrentLines ( ) ) ;
51- parseCounts ++ ;
52- currentLineSequence = [ ] ;
53- } else {
54- currentLineSequence . push ( line ) ;
55- }
56- lineCount ++ ;
57- if ( lineCount % 1000000 === 0 ) {
58- logProgress ( ) ;
59- }
60- }
61- // in case the file is not newline-terminated, we need to handle the last JSON object
62- if ( currentLineSequence . length > 0 ) {
63- await handler ( parseJsonFromCurrentLines ( ) ) ;
64- }
65- logProgress ( ) ;
6654}
0 commit comments