@@ -115,63 +115,64 @@ export async function crawl(config: Config) {
115
115
}
116
116
}
117
117
118
/**
 * Combines the crawled dataset files into one or more numbered output files.
 *
 * Every `storage/datasets/default/*.json` file is parsed and appended to an
 * in-memory batch. A batch is flushed to `<outputFileName minus .json>-<n>.json`
 * before an item is added that would push it past either limit:
 *
 * - `config.maxFileSize` (MB): compared against the UTF-8 byte length of each
 *   item's compact `JSON.stringify` output. The file itself is written
 *   pretty-printed, so it can be somewhat larger than this estimate.
 * - `config.maxTokens`: compared against the running sum of per-item token
 *   counts reported by `isWithinTokenLimit`.
 *
 * An item that exceeds `config.maxTokens` on its own cannot fit in any batch
 * and is skipped with a warning. An item that exceeds the byte limit on its
 * own is still written, alone in its own file.
 *
 * @param config Crawl configuration; reads `outputFileName`, `maxFileSize`
 *   and `maxTokens`.
 * @returns A promise that resolves once every batch has been written.
 */
export async function write(config: Config): Promise<void> {
  const jsonFiles = await glob("storage/datasets/default/*.json", { absolute: true });

  console.log(`Found ${jsonFiles.length} files to combine...`);

  // An unset limit is modelled as Infinity so the comparisons below need no special cases.
  const maxBytes = config.maxFileSize ? config.maxFileSize * 1024 * 1024 : Infinity;
  const maxTokens = config.maxTokens ? config.maxTokens : Infinity;
  const baseName = config.outputFileName.replace(/\.json$/, '');

  let currentResults: unknown[] = [];
  let currentSize = 0;
  let estimatedTokens = 0;
  let fileCounter = 1;

  // Flushes the current batch to the next numbered file and resets ALL batch
  // counters, including the token estimate, so the next batch starts from zero.
  const writeBatchToFile = async (): Promise<void> => {
    const fileName = `${baseName}-${fileCounter}.json`;
    await writeFile(fileName, JSON.stringify(currentResults, null, 2));
    console.log(`Wrote ${currentResults.length} items to ${fileName}`);
    currentResults = [];
    currentSize = 0;
    estimatedTokens = 0;
    fileCounter++;
  };

  for (const file of jsonFiles) {
    const fileContent = await readFile(file, 'utf-8');
    const data: unknown = JSON.parse(fileContent);
    const contentString = JSON.stringify(data);
    const dataSize = Buffer.byteLength(contentString, 'utf-8');

    // Only tokenize when a token limit is configured; `false` means this
    // single item is already over the limit.
    const tokenCount: number | false = config.maxTokens
      ? isWithinTokenLimit(contentString, config.maxTokens)
      : 0;

    if (tokenCount === false) {
      // Do not account for its bytes either: it never enters a batch.
      console.warn(`Skipping ${file}: content exceeds the limit of ${config.maxTokens} tokens`);
      continue;
    }

    const overTokens = estimatedTokens + tokenCount > maxTokens;
    const overBytes = currentSize + dataSize > maxBytes;

    // Flush BEFORE adding so a written batch stays within both limits.
    // Never flush an empty batch: that would emit a file containing `[]`.
    if (currentResults.length > 0 && (overTokens || overBytes)) {
      await writeBatchToFile();
    }

    currentResults.push(data);
    currentSize += dataSize;
    estimatedTokens += tokenCount;
  }

  // Write whatever is left in the final, partially filled batch.
  if (currentResults.length > 0) {
    await writeBatchToFile();
  }
}
0 commit comments