@@ -4,6 +4,9 @@ import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { Config, configSchema } from "./config.js";
 import { Page } from "playwright";
+import {
+  isWithinTokenLimit,
+} from "gpt-tokenizer";
 
 let pageCounter = 0;
 
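For context on the new import: gpt-tokenizer's `isWithinTokenLimit(text, limit)` returns the token count when `text` fits within `limit` and `false` otherwise, which is why the hunk below can negate it as a boolean guard. A minimal sketch of that contract (the strings and limits here are illustrative only):

```ts
import { isWithinTokenLimit } from "gpt-tokenizer";

// Fits: returns the token count, a truthy number (e.g. 2).
const tokenCount = isWithinTokenLimit("hello world", 100);

// Does not fit: returns false, so `!isWithinTokenLimit(...)` reads as "over the limit".
const overLimit = isWithinTokenLimit("hello world ".repeat(1_000), 10);
```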
@@ -113,17 +116,62 @@ export async function crawl(config: Config) {
 }
 
 export async function write(config: Config) {
-  configSchema.parse(config);
-
   const jsonFiles = await glob("storage/datasets/default/*.json", {
     absolute: true,
   });
 
-  const results = [];
+  console.log(`Found ${jsonFiles.length} files to combine...`);
+
+  let currentResults: any[] = [];
+  let currentSize = 0;
+  let fileCounter = 1;
+  const maxBytes = config.maxFileSize ? config.maxFileSize * 1024 * 1024 : null; // Convert maxFileSize from MB to bytes
+
+  // Helper to get the byte size of a string
+  const getStringByteSize = (str: string) => Buffer.byteLength(str, "utf-8");
+
+  // Write the accumulated batch to a numbered file and reset the batch
+  const writeToFile = async () => {
+    const fileName = `${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;
+    await writeFile(fileName, JSON.stringify(currentResults, null, 2));
+    console.log(`Wrote ${currentResults.length} items to ${fileName}`);
+    fileCounter++;
+    currentResults = []; // Start a new batch
+    currentSize = 0; // Reset the size counter
+  };
+
   for (const file of jsonFiles) {
-    const data = JSON.parse(await readFile(file, "utf-8"));
-    results.push(data);
-  }
+    const fileContent = await readFile(file, "utf-8");
+    const data = JSON.parse(fileContent);
+    const dataSize = getStringByteSize(fileContent);
+    let resultWritten = false;
+
+    // Flush the current batch first if adding this file would exceed the size limit (if set)
+    if (maxBytes && currentSize + dataSize > maxBytes) {
+      await writeToFile();
+      resultWritten = true;
+    }
+
+    // Skip any single item that exceeds the token limit (if set)
+    if (config.maxTokens && !isWithinTokenLimit(JSON.stringify(data), config.maxTokens)) {
+      if (!resultWritten) { // Flush only if the size check above has not already done so
+        await writeToFile();
+      }
+      continue; // Skip adding this object to the batch
+    }
+
+    // Add data to the current batch
+    currentResults.push(data);
+    currentSize += dataSize;
 
-  await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
+    // Flush early if the batch itself has already grown past the size limit
+    if (maxBytes && currentSize > maxBytes) {
+      await writeToFile();
+    }
+  }
+
+  // Write any remaining data in the current batch to the final file
+  if (currentResults.length > 0) {
+    await writeToFile();
+  }
 }
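Taken together, `write` now batches the crawled JSON files into numbered outputs (`output-1.json`, `output-2.json`, ...) instead of one `config.outputFileName`, flushing a batch whenever the byte budget is hit. A hedged usage sketch follows; every field other than `outputFileName`, `maxFileSize`, and `maxTokens` is assumed from the surrounding project and may differ:

```ts
import { write } from "./core.js";

await write({
  // Assumed crawler fields, not shown in this diff:
  url: "https://example.com/docs",
  match: "https://example.com/docs/**",
  maxPagesToCrawl: 50,
  // Fields exercised by the new batching logic above:
  outputFileName: "output.json", // batches land in output-1.json, output-2.json, ...
  maxFileSize: 1, // interpreted as MB, per the MB-to-bytes conversion
  maxTokens: 5000, // per-item ceiling enforced via isWithinTokenLimit
});
```

Note one consequence of the control flow: when an item exceeds `maxTokens`, the current batch is flushed and the item itself is skipped with `continue`, so an oversized page is never written to any output file rather than being split.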
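Finally, the new `config.maxFileSize` and `config.maxTokens` reads imply a companion change in `./config.js` that this commit view does not include. Purely as an assumption inferred from the usages above (both fields optional, since each is guarded before use, and assuming `configSchema` is a zod schema as the removed `configSchema.parse(config)` call suggests), the schema additions might look like:

```ts
import { z } from "zod";

export const configSchema = z.object({
  // ...existing crawler fields elided...

  /** Assumed: max size of each combined output file, in MB. */
  maxFileSize: z.number().int().positive().optional(),

  /** Assumed: max tokens per crawled item, checked with gpt-tokenizer. */
  maxTokens: z.number().int().positive().optional(),
});

export type Config = z.infer<typeof configSchema>;
```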