@@ -4,9 +4,7 @@ import { readFile, writeFile } from "fs/promises";
4
4
import { glob } from "glob" ;
5
5
import { Config , configSchema } from "./config.js" ;
6
6
import { Page } from "playwright" ;
7
- import {
8
- isWithinTokenLimit ,
9
- } from 'gpt-tokenizer'
7
+ import { isWithinTokenLimit } from "gpt-tokenizer" ;
10
8
11
9
// NOTE(review): module-level counter, presumably tracking pages crawled so far —
// increment sites are not visible in this chunk; confirm against the crawl handlers.
let pageCounter = 0 ;
12
10
@@ -144,35 +142,46 @@ export async function crawl(config: Config) {
144
142
}
145
143
}
146
144
147
- export async function write ( config : Config ) {
148
- const jsonFiles = await glob ( "storage/datasets/default/*.json" , { absolute : true } ) ;
145
+ export async function write ( config : Config ) {
146
+ const jsonFiles = await glob ( "storage/datasets/default/*.json" , {
147
+ absolute : true ,
148
+ } ) ;
149
149
150
150
console . log ( `Found ${ jsonFiles . length } files to combine...` ) ;
151
151
152
152
let currentResults : Record < string , any > [ ] = [ ] ;
153
153
let currentSize : number = 0 ;
154
154
let fileCounter : number = 1 ;
155
- const maxBytes : number = config . maxFileSize ? config . maxFileSize * 1024 * 1024 : Infinity ;
156
-
157
- const getStringByteSize = ( str : string ) : number => Buffer . byteLength ( str , 'utf-8' ) ;
158
-
159
- const nextFileName = ( ) : string => `${ config . outputFileName . replace ( / \. j s o n $ / , '' ) } -${ fileCounter } .json` ;
160
-
155
+ const maxBytes : number = config . maxFileSize
156
+ ? config . maxFileSize * 1024 * 1024
157
+ : Infinity ;
158
+
159
+ const getStringByteSize = ( str : string ) : number =>
160
+ Buffer . byteLength ( str , "utf-8" ) ;
161
+
162
+ const nextFileName = ( ) : string =>
163
+ `${ config . outputFileName . replace ( / \. j s o n $ / , "" ) } -${ fileCounter } .json` ;
164
+
161
165
// Flush the accumulated batch to disk as pretty-printed JSON, then reset the
// accumulation state so the next batch starts fresh under an incremented name.
const writeBatchToFile = async (): Promise<void> => {
  const target = nextFileName();
  await writeFile(target, JSON.stringify(currentResults, null, 2));
  console.log(`Wrote ${currentResults.length} items to ${target}`);
  currentResults = [];
  currentSize = 0;
  fileCounter++;
};
168
-
172
+
169
173
let estimatedTokens : number = 0 ;
170
174
171
- const addContentOrSplit = async ( data : Record < string , any > ) : Promise < void > => {
175
+ const addContentOrSplit = async (
176
+ data : Record < string , any > ,
177
+ ) : Promise < void > => {
172
178
const contentString : string = JSON . stringify ( data ) ;
173
- const tokenCount : number | false = isWithinTokenLimit ( contentString , config . maxTokens || Infinity ) ;
179
+ const tokenCount : number | false = isWithinTokenLimit (
180
+ contentString ,
181
+ config . maxTokens || Infinity ,
182
+ ) ;
174
183
175
- if ( typeof tokenCount === ' number' ) {
184
+ if ( typeof tokenCount === " number" ) {
176
185
if ( estimatedTokens + tokenCount > config . maxTokens ! ) {
177
186
// Only write the batch if it's not empty (something to write)
178
187
if ( currentResults . length > 0 ) {
@@ -195,7 +204,7 @@ export async function write(config: Config) {
195
204
196
205
// Iterate over each JSON file and process its contents.
197
206
for ( const file of jsonFiles ) {
198
- const fileContent = await readFile ( file , ' utf-8' ) ;
207
+ const fileContent = await readFile ( file , " utf-8" ) ;
199
208
const data : Record < string , any > = JSON . parse ( fileContent ) ;
200
209
await addContentOrSplit ( data ) ;
201
210
}
@@ -204,4 +213,4 @@ export async function write(config: Config) {
204
213
// Flush any leftover items that never crossed a size/token threshold.
const hasRemainder = currentResults.length > 0;
if (hasRemainder) {
  await writeBatchToFile();
}
207
- } ;
216
+ }
0 commit comments