Commit 569005b

Refactor write function to handle large datasets and implement size and token limits
1 parent: c6b770a

src/core.ts

Lines changed: 55 additions & 7 deletions
@@ -4,6 +4,9 @@ import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import {Config, configSchema} from "./config.js";
 import { Page } from "playwright";
+import {
+  isWithinTokenLimit,
+} from 'gpt-tokenizer'
 
 let pageCounter = 0;
 
@@ -113,17 +116,62 @@ export async function crawl(config: Config) {
 }
 
 export async function write(config: Config) {
-  configSchema.parse(config);
-
   const jsonFiles = await glob("storage/datasets/default/*.json", {
     absolute: true,
   });
 
-  const results = [];
+  console.log(`Found ${jsonFiles.length} files to combine...`);
+
+  let currentResults: any[] = [];
+  let currentSize = 0;
+  let fileCounter = 1;
+  const maxBytes = config.maxFileSize ? config.maxFileSize * 1024 * 1024 : null; // Convert maxFileSize from MB to bytes
+
+  // Helper function to get the byte size of a string
+  const getStringByteSize = (str: string) => Buffer.byteLength(str, 'utf-8');
+
+  // Write the accumulated batch to a numbered file and reset the batch
+  const writeToFile = async () => {
+    const fileName = `${config.outputFileName.replace(/\.json$/, '')}-${fileCounter}.json`;
+    await writeFile(fileName, JSON.stringify(currentResults, null, 2));
+    console.log(`Wrote ${currentResults.length} items to ${fileName}`);
+    fileCounter++;
+    currentResults = []; // Start a new batch
+    currentSize = 0; // Reset the size counter
+  };
+
   for (const file of jsonFiles) {
-    const data = JSON.parse(await readFile(file, "utf-8"));
-    results.push(data);
-  }
+    const fileContent = await readFile(file, 'utf-8');
+    const data = JSON.parse(fileContent);
+    const dataSize = getStringByteSize(fileContent);
+    let resultWritten = false;
+
+    // Flush the current batch if adding this file would exceed the size limit (if one is set)
+    if (maxBytes && currentSize + dataSize > maxBytes) {
+      await writeToFile();
+      resultWritten = true;
+    }
+
+    // Skip this record if it alone exceeds the token limit (if one is set)
+    if (config.maxTokens && !isWithinTokenLimit(JSON.stringify(data), config.maxTokens)) {
+      if (!resultWritten) { // Flush only if not already flushed above
+        await writeToFile();
+      }
+      continue; // Skip adding this object to the batch
+    }
+
+    // Add data to the current batch
+    currentResults.push(data);
+    currentSize += dataSize;
 
-  await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
+    // Flush immediately if this single file pushed the batch over the size limit
+    if (maxBytes && currentSize > maxBytes) {
+      await writeToFile();
+    }
+  }
+
+  // Write any remaining data in the current batch to the final file
+  if (currentResults.length > 0) {
+    await writeToFile();
+  }
 }
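
A note on the token guard: gpt-tokenizer's isWithinTokenLimit returns the token count when the text fits and false when it does not, so !isWithinTokenLimit(...) is truthy exactly for oversized records. A minimal sketch of that contract (the record shape and the 5000-token limit here are hypothetical):

import { isWithinTokenLimit } from "gpt-tokenizer";

// Hypothetical record; real entries come from storage/datasets/default/*.json
const record = { title: "Example page", url: "https://example.com", html: "<p>...</p>" };

const fits = isWithinTokenLimit(JSON.stringify(record), 5000);
if (fits === false) {
  // write() flushes the current batch and then skips records like this one
  console.log("Over 5000 tokens: skipped");
} else {
  // Otherwise the return value is the actual token count
  console.log(`Within limit at ${fits} tokens: added to the batch`);
}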

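For context, a hypothetical config exercising the new limits. This diff only reads config.maxFileSize (converted from MB to bytes) and config.maxTokens; the matching schema change in config.js is not shown in this commit, and the remaining fields are assumed from the existing Config type:

const config: Config = {
  url: "https://example.com/docs",
  match: "https://example.com/docs/**",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  maxFileSize: 1,  // flush each combined file at roughly 1 MB
  maxTokens: 5000, // skip any single record larger than 5000 tokens
};

await write(config);

Note that writeToFile always appends a batch number, so the output is output-1.json, output-2.json, and so on, even when everything fits in a single file.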