
Commit 05d497f

Refactor write function to improve performance and handle large datasets

1 parent bd86e59

1 file changed (+47, -46 lines)


src/core.ts

Lines changed: 47 additions & 46 deletions
@@ -115,63 +115,64 @@ export async function crawl(config: Config) {
   }
 }
 
-export async function write(config: Config) {
-  const jsonFiles = await glob("storage/datasets/default/*.json", {
-    absolute: true,
-  });
+export async function write(config: Config) {
+  const jsonFiles = await glob("storage/datasets/default/*.json", { absolute: true });
 
   console.log(`Found ${jsonFiles.length} files to combine...`);
 
-  let currentResults: any[] = [];
-  let currentSize = 0;
-  let fileCounter = 1;
-  const maxBytes = config.maxFileSize ? config.maxFileSize * 1024 * 1024 : null; // Convert maxFileSize from MB to bytes
-
-  // Helper function to get byte size of string
-  const getStringByteSize = (str: string) => Buffer.byteLength(str, 'utf-8');
-
-  // Write the accumulated data to a file and reset the current batch
-  const writeToFile = async () => {
-    const fileName = `${config.outputFileName.replace(/\.json$/, '')}-${fileCounter}.json`;
-    await writeFile(fileName, JSON.stringify(currentResults, null, 2));
-    console.log(`Wrote ${currentResults.length} items to ${fileName}`);
+  let currentResults: Record<string, any>[] = [];
+  let currentSize: number = 0;
+  let fileCounter: number = 1;
+  const maxBytes: number = config.maxFileSize ? config.maxFileSize * 1024 * 1024 : Infinity;
+
+  const getStringByteSize = (str: string): number => Buffer.byteLength(str, 'utf-8');
+
+  const nextFileName = (): string => `${config.outputFileName.replace(/\.json$/, '')}-${fileCounter}.json`;
+
+  const writeBatchToFile = async (): Promise<void> => {
+    await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
+    console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
+    currentResults = [];
+    currentSize = 0;
     fileCounter++;
-    currentResults = []; // Start a new batch
-    currentSize = 0; // Reset the size counter
   };
+
+  let estimatedTokens: number = 0;
 
-  for (const file of jsonFiles) {
-    const fileContent = await readFile(file, 'utf-8');
-    const data = JSON.parse(fileContent);
-    const dataSize = getStringByteSize(fileContent);
-    let resultWritten = false;
-
-    // Check if data exceeds file size limit (if present)
-    if (maxBytes && currentSize + dataSize > maxBytes) {
-      await writeToFile();
-      resultWritten = true;
-    }
+  const addContentOrSplit = async (data: Record<string, any>): Promise<void> => {
+    const contentString: string = JSON.stringify(data);
+    const tokenCount: number | false = isWithinTokenLimit(contentString, config.maxTokens || Infinity);
 
-    // Check if data exceeds token limit (if present)
-    if (config.maxTokens && !isWithinTokenLimit(JSON.stringify(data), config.maxTokens)) {
-      if (!resultWritten) { // Write only if not already written
-        await writeToFile();
+    if (typeof tokenCount === 'number') {
+      if (estimatedTokens + tokenCount > config.maxTokens!) {
+        // Only write the batch if it's not empty (something to write)
+        if (currentResults.length > 0) {
+          await writeBatchToFile();
+        }
+        // Since the addition of a single item exceeded the token limit, halve it.
+        estimatedTokens = Math.floor(tokenCount / 2);
+        currentResults.push(data);
+      } else {
+        currentResults.push(data);
+        estimatedTokens += tokenCount;
       }
-      continue; // Skip adding this object to the batch
     }
 
-    // Add data to current batch
-    currentResults.push(data);
-    currentSize += dataSize;
-
-    // Write to file if batch is over size limit (File size check to delegate larger final batch size check)
-    if (maxBytes && currentSize > maxBytes) {
-      await writeToFile();
+    currentSize += getStringByteSize(contentString);
+    if (currentSize > maxBytes) {
+      await writeBatchToFile();
     }
+  };
+
+  // Iterate over each JSON file and process its contents.
+  for (const file of jsonFiles) {
+    const fileContent = await readFile(file, 'utf-8');
+    const data: Record<string, any> = JSON.parse(fileContent);
+    await addContentOrSplit(data);
   }
-
-  // Write any remaining data in the current batch to the final file
+
+  // Check if any remaining data needs to be written to a file.
   if (currentResults.length > 0) {
-    await writeToFile();
+    await writeBatchToFile();
   }
-}
+};
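For context: isWithinTokenLimit (presumably gpt-tokenizer's helper; its import sits outside this hunk) returns the text's token count when it fits within the given limit and false otherwise, which is why the new code gates the batching logic on typeof tokenCount === 'number'. Below is a minimal sketch of how the refactored write might be invoked; the Config field names come from the diff above, while the import path and all values are assumptions for illustration.

import { write } from "./src/core.js"; // import path assumed

// Field names appear in the diff above; the values are purely illustrative.
await write({
  outputFileName: "output.json", // batches land in output-1.json, output-2.json, ...
  maxFileSize: 5,                // in MB; write() converts this to bytes
  maxTokens: 2_000_000,          // token budget per output file, measured via isWithinTokenLimit
} as any);                       // loose cast: the full Config shape isn't shown in this hunk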
