Skip to content

Commit 310e9d5

Browse files
committed
fix(bench): add per-extraction timeout and debug logging to WASM benchmark script
The WASM persistent process was hanging indefinitely in CI on certain extractions, causing 900s timeouts and 5+ hour benchmark runs. Add a per-extraction timeout (default 5min, configurable via EXTRACTION_TIMEOUT_MS) using Promise.race so the process self-reports timeout errors as JSON instead of going silent. Add debug logging (BENCHMARK_DEBUG) with extraction count, memory usage, file path, and MIME type. Error logging is always on so CI logs capture which files fail and why.
1 parent db5a804 commit 310e9d5

File tree

1 file changed

+93
-6
lines changed

1 file changed

+93
-6
lines changed

tools/benchmark-harness/scripts/kreuzberg_extract_wasm.ts

Lines changed: 93 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env tsx
22
import * as readline from "node:readline";
33
import * as path from "node:path";
4+
import * as fs from "node:fs";
45
import {
56
enableOcr,
67
extractFile,
@@ -10,6 +11,32 @@ import {
1011
type ExtractionConfig,
1112
} from "@kreuzberg/wasm";
1213

14+
/** Default per-extraction timeout in milliseconds (5 minutes). */
15+
const DEFAULT_TIMEOUT_MS = 300_000;
16+
17+
/** Parse EXTRACTION_TIMEOUT_MS from env, falling back to the default. */
18+
const EXTRACTION_TIMEOUT_MS: number = (() => {
19+
const env = process.env["EXTRACTION_TIMEOUT_MS"];
20+
if (env) {
21+
const parsed = Number.parseInt(env, 10);
22+
if (!Number.isNaN(parsed) && parsed > 0) return parsed;
23+
}
24+
return DEFAULT_TIMEOUT_MS;
25+
})();
26+
27+
/** Whether debug logging is enabled (BENCHMARK_DEBUG env var). */
28+
const DEBUG = !!process.env["BENCHMARK_DEBUG"];
29+
30+
/** Running extraction counter for diagnostics. */
31+
let extractionCount = 0;
32+
33+
function log(msg: string): void {
34+
if (DEBUG) {
35+
const mem = (process.memoryUsage().rss / 1024 / 1024).toFixed(0);
36+
process.stderr.write(`[wasm:${extractionCount}:${mem}MB] ${msg}\n`);
37+
}
38+
}
39+
1340
interface ExtractionOutput {
1441
content: string;
1542
metadata: Record<string, unknown>;
@@ -103,11 +130,50 @@ function createConfig(ocrEnabled: boolean, forceOcr?: boolean): ExtractionConfig
103130
};
104131
}
105132

133+
/**
134+
* Race an extraction promise against a timeout.
135+
* On timeout, returns a rejected promise with a descriptive error including
136+
* the file path, MIME type, file size, and memory usage for debugging.
137+
*/
138+
async function withTimeout<T>(promise: Promise<T>, filePath: string, mimeType: string | null): Promise<T> {
139+
let timer: ReturnType<typeof setTimeout>;
140+
const timeoutPromise = new Promise<never>((_resolve, reject) => {
141+
timer = setTimeout(() => {
142+
const fileSize = (() => {
143+
try {
144+
return fs.statSync(filePath).size;
145+
} catch {
146+
return -1;
147+
}
148+
})();
149+
const mem = process.memoryUsage();
150+
reject(
151+
new Error(
152+
`Extraction timed out after ${EXTRACTION_TIMEOUT_MS}ms` +
153+
` | file=${filePath}` +
154+
` | mime=${mimeType ?? "unknown"}` +
155+
` | fileSize=${fileSize}` +
156+
` | rss=${mem.rss}` +
157+
` | heapUsed=${mem.heapUsed}` +
158+
` | heapTotal=${mem.heapTotal}` +
159+
` | extractionCount=${extractionCount}`,
160+
),
161+
);
162+
}, EXTRACTION_TIMEOUT_MS);
163+
});
164+
165+
try {
166+
return await Promise.race([promise, timeoutPromise]);
167+
} finally {
168+
clearTimeout(timer!);
169+
}
170+
}
171+
106172
async function extractAsync(filePath: string, ocrEnabled: boolean): Promise<ExtractionOutput> {
107173
const config = createConfig(ocrEnabled);
108174
const mimeType = guessMimeType(filePath);
109175
const start = performance.now();
110-
const result = await extractFile(filePath, mimeType, config);
176+
const result = await withTimeout(extractFile(filePath, mimeType, config), filePath, mimeType);
111177
const durationMs = performance.now() - start;
112178

113179
const metadata = (result.metadata as Record<string, unknown>) ?? {};
@@ -123,7 +189,12 @@ async function extractAsync(filePath: string, ocrEnabled: boolean): Promise<Extr
123189
async function extractBatch(filePaths: string[], ocrEnabled: boolean): Promise<ExtractionOutput[]> {
124190
const config = createConfig(ocrEnabled);
125191
const start = performance.now();
126-
const results = await Promise.all(filePaths.map((fp) => extractFile(fp, guessMimeType(fp), config)));
192+
const results = await Promise.all(
193+
filePaths.map((fp) => {
194+
const mime = guessMimeType(fp);
195+
return withTimeout(extractFile(fp, mime, config), fp, mime);
196+
}),
197+
);
127198
const totalDurationMs = performance.now() - start;
128199

129200
const perFileDurationMs = filePaths.length > 0 ? totalDurationMs / filePaths.length : 0;
@@ -170,13 +241,19 @@ async function runServer(ocrEnabled: boolean): Promise<void> {
170241
if (!filePath) {
171242
continue;
172243
}
244+
245+
extractionCount++;
246+
const mimeType = guessMimeType(filePath);
247+
log(`START ${filePath} (mime=${mimeType ?? "unknown"})`);
248+
173249
const start = performance.now();
174250
try {
175251
const config = createConfig(ocrEnabled || forceOcr, forceOcr);
176-
const mimeType = guessMimeType(filePath);
177-
const result = await extractFile(filePath, mimeType, config);
252+
const result = await withTimeout(extractFile(filePath, mimeType, config), filePath, mimeType);
178253
const durationMs = performance.now() - start;
179254

255+
log(`OK ${filePath} (${durationMs.toFixed(1)}ms, ${result.content.length} chars)`);
256+
180257
const metadata = (result.metadata as Record<string, unknown>) ?? {};
181258
const payload: ExtractionOutput = {
182259
content: result.content,
@@ -189,6 +266,10 @@ async function runServer(ocrEnabled: boolean): Promise<void> {
189266
} catch (err) {
190267
const durationMs = performance.now() - start;
191268
const error = err as Error;
269+
270+
// Always log failures to stderr (not gated on DEBUG) so CI logs capture them
271+
process.stderr.write(`[wasm:ERROR] extraction #${extractionCount} failed: ${error.message}\n`);
272+
192273
console.log(
193274
JSON.stringify({
194275
error: error.message,
@@ -221,6 +302,8 @@ async function main(): Promise<void> {
221302
process.exit(1);
222303
}
223304

305+
process.stderr.write(`[wasm] initializing (timeout=${EXTRACTION_TIMEOUT_MS}ms, debug=${DEBUG})\n`);
306+
224307
// Initialize WASM BEFORE timing measurement
225308
await initWasm();
226309

@@ -231,20 +314,24 @@ async function main(): Promise<void> {
231314
if (wasmModule) {
232315
try {
233316
await initializePdfiumAsync(wasmModule);
317+
process.stderr.write("[wasm] PDFium initialized\n");
234318
} catch {
235-
// PDFium not available — PDF extraction will be disabled
319+
process.stderr.write("[wasm] PDFium not available — PDF extraction disabled\n");
236320
}
237321
}
238322

239323
// Enable OCR backend when requested (required for image extraction)
240324
if (ocrEnabled) {
241325
try {
242326
await enableOcr();
327+
process.stderr.write("[wasm] OCR enabled\n");
243328
} catch (err) {
244-
console.error("Failed to enable OCR:", (err as Error).message);
329+
process.stderr.write(`[wasm] Failed to enable OCR: ${(err as Error).message}\n`);
245330
}
246331
}
247332

333+
process.stderr.write("[wasm] initialization complete\n");
334+
248335
const mode = args[0];
249336
const filePaths = args.slice(1);
250337

0 commit comments

Comments
 (0)