11#!/usr/bin/env tsx
22import * as readline from "node:readline" ;
33import * as path from "node:path" ;
4+ import * as fs from "node:fs" ;
45import {
56 enableOcr ,
67 extractFile ,
@@ -10,6 +11,32 @@ import {
1011 type ExtractionConfig ,
1112} from "@kreuzberg/wasm" ;
1213
14+ /** Default per-extraction timeout in milliseconds (5 minutes). */
15+ const DEFAULT_TIMEOUT_MS = 300_000 ;
16+
17+ /** Parse EXTRACTION_TIMEOUT_MS from env, falling back to the default. */
18+ const EXTRACTION_TIMEOUT_MS : number = ( ( ) => {
19+ const env = process . env [ "EXTRACTION_TIMEOUT_MS" ] ;
20+ if ( env ) {
21+ const parsed = Number . parseInt ( env , 10 ) ;
22+ if ( ! Number . isNaN ( parsed ) && parsed > 0 ) return parsed ;
23+ }
24+ return DEFAULT_TIMEOUT_MS ;
25+ } ) ( ) ;
26+
27+ /** Whether debug logging is enabled (BENCHMARK_DEBUG env var). */
28+ const DEBUG = ! ! process . env [ "BENCHMARK_DEBUG" ] ;
29+
30+ /** Running extraction counter for diagnostics. */
31+ let extractionCount = 0 ;
32+
33+ function log ( msg : string ) : void {
34+ if ( DEBUG ) {
35+ const mem = ( process . memoryUsage ( ) . rss / 1024 / 1024 ) . toFixed ( 0 ) ;
36+ process . stderr . write ( `[wasm:${ extractionCount } :${ mem } MB] ${ msg } \n` ) ;
37+ }
38+ }
39+
1340interface ExtractionOutput {
1441 content : string ;
1542 metadata : Record < string , unknown > ;
@@ -103,11 +130,50 @@ function createConfig(ocrEnabled: boolean, forceOcr?: boolean): ExtractionConfig
103130 } ;
104131}
105132
133+ /**
134+ * Race an extraction promise against a timeout.
135+ * On timeout, returns a rejected promise with a descriptive error including
136+ * the file path, MIME type, file size, and memory usage for debugging.
137+ */
138+ async function withTimeout < T > ( promise : Promise < T > , filePath : string , mimeType : string | null ) : Promise < T > {
139+ let timer : ReturnType < typeof setTimeout > ;
140+ const timeoutPromise = new Promise < never > ( ( _resolve , reject ) => {
141+ timer = setTimeout ( ( ) => {
142+ const fileSize = ( ( ) => {
143+ try {
144+ return fs . statSync ( filePath ) . size ;
145+ } catch {
146+ return - 1 ;
147+ }
148+ } ) ( ) ;
149+ const mem = process . memoryUsage ( ) ;
150+ reject (
151+ new Error (
152+ `Extraction timed out after ${ EXTRACTION_TIMEOUT_MS } ms` +
153+ ` | file=${ filePath } ` +
154+ ` | mime=${ mimeType ?? "unknown" } ` +
155+ ` | fileSize=${ fileSize } ` +
156+ ` | rss=${ mem . rss } ` +
157+ ` | heapUsed=${ mem . heapUsed } ` +
158+ ` | heapTotal=${ mem . heapTotal } ` +
159+ ` | extractionCount=${ extractionCount } ` ,
160+ ) ,
161+ ) ;
162+ } , EXTRACTION_TIMEOUT_MS ) ;
163+ } ) ;
164+
165+ try {
166+ return await Promise . race ( [ promise , timeoutPromise ] ) ;
167+ } finally {
168+ clearTimeout ( timer ! ) ;
169+ }
170+ }
171+
106172async function extractAsync ( filePath : string , ocrEnabled : boolean ) : Promise < ExtractionOutput > {
107173 const config = createConfig ( ocrEnabled ) ;
108174 const mimeType = guessMimeType ( filePath ) ;
109175 const start = performance . now ( ) ;
110- const result = await extractFile ( filePath , mimeType , config ) ;
176+ const result = await withTimeout ( extractFile ( filePath , mimeType , config ) , filePath , mimeType ) ;
111177 const durationMs = performance . now ( ) - start ;
112178
113179 const metadata = ( result . metadata as Record < string , unknown > ) ?? { } ;
@@ -123,7 +189,12 @@ async function extractAsync(filePath: string, ocrEnabled: boolean): Promise<Extr
123189async function extractBatch ( filePaths : string [ ] , ocrEnabled : boolean ) : Promise < ExtractionOutput [ ] > {
124190 const config = createConfig ( ocrEnabled ) ;
125191 const start = performance . now ( ) ;
126- const results = await Promise . all ( filePaths . map ( ( fp ) => extractFile ( fp , guessMimeType ( fp ) , config ) ) ) ;
192+ const results = await Promise . all (
193+ filePaths . map ( ( fp ) => {
194+ const mime = guessMimeType ( fp ) ;
195+ return withTimeout ( extractFile ( fp , mime , config ) , fp , mime ) ;
196+ } ) ,
197+ ) ;
127198 const totalDurationMs = performance . now ( ) - start ;
128199
129200 const perFileDurationMs = filePaths . length > 0 ? totalDurationMs / filePaths . length : 0 ;
@@ -170,13 +241,19 @@ async function runServer(ocrEnabled: boolean): Promise<void> {
170241 if ( ! filePath ) {
171242 continue ;
172243 }
244+
245+ extractionCount ++ ;
246+ const mimeType = guessMimeType ( filePath ) ;
247+ log ( `START ${ filePath } (mime=${ mimeType ?? "unknown" } )` ) ;
248+
173249 const start = performance . now ( ) ;
174250 try {
175251 const config = createConfig ( ocrEnabled || forceOcr , forceOcr ) ;
176- const mimeType = guessMimeType ( filePath ) ;
177- const result = await extractFile ( filePath , mimeType , config ) ;
252+ const result = await withTimeout ( extractFile ( filePath , mimeType , config ) , filePath , mimeType ) ;
178253 const durationMs = performance . now ( ) - start ;
179254
255+ log ( `OK ${ filePath } (${ durationMs . toFixed ( 1 ) } ms, ${ result . content . length } chars)` ) ;
256+
180257 const metadata = ( result . metadata as Record < string , unknown > ) ?? { } ;
181258 const payload : ExtractionOutput = {
182259 content : result . content ,
@@ -189,6 +266,10 @@ async function runServer(ocrEnabled: boolean): Promise<void> {
189266 } catch ( err ) {
190267 const durationMs = performance . now ( ) - start ;
191268 const error = err as Error ;
269+
270+ // Always log failures to stderr (not gated on DEBUG) so CI logs capture them
271+ process . stderr . write ( `[wasm:ERROR] extraction #${ extractionCount } failed: ${ error . message } \n` ) ;
272+
192273 console . log (
193274 JSON . stringify ( {
194275 error : error . message ,
@@ -221,6 +302,8 @@ async function main(): Promise<void> {
221302 process . exit ( 1 ) ;
222303 }
223304
305+ process . stderr . write ( `[wasm] initializing (timeout=${ EXTRACTION_TIMEOUT_MS } ms, debug=${ DEBUG } )\n` ) ;
306+
224307 // Initialize WASM BEFORE timing measurement
225308 await initWasm ( ) ;
226309
@@ -231,20 +314,24 @@ async function main(): Promise<void> {
231314 if ( wasmModule ) {
232315 try {
233316 await initializePdfiumAsync ( wasmModule ) ;
317+ process . stderr . write ( "[wasm] PDFium initialized\n" ) ;
234318 } catch {
235- // PDFium not available — PDF extraction will be disabled
319+ process . stderr . write ( "[wasm] PDFium not available — PDF extraction disabled\n" ) ;
236320 }
237321 }
238322
239323 // Enable OCR backend when requested (required for image extraction)
240324 if ( ocrEnabled ) {
241325 try {
242326 await enableOcr ( ) ;
327+ process . stderr . write ( "[wasm] OCR enabled\n" ) ;
243328 } catch ( err ) {
244- console . error ( " Failed to enable OCR:" , ( err as Error ) . message ) ;
329+ process . stderr . write ( `[wasm] Failed to enable OCR: ${ ( err as Error ) . message } \n` ) ;
245330 }
246331 }
247332
333+ process . stderr . write ( "[wasm] initialization complete\n" ) ;
334+
248335 const mode = args [ 0 ] ;
249336 const filePaths = args . slice ( 1 ) ;
250337
0 commit comments