fix: BMP false positives and OCR status tracking in the benchmark harness

Goldziher · Goldziher · commit 3cdfb0823d72 · 2026-02-12T07:06:58.000+01:00
- Replace synthetic no-text BMP fixture with text-containing image
  converted from test_hello_world.png for proper OCR benchmarking
- Add an empty-content check to the batch adapter's success determination,
  matching the existing single-mode behavior
- Add _ocr_used field to all 7 binding extraction scripts (Python,
  Ruby, TypeScript, Elixir, PHP, Go, WASM) so the subprocess adapter
  correctly buckets results as with_ocr vs no_ocr
diff --git a/test_documents/images/bmp_24.bmp b/test_documents/images/bmp_24.bmp
diff --git a/tools/benchmark-harness/fixtures/image_bmp.json b/tools/benchmark-harness/fixtures/image_bmp.json
@@ -1,12 +1,12 @@
 {
   "document": "../../../test_documents/images/bmp_24.bmp",
   "file_type": "bmp",
-  "file_size": 120054,
+  "file_size": 480054,
   "expected_frameworks": [
     "kreuzberg"
   ],
   "metadata": {
-    "description": "24-bit BMP image file for format coverage testing",
+    "description": "24-bit BMP image with 'Hello World' text for OCR testing",
     "category": "image",
     "size_class": "small"
   }
diff --git a/tools/benchmark-harness/scripts/kreuzberg_extract.exs b/tools/benchmark-harness/scripts/kreuzberg_extract.exs
@@ -44,7 +44,7 @@ defmodule KreuzbergExtract do
   @doc """
   Extract a single file synchronously.
   """
-  def extract_sync(file_path, config \\ %{}) do
+  def extract_sync(file_path, config \\ %{}, ocr_enabled \\ false) do
     debug_log("=== SYNC EXTRACTION START ===")
     debug_log("Input: file_path=#{file_path}")
     debug_log("File exists: #{File.exists?(file_path)}")
@@ -78,7 +78,8 @@ defmodule KreuzbergExtract do
         payload = %{
           "content" => extraction_result.content,
           "metadata" => struct_to_map(extraction_result.metadata),
-          "_extraction_time_ms" => duration_ms
+          "_extraction_time_ms" => duration_ms,
+          "_ocr_used" => ocr_enabled
         }
 
         json_size = payload |> Jason.encode!() |> byte_size()
@@ -95,7 +96,7 @@ defmodule KreuzbergExtract do
   @doc """
   Extract multiple files in batch mode.
   """
-  def extract_batch(file_paths, config \\ %{}) do
+  def extract_batch(file_paths, config \\ %{}, ocr_enabled \\ false) do
     debug_log("=== BATCH EXTRACTION START ===")
     debug_log("Input: #{length(file_paths)} files")
 
@@ -144,7 +145,8 @@ defmodule KreuzbergExtract do
               "content" => extraction_result.content,
               "metadata" => struct_to_map(extraction_result.metadata),
               "_extraction_time_ms" => per_file_duration_ms,
-              "_batch_total_ms" => total_duration_ms
+              "_batch_total_ms" => total_duration_ms,
+              "_ocr_used" => ocr_enabled
             }
           end)
 
@@ -160,7 +162,7 @@ defmodule KreuzbergExtract do
   @doc """
   Server mode: read paths from stdin, write JSON to stdout.
   """
-  def run_server(config \\ %{}) do
+  def run_server(config \\ %{}, ocr_enabled \\ false) do
     debug_log("=== SERVER MODE START ===")
 
     # Signal readiness after BEAM VM + NIF initialization
@@ -173,7 +175,7 @@ defmodule KreuzbergExtract do
       debug_log("Processing file: #{file_path}")
 
       try do
-        case extract_sync(file_path, config) do
+        case extract_sync(file_path, config, ocr_enabled) do
           {:ok, payload} ->
             json = Jason.encode!(payload)
             IO.write(json)
@@ -182,7 +184,8 @@ defmodule KreuzbergExtract do
           {:error, reason} ->
             error_payload = %{
               "error" => inspect(reason),
-              "_extraction_time_ms" => 0
+              "_extraction_time_ms" => 0,
+              "_ocr_used" => ocr_enabled
             }
 
             json = Jason.encode!(error_payload)
@@ -193,7 +196,8 @@ defmodule KreuzbergExtract do
         e ->
           error_payload = %{
             "error" => inspect(e),
-            "_extraction_time_ms" => 0
+            "_extraction_time_ms" => 0,
+            "_ocr_used" => ocr_enabled
           }
 
           json = Jason.encode!(error_payload)
@@ -246,7 +250,7 @@ defmodule KreuzbergExtract do
         case mode do
           "server" ->
             debug_log("Executing server mode")
-            run_server(config)
+            run_server(config, ocr_enabled)
 
           "sync" ->
             if length(file_paths) != 1 do
@@ -256,7 +260,7 @@ defmodule KreuzbergExtract do
 
             debug_log("Executing sync mode with file: #{hd(file_paths)}")
 
-            case extract_sync(hd(file_paths), config) do
+            case extract_sync(hd(file_paths), config, ocr_enabled) do
               {:ok, payload} ->
                 json = Jason.encode!(payload)
                 debug_log("Output JSON: #{json}")
@@ -275,7 +279,7 @@ defmodule KreuzbergExtract do
 
             debug_log("Executing batch mode with #{length(file_paths)} files")
 
-            case extract_batch(file_paths, config) do
+            case extract_batch(file_paths, config, ocr_enabled) do
               {:ok, results} ->
                 json =
                   if length(file_paths) == 1 do
diff --git a/tools/benchmark-harness/scripts/kreuzberg_extract.php b/tools/benchmark-harness/scripts/kreuzberg_extract.php
@@ -49,7 +49,7 @@ function debug_log(string $message): void
 /**
  * Extract a single file synchronously
  */
-function extract_sync(string $filePath, ?ExtractionConfig $config = null): array
+function extract_sync(string $filePath, ?ExtractionConfig $config = null, bool $ocrEnabled = false): array
 {
     debug_log("=== SYNC EXTRACTION START ===");
     debug_log("Input: file_path={$filePath}");
@@ -82,6 +82,7 @@ function extract_sync(string $filePath, ?ExtractionConfig $config = null): array
         'content' => $result->content,
         'metadata' => $result->metadata ?? [],
         '_extraction_time_ms' => $durationMs,
+        '_ocr_used' => $ocrEnabled,
     ];
 
     debug_log("Output JSON size: " . strlen(json_encode($payload)) . " bytes");
@@ -93,7 +94,7 @@ function extract_sync(string $filePath, ?ExtractionConfig $config = null): array
 /**
  * Extract multiple files in batch
  */
-function extract_batch(array $filePaths, ?ExtractionConfig $config = null): array
+function extract_batch(array $filePaths, ?ExtractionConfig $config = null, bool $ocrEnabled = false): array
 {
     debug_log("=== BATCH EXTRACTION START ===");
     debug_log("Input: " . count($filePaths) . " files");
@@ -132,6 +133,7 @@ function extract_batch(array $filePaths, ?ExtractionConfig $config = null): arra
             'metadata' => $result->metadata ?? [],
             '_extraction_time_ms' => $perFileDurationMs,
             '_batch_total_ms' => $totalDurationMs,
+            '_ocr_used' => $ocrEnabled,
         ];
     }
 
@@ -143,7 +145,7 @@ function extract_batch(array $filePaths, ?ExtractionConfig $config = null): arra
 /**
  * Server mode: read paths from stdin, write JSON to stdout
  */
-function run_server(?ExtractionConfig $config = null): void
+function run_server(?ExtractionConfig $config = null, bool $ocrEnabled = false): void
 {
     debug_log("=== SERVER MODE START ===");
 
@@ -173,6 +175,7 @@ function run_server(?ExtractionConfig $config = null): void
                 'content' => $result->content,
                 'metadata' => $result->metadata ?? [],
                 '_extraction_time_ms' => $durationMs,
+                '_ocr_used' => $ocrEnabled,
             ];
 
             echo json_encode($payload, JSON_THROW_ON_ERROR) . "\n";
@@ -181,6 +184,7 @@ function run_server(?ExtractionConfig $config = null): void
             $errorPayload = [
                 'error' => $e->getMessage(),
                 '_extraction_time_ms' => 0,
+                '_ocr_used' => $ocrEnabled,
             ];
             echo json_encode($errorPayload, JSON_THROW_ON_ERROR) . "\n";
             fflush(STDOUT);
@@ -237,7 +241,7 @@ function main(): void
         switch ($mode) {
             case 'server':
                 debug_log("Executing server mode");
-                run_server($config);
+                run_server($config, $ocrEnabled);
                 break;
 
             case 'sync':
@@ -246,7 +250,7 @@ function main(): void
                     exit(1);
                 }
                 debug_log("Executing sync mode with file: {$filePaths[0]}");
-                $payload = extract_sync($filePaths[0], $config);
+                $payload = extract_sync($filePaths[0], $config, $ocrEnabled);
                 $output = json_encode($payload, JSON_THROW_ON_ERROR);
                 debug_log("Output JSON: {$output}");
                 echo $output;
@@ -259,7 +263,7 @@ function main(): void
                 }
                 debug_log("Executing batch mode with " . count($filePaths) . " files");
 
-                $results = extract_batch($filePaths, $config);
+                $results = extract_batch($filePaths, $config, $ocrEnabled);
 
                 if (count($filePaths) === 1) {
                     $output = json_encode($results[0], JSON_THROW_ON_ERROR);
diff --git a/tools/benchmark-harness/scripts/kreuzberg_extract.py b/tools/benchmark-harness/scripts/kreuzberg_extract.py
@@ -39,6 +39,7 @@ def extract_sync(file_path: str, ocr_enabled: bool) -> dict[str, Any]:
         "content": result.content,
         "metadata": result.metadata or {},
         "_extraction_time_ms": duration_ms,
+        "_ocr_used": ocr_enabled,
     }
 
 
@@ -57,6 +58,7 @@ async def extract_async(file_path: str, ocr_enabled: bool) -> dict[str, Any]:
         "content": result.content,
         "metadata": result.metadata or {},
         "_extraction_time_ms": duration_ms,
+        "_ocr_used": ocr_enabled,
     }
 
 
@@ -79,6 +81,7 @@ def extract_batch_sync(file_paths: list[str], ocr_enabled: bool) -> list[dict[st
             "metadata": result.metadata or {},
             "_extraction_time_ms": per_file_duration_ms,
             "_batch_total_ms": total_duration_ms,
+            "_ocr_used": ocr_enabled,
         }
         for result in results
     ]
@@ -98,7 +101,7 @@ def run_server(ocr_enabled: bool) -> None:
             print(json.dumps(payload), flush=True)
         except Exception as e:
             duration_ms = (time.perf_counter() - start) * 1000.0
-            print(json.dumps({"error": str(e), "_extraction_time_ms": duration_ms}), flush=True)
+            print(json.dumps({"error": str(e), "_extraction_time_ms": duration_ms, "_ocr_used": ocr_enabled}), flush=True)
 
 
 def main() -> None:
diff --git a/tools/benchmark-harness/scripts/kreuzberg_extract.rb b/tools/benchmark-harness/scripts/kreuzberg_extract.rb
@@ -79,7 +79,8 @@ def extract_sync(file_path, config = {})
   payload = {
     content: result.content,
     metadata: result.metadata || {},
-    _extraction_time_ms: duration_ms
+    _extraction_time_ms: duration_ms,
+    _ocr_used: config.dig(:ocr, :enabled) || false
   }
 
   debug_log "Output JSON size: #{JSON.generate(payload).bytesize} bytes"
@@ -124,7 +125,8 @@ def extract_batch(file_paths, config = {})
       content: result.content,
       metadata: result.metadata || {},
       _extraction_time_ms: per_file_duration_ms,
-      _batch_total_ms: total_duration_ms
+      _batch_total_ms: total_duration_ms,
+      _ocr_used: config.dig(:ocr, :enabled) || false
     }
   end
 
@@ -162,15 +164,17 @@ def extract_server(ocr_enabled)
       payload = {
         content: result.content,
         metadata: result.metadata || {},
-        _extraction_time_ms: duration_ms
+        _extraction_time_ms: duration_ms,
+        _ocr_used: ocr_enabled
       }
 
       puts JSON.generate(payload)
       $stdout.flush
     rescue StandardError => e
       error_payload = {
         error: e.message,
-        _extraction_time_ms: 0
+        _extraction_time_ms: 0,
+        _ocr_used: ocr_enabled
       }
       puts JSON.generate(error_payload)
       $stdout.flush
diff --git a/tools/benchmark-harness/scripts/kreuzberg_extract.ts b/tools/benchmark-harness/scripts/kreuzberg_extract.ts
@@ -17,6 +17,7 @@ interface ExtractionOutput {
 	metadata: Record<string, unknown>;
 	_extraction_time_ms: number;
 	_batch_total_ms?: number;
+	_ocr_used: boolean;
 }
 
 function createConfig(ocrEnabled: boolean): ExtractionConfig {
@@ -36,6 +37,7 @@ async function extractAsync(filePath: string, ocrEnabled: boolean): Promise<Extr
 		content: result.content,
 		metadata: result.metadata || {},
 		_extraction_time_ms: durationMs,
+		_ocr_used: ocrEnabled,
 	};
 }
 
@@ -52,6 +54,7 @@ async function extractBatch(filePaths: string[], ocrEnabled: boolean): Promise<E
 		metadata: result.metadata || {},
 		_extraction_time_ms: perFileDurationMs,
 		_batch_total_ms: totalDurationMs,
+		_ocr_used: ocrEnabled,
 	}));
 }
 
@@ -89,7 +92,7 @@ async function runServer(ocrEnabled: boolean): Promise<void> {
 		} catch (err) {
 			const durationMs = performance.now() - start;
 			const error = err as Error;
-			console.log(JSON.stringify({ error: error.message, _extraction_time_ms: durationMs }));
+			console.log(JSON.stringify({ error: error.message, _extraction_time_ms: durationMs, _ocr_used: ocrEnabled }));
 		}
 	}
 }
diff --git a/tools/benchmark-harness/scripts/kreuzberg_extract_go.go b/tools/benchmark-harness/scripts/kreuzberg_extract_go.go
@@ -24,6 +24,7 @@ type payload struct {
 	Metadata         map[string]any `json:"metadata"`
 	ExtractionTimeMs float64        `json:"_extraction_time_ms"`
 	BatchTotalTimeMs float64        `json:"_batch_total_ms,omitempty"`
+	OcrUsed          bool           `json:"_ocr_used"`
 }
 
 func main() {
@@ -126,30 +127,31 @@ func runServer(ocrEnabled bool) {
 		absPath, err := filepath.Abs(filePath)
 		if err != nil {
 			debug("Failed to resolve path %s: %v", filePath, err)
-			mustEncodeError(err)
+			mustEncodeError(err, ocrEnabled)
 			continue
 		}
 
 		start := time.Now()
 		result, err := kz.ExtractFileSync(absPath, config)
 		if err != nil {
 			debug("Extraction failed for %s: %v", absPath, err)
-			mustEncodeError(err)
+			mustEncodeError(err, ocrEnabled)
 			continue
 		}
 
 		elapsed := time.Since(start).Seconds() * 1000.0
 		meta, err := metadataMap(result.Metadata)
 		if err != nil {
 			debug("metadataMap failed: %v", err)
-			mustEncodeError(err)
+			mustEncodeError(err, ocrEnabled)
 			continue
 		}
 
 		p := &payload{
 			Content:          result.Content,
 			Metadata:         meta,
 			ExtractionTimeMs: elapsed,
+			OcrUsed:          ocrEnabled,
 		}
 		mustEncodeNoNewline(p)
 		fmt.Println()
@@ -189,6 +191,7 @@ func extractSync(path string, ocrEnabled bool) (*payload, error) {
 		Content:          result.Content,
 		Metadata:         meta,
 		ExtractionTimeMs: elapsed,
+		OcrUsed:          ocrEnabled,
 	}, nil
 }
 
@@ -225,6 +228,7 @@ func extractBatch(paths []string, ocrEnabled bool) (any, error) {
 			Metadata:         meta,
 			ExtractionTimeMs: totalMs,
 			BatchTotalTimeMs: totalMs,
+			OcrUsed:          ocrEnabled,
 		}, nil
 	}
 
@@ -243,6 +247,7 @@ func extractBatch(paths []string, ocrEnabled bool) (any, error) {
 			Metadata:         meta,
 			ExtractionTimeMs: perMs,
 			BatchTotalTimeMs: totalMs,
+			OcrUsed:          ocrEnabled,
 		})
 	}
 	return out, nil
@@ -289,10 +294,11 @@ func mustEncodeNoNewline(value any) {
 	}
 }
 
-func mustEncodeError(err error) {
+func mustEncodeError(err error, ocrEnabled bool) {
 	errorMap := map[string]interface{}{
-		"error":                 err.Error(),
+		"error":               err.Error(),
 		"_extraction_time_ms": 0,
+		"_ocr_used":           ocrEnabled,
 	}
 	data, marshalErr := json.Marshal(errorMap)
 	if marshalErr != nil {
diff --git a/tools/benchmark-harness/scripts/kreuzberg_extract_wasm.ts b/tools/benchmark-harness/scripts/kreuzberg_extract_wasm.ts
diff --git a/tools/benchmark-harness/src/adapters/native.rs b/tools/benchmark-harness/src/adapters/native.rs

Original file line number	Diff line number	Diff line change
`@@ -39,6 +39,7 @@ def extract_sync(file_path: str, ocr_enabled: bool) -> dict[str, Any]:`
`39`	`39`	`"content": result.content,`
`40`	`40`	`"metadata": result.metadata or {},`
`41`	`41`	`"_extraction_time_ms": duration_ms,`
	`42`	`+ "_ocr_used": ocr_enabled,`
`42`	`43`	`}`
`43`	`44`
`44`	`45`
`@@ -57,6 +58,7 @@ async def extract_async(file_path: str, ocr_enabled: bool) -> dict[str, Any]:`
`57`	`58`	`"content": result.content,`
`58`	`59`	`"metadata": result.metadata or {},`
`59`	`60`	`"_extraction_time_ms": duration_ms,`
	`61`	`+ "_ocr_used": ocr_enabled,`
`60`	`62`	`}`
`61`	`63`
`62`	`64`
`@@ -79,6 +81,7 @@ def extract_batch_sync(file_paths: list[str], ocr_enabled: bool) -> list[dict[st`
`79`	`81`	`"metadata": result.metadata or {},`
`80`	`82`	`"_extraction_time_ms": per_file_duration_ms,`
`81`	`83`	`"_batch_total_ms": total_duration_ms,`
	`84`	`+ "_ocr_used": ocr_enabled,`
`82`	`85`	`}`
`83`	`86`	`for result in results`
`84`	`87`	`]`
`@@ -98,7 +101,7 @@ def run_server(ocr_enabled: bool) -> None:`
`98`	`101`	`print(json.dumps(payload), flush=True)`
`99`	`102`	`except Exception as e:`
`100`	`103`	`duration_ms = (time.perf_counter() - start) * 1000.0`
`101`		`- print(json.dumps({"error": str(e), "_extraction_time_ms": duration_ms}), flush=True)`
	`104`	`+ print(json.dumps({"error": str(e), "_extraction_time_ms": duration_ms, "_ocr_used": ocr_enabled}), flush=True)`
`102`	`105`
`103`	`106`
`104`	`107`	`def main() -> None:`
Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@ interface ExtractionOutput {`
`17`	`17`	`metadata: Record<string, unknown>;`
`18`	`18`	`_extraction_time_ms: number;`
`19`	`19`	`_batch_total_ms?: number;`
	`20`	`+ _ocr_used: boolean;`
`20`	`21`	`}`
`21`	`22`
`22`	`23`	`function createConfig(ocrEnabled: boolean): ExtractionConfig {`
`@@ -36,6 +37,7 @@ async function extractAsync(filePath: string, ocrEnabled: boolean): Promise<Extr`
`36`	`37`	`content: result.content,`
`37`	`38`	`metadata: result.metadata \|\| {},`
`38`	`39`	`_extraction_time_ms: durationMs,`
	`40`	`+ _ocr_used: ocrEnabled,`
`39`	`41`	`};`
`40`	`42`	`}`
`41`	`43`
`@@ -52,6 +54,7 @@ async function extractBatch(filePaths: string[], ocrEnabled: boolean): Promise<E`
`52`	`54`	`metadata: result.metadata \|\| {},`
`53`	`55`	`_extraction_time_ms: perFileDurationMs,`
`54`	`56`	`_batch_total_ms: totalDurationMs,`
	`57`	`+ _ocr_used: ocrEnabled,`
`55`	`58`	`}));`
`56`	`59`	`}`
`57`	`60`
`@@ -89,7 +92,7 @@ async function runServer(ocrEnabled: boolean): Promise<void> {`
`89`	`92`	`} catch (err) {`
`90`	`93`	`const durationMs = performance.now() - start;`
`91`	`94`	`const error = err as Error;`
`92`		`- console.log(JSON.stringify({ error: error.message, _extraction_time_ms: durationMs }));`
	`95`	`+ console.log(JSON.stringify({ error: error.message, _extraction_time_ms: durationMs, _ocr_used: ocrEnabled }));`
`93`	`96`	`}`
`94`	`97`	`}`
`95`	`98`	`}`