Skip to content

Commit 3cdfb08

Browse files
committed
fix: benchmark BMP false positives and OCR status tracking
- Replace synthetic no-text BMP fixture with text-containing image converted from test_hello_world.png for proper OCR benchmarking
- Add empty content check to batch adapter success determination, matching the existing single-mode behavior
- Add _ocr_used field to all 7 binding extraction scripts (Python, Ruby, TypeScript, Elixir, PHP, Go, WASM) so the subprocess adapter correctly buckets results as with_ocr vs no_ocr
1 parent 8592898 commit 3cdfb08

File tree

10 files changed

+69
-39
lines changed

10 files changed

+69
-39
lines changed

test_documents/images/bmp_24.bmp

352 KB
Binary file not shown.

tools/benchmark-harness/fixtures/image_bmp.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
{
22
"document": "../../../test_documents/images/bmp_24.bmp",
33
"file_type": "bmp",
4-
"file_size": 120054,
4+
"file_size": 480054,
55
"expected_frameworks": [
66
"kreuzberg"
77
],
88
"metadata": {
9-
"description": "24-bit BMP image file for format coverage testing",
9+
"description": "24-bit BMP image with 'Hello World' text for OCR testing",
1010
"category": "image",
1111
"size_class": "small"
1212
}

tools/benchmark-harness/scripts/kreuzberg_extract.exs

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ defmodule KreuzbergExtract do
4444
@doc """
4545
Extract a single file synchronously.
4646
"""
47-
def extract_sync(file_path, config \\ %{}) do
47+
def extract_sync(file_path, config \\ %{}, ocr_enabled \\ false) do
4848
debug_log("=== SYNC EXTRACTION START ===")
4949
debug_log("Input: file_path=#{file_path}")
5050
debug_log("File exists: #{File.exists?(file_path)}")
@@ -78,7 +78,8 @@ defmodule KreuzbergExtract do
7878
payload = %{
7979
"content" => extraction_result.content,
8080
"metadata" => struct_to_map(extraction_result.metadata),
81-
"_extraction_time_ms" => duration_ms
81+
"_extraction_time_ms" => duration_ms,
82+
"_ocr_used" => ocr_enabled
8283
}
8384

8485
json_size = payload |> Jason.encode!() |> byte_size()
@@ -95,7 +96,7 @@ defmodule KreuzbergExtract do
9596
@doc """
9697
Extract multiple files in batch mode.
9798
"""
98-
def extract_batch(file_paths, config \\ %{}) do
99+
def extract_batch(file_paths, config \\ %{}, ocr_enabled \\ false) do
99100
debug_log("=== BATCH EXTRACTION START ===")
100101
debug_log("Input: #{length(file_paths)} files")
101102

@@ -144,7 +145,8 @@ defmodule KreuzbergExtract do
144145
"content" => extraction_result.content,
145146
"metadata" => struct_to_map(extraction_result.metadata),
146147
"_extraction_time_ms" => per_file_duration_ms,
147-
"_batch_total_ms" => total_duration_ms
148+
"_batch_total_ms" => total_duration_ms,
149+
"_ocr_used" => ocr_enabled
148150
}
149151
end)
150152

@@ -160,7 +162,7 @@ defmodule KreuzbergExtract do
160162
@doc """
161163
Server mode: read paths from stdin, write JSON to stdout.
162164
"""
163-
def run_server(config \\ %{}) do
165+
def run_server(config \\ %{}, ocr_enabled \\ false) do
164166
debug_log("=== SERVER MODE START ===")
165167

166168
# Signal readiness after BEAM VM + NIF initialization
@@ -173,7 +175,7 @@ defmodule KreuzbergExtract do
173175
debug_log("Processing file: #{file_path}")
174176

175177
try do
176-
case extract_sync(file_path, config) do
178+
case extract_sync(file_path, config, ocr_enabled) do
177179
{:ok, payload} ->
178180
json = Jason.encode!(payload)
179181
IO.write(json)
@@ -182,7 +184,8 @@ defmodule KreuzbergExtract do
182184
{:error, reason} ->
183185
error_payload = %{
184186
"error" => inspect(reason),
185-
"_extraction_time_ms" => 0
187+
"_extraction_time_ms" => 0,
188+
"_ocr_used" => ocr_enabled
186189
}
187190

188191
json = Jason.encode!(error_payload)
@@ -193,7 +196,8 @@ defmodule KreuzbergExtract do
193196
e ->
194197
error_payload = %{
195198
"error" => inspect(e),
196-
"_extraction_time_ms" => 0
199+
"_extraction_time_ms" => 0,
200+
"_ocr_used" => ocr_enabled
197201
}
198202

199203
json = Jason.encode!(error_payload)
@@ -246,7 +250,7 @@ defmodule KreuzbergExtract do
246250
case mode do
247251
"server" ->
248252
debug_log("Executing server mode")
249-
run_server(config)
253+
run_server(config, ocr_enabled)
250254

251255
"sync" ->
252256
if length(file_paths) != 1 do
@@ -256,7 +260,7 @@ defmodule KreuzbergExtract do
256260

257261
debug_log("Executing sync mode with file: #{hd(file_paths)}")
258262

259-
case extract_sync(hd(file_paths), config) do
263+
case extract_sync(hd(file_paths), config, ocr_enabled) do
260264
{:ok, payload} ->
261265
json = Jason.encode!(payload)
262266
debug_log("Output JSON: #{json}")
@@ -275,7 +279,7 @@ defmodule KreuzbergExtract do
275279

276280
debug_log("Executing batch mode with #{length(file_paths)} files")
277281

278-
case extract_batch(file_paths, config) do
282+
case extract_batch(file_paths, config, ocr_enabled) do
279283
{:ok, results} ->
280284
json =
281285
if length(file_paths) == 1 do

tools/benchmark-harness/scripts/kreuzberg_extract.php

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ function debug_log(string $message): void
4949
/**
5050
* Extract a single file synchronously
5151
*/
52-
function extract_sync(string $filePath, ?ExtractionConfig $config = null): array
52+
function extract_sync(string $filePath, ?ExtractionConfig $config = null, bool $ocrEnabled = false): array
5353
{
5454
debug_log("=== SYNC EXTRACTION START ===");
5555
debug_log("Input: file_path={$filePath}");
@@ -82,6 +82,7 @@ function extract_sync(string $filePath, ?ExtractionConfig $config = null): array
8282
'content' => $result->content,
8383
'metadata' => $result->metadata ?? [],
8484
'_extraction_time_ms' => $durationMs,
85+
'_ocr_used' => $ocrEnabled,
8586
];
8687

8788
debug_log("Output JSON size: " . strlen(json_encode($payload)) . " bytes");
@@ -93,7 +94,7 @@ function extract_sync(string $filePath, ?ExtractionConfig $config = null): array
9394
/**
9495
* Extract multiple files in batch
9596
*/
96-
function extract_batch(array $filePaths, ?ExtractionConfig $config = null): array
97+
function extract_batch(array $filePaths, ?ExtractionConfig $config = null, bool $ocrEnabled = false): array
9798
{
9899
debug_log("=== BATCH EXTRACTION START ===");
99100
debug_log("Input: " . count($filePaths) . " files");
@@ -132,6 +133,7 @@ function extract_batch(array $filePaths, ?ExtractionConfig $config = null): arra
132133
'metadata' => $result->metadata ?? [],
133134
'_extraction_time_ms' => $perFileDurationMs,
134135
'_batch_total_ms' => $totalDurationMs,
136+
'_ocr_used' => $ocrEnabled,
135137
];
136138
}
137139

@@ -143,7 +145,7 @@ function extract_batch(array $filePaths, ?ExtractionConfig $config = null): arra
143145
/**
144146
* Server mode: read paths from stdin, write JSON to stdout
145147
*/
146-
function run_server(?ExtractionConfig $config = null): void
148+
function run_server(?ExtractionConfig $config = null, bool $ocrEnabled = false): void
147149
{
148150
debug_log("=== SERVER MODE START ===");
149151

@@ -173,6 +175,7 @@ function run_server(?ExtractionConfig $config = null): void
173175
'content' => $result->content,
174176
'metadata' => $result->metadata ?? [],
175177
'_extraction_time_ms' => $durationMs,
178+
'_ocr_used' => $ocrEnabled,
176179
];
177180

178181
echo json_encode($payload, JSON_THROW_ON_ERROR) . "\n";
@@ -181,6 +184,7 @@ function run_server(?ExtractionConfig $config = null): void
181184
$errorPayload = [
182185
'error' => $e->getMessage(),
183186
'_extraction_time_ms' => 0,
187+
'_ocr_used' => $ocrEnabled,
184188
];
185189
echo json_encode($errorPayload, JSON_THROW_ON_ERROR) . "\n";
186190
fflush(STDOUT);
@@ -237,7 +241,7 @@ function main(): void
237241
switch ($mode) {
238242
case 'server':
239243
debug_log("Executing server mode");
240-
run_server($config);
244+
run_server($config, $ocrEnabled);
241245
break;
242246

243247
case 'sync':
@@ -246,7 +250,7 @@ function main(): void
246250
exit(1);
247251
}
248252
debug_log("Executing sync mode with file: {$filePaths[0]}");
249-
$payload = extract_sync($filePaths[0], $config);
253+
$payload = extract_sync($filePaths[0], $config, $ocrEnabled);
250254
$output = json_encode($payload, JSON_THROW_ON_ERROR);
251255
debug_log("Output JSON: {$output}");
252256
echo $output;
@@ -259,7 +263,7 @@ function main(): void
259263
}
260264
debug_log("Executing batch mode with " . count($filePaths) . " files");
261265

262-
$results = extract_batch($filePaths, $config);
266+
$results = extract_batch($filePaths, $config, $ocrEnabled);
263267

264268
if (count($filePaths) === 1) {
265269
$output = json_encode($results[0], JSON_THROW_ON_ERROR);

tools/benchmark-harness/scripts/kreuzberg_extract.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def extract_sync(file_path: str, ocr_enabled: bool) -> dict[str, Any]:
3939
"content": result.content,
4040
"metadata": result.metadata or {},
4141
"_extraction_time_ms": duration_ms,
42+
"_ocr_used": ocr_enabled,
4243
}
4344

4445

@@ -57,6 +58,7 @@ async def extract_async(file_path: str, ocr_enabled: bool) -> dict[str, Any]:
5758
"content": result.content,
5859
"metadata": result.metadata or {},
5960
"_extraction_time_ms": duration_ms,
61+
"_ocr_used": ocr_enabled,
6062
}
6163

6264

@@ -79,6 +81,7 @@ def extract_batch_sync(file_paths: list[str], ocr_enabled: bool) -> list[dict[st
7981
"metadata": result.metadata or {},
8082
"_extraction_time_ms": per_file_duration_ms,
8183
"_batch_total_ms": total_duration_ms,
84+
"_ocr_used": ocr_enabled,
8285
}
8386
for result in results
8487
]
@@ -98,7 +101,7 @@ def run_server(ocr_enabled: bool) -> None:
98101
print(json.dumps(payload), flush=True)
99102
except Exception as e:
100103
duration_ms = (time.perf_counter() - start) * 1000.0
101-
print(json.dumps({"error": str(e), "_extraction_time_ms": duration_ms}), flush=True)
104+
print(json.dumps({"error": str(e), "_extraction_time_ms": duration_ms, "_ocr_used": ocr_enabled}), flush=True)
102105

103106

104107
def main() -> None:

tools/benchmark-harness/scripts/kreuzberg_extract.rb

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ def extract_sync(file_path, config = {})
7979
payload = {
8080
content: result.content,
8181
metadata: result.metadata || {},
82-
_extraction_time_ms: duration_ms
82+
_extraction_time_ms: duration_ms,
83+
_ocr_used: config.dig(:ocr, :enabled) || false
8384
}
8485

8586
debug_log "Output JSON size: #{JSON.generate(payload).bytesize} bytes"
@@ -124,7 +125,8 @@ def extract_batch(file_paths, config = {})
124125
content: result.content,
125126
metadata: result.metadata || {},
126127
_extraction_time_ms: per_file_duration_ms,
127-
_batch_total_ms: total_duration_ms
128+
_batch_total_ms: total_duration_ms,
129+
_ocr_used: config.dig(:ocr, :enabled) || false
128130
}
129131
end
130132

@@ -162,15 +164,17 @@ def extract_server(ocr_enabled)
162164
payload = {
163165
content: result.content,
164166
metadata: result.metadata || {},
165-
_extraction_time_ms: duration_ms
167+
_extraction_time_ms: duration_ms,
168+
_ocr_used: ocr_enabled
166169
}
167170

168171
puts JSON.generate(payload)
169172
$stdout.flush
170173
rescue StandardError => e
171174
error_payload = {
172175
error: e.message,
173-
_extraction_time_ms: 0
176+
_extraction_time_ms: 0,
177+
_ocr_used: ocr_enabled
174178
}
175179
puts JSON.generate(error_payload)
176180
$stdout.flush

tools/benchmark-harness/scripts/kreuzberg_extract.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ interface ExtractionOutput {
1717
metadata: Record<string, unknown>;
1818
_extraction_time_ms: number;
1919
_batch_total_ms?: number;
20+
_ocr_used: boolean;
2021
}
2122

2223
function createConfig(ocrEnabled: boolean): ExtractionConfig {
@@ -36,6 +37,7 @@ async function extractAsync(filePath: string, ocrEnabled: boolean): Promise<Extr
3637
content: result.content,
3738
metadata: result.metadata || {},
3839
_extraction_time_ms: durationMs,
40+
_ocr_used: ocrEnabled,
3941
};
4042
}
4143

@@ -52,6 +54,7 @@ async function extractBatch(filePaths: string[], ocrEnabled: boolean): Promise<E
5254
metadata: result.metadata || {},
5355
_extraction_time_ms: perFileDurationMs,
5456
_batch_total_ms: totalDurationMs,
57+
_ocr_used: ocrEnabled,
5558
}));
5659
}
5760

@@ -89,7 +92,7 @@ async function runServer(ocrEnabled: boolean): Promise<void> {
8992
} catch (err) {
9093
const durationMs = performance.now() - start;
9194
const error = err as Error;
92-
console.log(JSON.stringify({ error: error.message, _extraction_time_ms: durationMs }));
95+
console.log(JSON.stringify({ error: error.message, _extraction_time_ms: durationMs, _ocr_used: ocrEnabled }));
9396
}
9497
}
9598
}

tools/benchmark-harness/scripts/kreuzberg_extract_go.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ type payload struct {
2424
Metadata map[string]any `json:"metadata"`
2525
ExtractionTimeMs float64 `json:"_extraction_time_ms"`
2626
BatchTotalTimeMs float64 `json:"_batch_total_ms,omitempty"`
27+
OcrUsed bool `json:"_ocr_used"`
2728
}
2829

2930
func main() {
@@ -126,30 +127,31 @@ func runServer(ocrEnabled bool) {
126127
absPath, err := filepath.Abs(filePath)
127128
if err != nil {
128129
debug("Failed to resolve path %s: %v", filePath, err)
129-
mustEncodeError(err)
130+
mustEncodeError(err, ocrEnabled)
130131
continue
131132
}
132133

133134
start := time.Now()
134135
result, err := kz.ExtractFileSync(absPath, config)
135136
if err != nil {
136137
debug("Extraction failed for %s: %v", absPath, err)
137-
mustEncodeError(err)
138+
mustEncodeError(err, ocrEnabled)
138139
continue
139140
}
140141

141142
elapsed := time.Since(start).Seconds() * 1000.0
142143
meta, err := metadataMap(result.Metadata)
143144
if err != nil {
144145
debug("metadataMap failed: %v", err)
145-
mustEncodeError(err)
146+
mustEncodeError(err, ocrEnabled)
146147
continue
147148
}
148149

149150
p := &payload{
150151
Content: result.Content,
151152
Metadata: meta,
152153
ExtractionTimeMs: elapsed,
154+
OcrUsed: ocrEnabled,
153155
}
154156
mustEncodeNoNewline(p)
155157
fmt.Println()
@@ -189,6 +191,7 @@ func extractSync(path string, ocrEnabled bool) (*payload, error) {
189191
Content: result.Content,
190192
Metadata: meta,
191193
ExtractionTimeMs: elapsed,
194+
OcrUsed: ocrEnabled,
192195
}, nil
193196
}
194197

@@ -225,6 +228,7 @@ func extractBatch(paths []string, ocrEnabled bool) (any, error) {
225228
Metadata: meta,
226229
ExtractionTimeMs: totalMs,
227230
BatchTotalTimeMs: totalMs,
231+
OcrUsed: ocrEnabled,
228232
}, nil
229233
}
230234

@@ -243,6 +247,7 @@ func extractBatch(paths []string, ocrEnabled bool) (any, error) {
243247
Metadata: meta,
244248
ExtractionTimeMs: perMs,
245249
BatchTotalTimeMs: totalMs,
250+
OcrUsed: ocrEnabled,
246251
})
247252
}
248253
return out, nil
@@ -289,10 +294,11 @@ func mustEncodeNoNewline(value any) {
289294
}
290295
}
291296

292-
func mustEncodeError(err error) {
297+
func mustEncodeError(err error, ocrEnabled bool) {
293298
errorMap := map[string]interface{}{
294-
"error": err.Error(),
299+
"error": err.Error(),
295300
"_extraction_time_ms": 0,
301+
"_ocr_used": ocrEnabled,
296302
}
297303
data, marshalErr := json.Marshal(errorMap)
298304
if marshalErr != nil {

0 commit comments

Comments
 (0)