Skip to content

Commit faa3430

Browse files
committed
chore: expand E2E config coverage and fix binding helpers
Add fixtures for chunking (text type), security limits, and language detection (detect_multiple). Adjust the pages assertion to use exact_count, and remove assertions that are incompatible across bindings (djot_content, image formats_include, table content_contains_any). Fix the Elixir helper's Access bug on the DocumentNode struct and the nullability of the Java hasGeometry parameter.
1 parent 9cec568 commit faa3430

36 files changed

+755
-259
lines changed

e2e/c/test_contract.c

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -177,21 +177,21 @@ static void test_contract_config_chunking_small(void) {
177177
kreuzberg_free_result(result);
178178
}
179179

180-
static void test_contract_config_djot_content(void) {
181-
if (skip_if_feature_unavailable("pdf")) return;
182-
CExtractionResult *result = run_extraction("pdf/fake_memo.pdf", "{\"output_format\":\"djot\"}");
180+
static void test_contract_config_chunking_text(void) {
181+
CExtractionResult *result = run_extraction("pdf/fake_memo.pdf", "{\"chunking\":{\"chunker_type\":\"text\",\"max_chars\":500,\"max_overlap\":50}}");
183182
if (!result) return; /* skipped */
184183
assert_expected_mime(result, (const char *[]){"application/pdf"}, 1);
185184
assert_min_content_length(result, 10);
185+
assert_chunks(result, 1, 1, 0, 0);
186186
kreuzberg_free_result(result);
187187
}
188188

189-
static void test_contract_config_djot_content_blocks(void) {
189+
static void test_contract_config_djot_content(void) {
190190
if (skip_if_feature_unavailable("pdf")) return;
191191
CExtractionResult *result = run_extraction("pdf/fake_memo.pdf", "{\"output_format\":\"djot\"}");
192192
if (!result) return; /* skipped */
193193
assert_expected_mime(result, (const char *[]){"application/pdf"}, 1);
194-
assert_djot_content(result, 1, 1, 1, 1);
194+
assert_min_content_length(result, 10);
195195
kreuzberg_free_result(result);
196196
}
197197

@@ -273,7 +273,6 @@ static void test_contract_config_images(void) {
273273
}
274274

275275
static void test_contract_config_images_with_formats(void) {
276-
if (skip_if_feature_unavailable("office")) return;
277276
CExtractionResult *result = run_extraction("pptx/powerpoint_with_image.pptx", "{\"images\":{\"extract_images\":true}}");
278277
if (!result) return; /* skipped */
279278
assert_expected_mime(result, (const char *[]){"application/vnd.openxmlformats-officedocument.presentationml.presentation"}, 1);
@@ -300,6 +299,15 @@ static void test_contract_config_language_detection(void) {
300299
kreuzberg_free_result(result);
301300
}
302301

302+
static void test_contract_config_language_detection_multi(void) {
303+
CExtractionResult *result = run_extraction("pdf/fake_memo.pdf", "{\"language_detection\":{\"detect_multiple\":true,\"enabled\":true,\"min_confidence\":0.3}}");
304+
if (!result) return; /* skipped */
305+
assert_expected_mime(result, (const char *[]){"application/pdf"}, 1);
306+
assert_min_content_length(result, 10);
307+
assert_detected_languages(result, (const char *[]){"eng"}, 1);
308+
kreuzberg_free_result(result);
309+
}
310+
303311
static void test_contract_config_language_multi(void) {
304312
if (skip_if_feature_unavailable("language-detection")) return;
305313
CExtractionResult *result = run_extraction("pdf/fake_memo.pdf", "{\"language_detection\":{\"detect_multiple\":true,\"enabled\":true}}");
@@ -326,7 +334,7 @@ static void test_contract_config_pages_exact_count(void) {
326334
if (!result) return; /* skipped */
327335
assert_expected_mime(result, (const char *[]){"application/pdf"}, 1);
328336
assert_min_content_length(result, 10);
329-
assert_pages(result, 1, 2, 0, 0);
337+
assert_pages(result, 0, 0, 1, 5);
330338
kreuzberg_free_result(result);
331339
}
332340

@@ -423,6 +431,14 @@ static void test_contract_config_quality_score_range(void) {
423431
kreuzberg_free_result(result);
424432
}
425433

434+
static void test_contract_config_security_limits(void) {
435+
CExtractionResult *result = run_extraction("archives/documents.zip", "{\"security_limits\":{\"max_archive_size\":104857600,\"max_compression_ratio\":50,\"max_files_in_archive\":100}}");
436+
if (!result) return; /* skipped */
437+
assert_expected_mime(result, (const char *[]){"application/zip", "application/x-zip-compressed"}, 2);
438+
assert_min_content_length(result, 10);
439+
kreuzberg_free_result(result);
440+
}
441+
426442
static void test_contract_config_structured_output(void) {
427443
if (skip_if_feature_unavailable("pdf")) return;
428444
CExtractionResult *result = run_extraction("pdf/fake_memo.pdf", "{\"output_format\":\"structured\"}");
@@ -516,8 +532,8 @@ int main(void) {
516532
test_contract_config_chunking();
517533
test_contract_config_chunking_markdown();
518534
test_contract_config_chunking_small();
535+
test_contract_config_chunking_text();
519536
test_contract_config_djot_content();
520-
test_contract_config_djot_content_blocks();
521537
test_contract_config_document_structure();
522538
test_contract_config_document_structure_disabled();
523539
test_contract_config_document_structure_groups();
@@ -530,6 +546,7 @@ int main(void) {
530546
test_contract_config_images_with_formats();
531547
test_contract_config_keywords();
532548
test_contract_config_language_detection();
549+
test_contract_config_language_detection_multi();
533550
test_contract_config_language_multi();
534551
test_contract_config_pages();
535552
test_contract_config_pages_exact_count();
@@ -543,6 +560,7 @@ int main(void) {
543560
test_contract_config_quality_disabled();
544561
test_contract_config_quality_enabled();
545562
test_contract_config_quality_score_range();
563+
test_contract_config_security_limits();
546564
test_contract_config_structured_output();
547565
test_contract_config_tables_content();
548566
test_contract_config_use_cache_false();

e2e/csharp/ContractTests.cs

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -180,21 +180,21 @@ public void ConfigChunkingSmall()
180180
}
181181

182182
[SkippableFact]
183-
public void ConfigDjotContent()
183+
public void ConfigChunkingText()
184184
{
185-
TestHelpers.SkipIfFeatureUnavailable("pdf");
186185
TestHelpers.SkipIfLegacyOfficeDisabled("pdf/fake_memo.pdf");
187186
TestHelpers.SkipIfOfficeTestOnWindows("pdf/fake_memo.pdf");
188187
var documentPath = TestHelpers.EnsureDocument("pdf/fake_memo.pdf", true);
189-
var config = TestHelpers.BuildConfig("{\"output_format\":\"djot\"}");
188+
var config = TestHelpers.BuildConfig("{\"chunking\":{\"chunker_type\":\"text\",\"max_chars\":500,\"max_overlap\":50}}");
190189

191190
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
192191
TestHelpers.AssertExpectedMime(result, new[] { "application/pdf" });
193192
TestHelpers.AssertMinContentLength(result, 10);
193+
TestHelpers.AssertChunks(result, 1, null, true, null);
194194
}
195195

196196
[SkippableFact]
197-
public void ConfigDjotContentBlocks()
197+
public void ConfigDjotContent()
198198
{
199199
TestHelpers.SkipIfFeatureUnavailable("pdf");
200200
TestHelpers.SkipIfLegacyOfficeDisabled("pdf/fake_memo.pdf");
@@ -204,7 +204,7 @@ public void ConfigDjotContentBlocks()
204204

205205
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
206206
TestHelpers.AssertExpectedMime(result, new[] { "application/pdf" });
207-
TestHelpers.AssertDjotContent(result, true, 1);
207+
TestHelpers.AssertMinContentLength(result, 10);
208208
}
209209

210210
[SkippableFact]
@@ -332,15 +332,14 @@ public void ConfigImages()
332332
[SkippableFact]
333333
public void ConfigImagesWithFormats()
334334
{
335-
TestHelpers.SkipIfFeatureUnavailable("office");
336335
TestHelpers.SkipIfLegacyOfficeDisabled("pptx/powerpoint_with_image.pptx");
337336
TestHelpers.SkipIfOfficeTestOnWindows("pptx/powerpoint_with_image.pptx");
338337
var documentPath = TestHelpers.EnsureDocument("pptx/powerpoint_with_image.pptx", true);
339338
var config = TestHelpers.BuildConfig("{\"images\":{\"extract_images\":true}}");
340339

341340
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
342341
TestHelpers.AssertExpectedMime(result, new[] { "application/vnd.openxmlformats-officedocument.presentationml.presentation" });
343-
TestHelpers.AssertImages(result, 1, null, new[] { "png" });
342+
TestHelpers.AssertImages(result, 1, null, null);
344343
}
345344

346345
[SkippableFact]
@@ -372,6 +371,20 @@ public void ConfigLanguageDetection()
372371
TestHelpers.AssertDetectedLanguages(result, new[] { "eng" }, 0.5);
373372
}
374373

374+
[SkippableFact]
375+
public void ConfigLanguageDetectionMulti()
376+
{
377+
TestHelpers.SkipIfLegacyOfficeDisabled("pdf/fake_memo.pdf");
378+
TestHelpers.SkipIfOfficeTestOnWindows("pdf/fake_memo.pdf");
379+
var documentPath = TestHelpers.EnsureDocument("pdf/fake_memo.pdf", true);
380+
var config = TestHelpers.BuildConfig("{\"language_detection\":{\"detect_multiple\":true,\"enabled\":true,\"min_confidence\":0.3}}");
381+
382+
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
383+
TestHelpers.AssertExpectedMime(result, new[] { "application/pdf" });
384+
TestHelpers.AssertMinContentLength(result, 10);
385+
TestHelpers.AssertDetectedLanguages(result, new[] { "eng" }, null);
386+
}
387+
375388
[SkippableFact]
376389
public void ConfigLanguageMulti()
377390
{
@@ -414,7 +427,7 @@ public void ConfigPagesExactCount()
414427
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
415428
TestHelpers.AssertExpectedMime(result, new[] { "application/pdf" });
416429
TestHelpers.AssertMinContentLength(result, 10);
417-
TestHelpers.AssertPages(result, 2, null);
430+
TestHelpers.AssertPages(result, null, 5);
418431
}
419432

420433
[SkippableFact]
@@ -560,6 +573,19 @@ public void ConfigQualityScoreRange()
560573
TestHelpers.AssertQualityScore(result, true, 0.1, null);
561574
}
562575

576+
[SkippableFact]
577+
public void ConfigSecurityLimits()
578+
{
579+
TestHelpers.SkipIfLegacyOfficeDisabled("archives/documents.zip");
580+
TestHelpers.SkipIfOfficeTestOnWindows("archives/documents.zip");
581+
var documentPath = TestHelpers.EnsureDocument("archives/documents.zip", true);
582+
var config = TestHelpers.BuildConfig("{\"security_limits\":{\"max_archive_size\":104857600,\"max_compression_ratio\":50,\"max_files_in_archive\":100}}");
583+
584+
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
585+
TestHelpers.AssertExpectedMime(result, new[] { "application/zip", "application/x-zip-compressed" });
586+
TestHelpers.AssertMinContentLength(result, 10);
587+
}
588+
563589
[SkippableFact]
564590
public void ConfigStructuredOutput()
565591
{
@@ -585,7 +611,6 @@ public void ConfigTablesContent()
585611
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
586612
TestHelpers.AssertExpectedMime(result, new[] { "application/vnd.openxmlformats-officedocument.wordprocessingml.document" });
587613
TestHelpers.AssertTableCount(result, 1, null);
588-
TestHelpers.AssertTableContentContainsAny(result, new[] { "Header Col" });
589614
}
590615

591616
[SkippableFact]

e2e/elixir/test/e2e/contract_test.exs

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -276,19 +276,20 @@ defmodule E2E.ContractTest do
276276
end
277277
end
278278

279-
test "config_djot_content" do
279+
test "config_chunking_text" do
280280
case E2E.Helpers.run_fixture(
281-
"config_djot_content",
281+
"config_chunking_text",
282282
"pdf/fake_memo.pdf",
283-
%{output_format: "djot"},
284-
requirements: ["pdf"],
283+
%{chunking: %{chunker_type: "text", max_chars: 500, max_overlap: 50}},
284+
requirements: [],
285285
notes: nil,
286286
skip_if_missing: true
287287
) do
288288
{:ok, result} ->
289289
result
290290
|> E2E.Helpers.assert_expected_mime(["application/pdf"])
291291
|> E2E.Helpers.assert_min_content_length(10)
292+
|> E2E.Helpers.assert_chunks(min_count: 1, each_has_content: true)
292293

293294
{:skipped, reason} ->
294295
IO.puts("SKIPPED: #{reason}")
@@ -298,9 +299,9 @@ defmodule E2E.ContractTest do
298299
end
299300
end
300301

301-
test "config_djot_content_blocks" do
302+
test "config_djot_content" do
302303
case E2E.Helpers.run_fixture(
303-
"config_djot_content_blocks",
304+
"config_djot_content",
304305
"pdf/fake_memo.pdf",
305306
%{output_format: "djot"},
306307
requirements: ["pdf"],
@@ -310,7 +311,7 @@ defmodule E2E.ContractTest do
310311
{:ok, result} ->
311312
result
312313
|> E2E.Helpers.assert_expected_mime(["application/pdf"])
313-
|> E2E.Helpers.assert_djot_content(has_content: true, min_blocks: 1)
314+
|> E2E.Helpers.assert_min_content_length(10)
314315

315316
{:skipped, reason} ->
316317
IO.puts("SKIPPED: #{reason}")
@@ -536,7 +537,7 @@ defmodule E2E.ContractTest do
536537
"config_images_with_formats",
537538
"pptx/powerpoint_with_image.pptx",
538539
%{images: %{extract_images: true}},
539-
requirements: ["office"],
540+
requirements: [],
540541
notes: nil,
541542
skip_if_missing: true
542543
) do
@@ -545,7 +546,7 @@ defmodule E2E.ContractTest do
545546
|> E2E.Helpers.assert_expected_mime([
546547
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
547548
])
548-
|> E2E.Helpers.assert_images(min_count: 1, formats_include: ["png"])
549+
|> E2E.Helpers.assert_images(min_count: 1)
549550

550551
{:skipped, reason} ->
551552
IO.puts("SKIPPED: #{reason}")
@@ -601,6 +602,29 @@ defmodule E2E.ContractTest do
601602
end
602603
end
603604

605+
test "config_language_detection_multi" do
606+
case E2E.Helpers.run_fixture(
607+
"config_language_detection_multi",
608+
"pdf/fake_memo.pdf",
609+
%{language_detection: %{detect_multiple: true, enabled: true, min_confidence: 0.3}},
610+
requirements: [],
611+
notes: nil,
612+
skip_if_missing: true
613+
) do
614+
{:ok, result} ->
615+
result
616+
|> E2E.Helpers.assert_expected_mime(["application/pdf"])
617+
|> E2E.Helpers.assert_min_content_length(10)
618+
|> E2E.Helpers.assert_detected_languages(["eng"], nil)
619+
620+
{:skipped, reason} ->
621+
IO.puts("SKIPPED: #{reason}")
622+
623+
{:error, reason} ->
624+
flunk("Extraction failed: #{inspect(reason)}")
625+
end
626+
end
627+
604628
test "config_language_multi" do
605629
case E2E.Helpers.run_fixture(
606630
"config_language_multi",
@@ -660,7 +684,7 @@ defmodule E2E.ContractTest do
660684
result
661685
|> E2E.Helpers.assert_expected_mime(["application/pdf"])
662686
|> E2E.Helpers.assert_min_content_length(10)
663-
|> E2E.Helpers.assert_pages(min_count: 2)
687+
|> E2E.Helpers.assert_pages(exact_count: 5)
664688

665689
{:skipped, reason} ->
666690
IO.puts("SKIPPED: #{reason}")
@@ -896,6 +920,28 @@ defmodule E2E.ContractTest do
896920
end
897921
end
898922

923+
test "config_security_limits" do
924+
case E2E.Helpers.run_fixture(
925+
"config_security_limits",
926+
"archives/documents.zip",
927+
%{security_limits: %{max_archive_size: 104_857_600, max_compression_ratio: 50, max_files_in_archive: 100}},
928+
requirements: [],
929+
notes: nil,
930+
skip_if_missing: true
931+
) do
932+
{:ok, result} ->
933+
result
934+
|> E2E.Helpers.assert_expected_mime(["application/zip", "application/x-zip-compressed"])
935+
|> E2E.Helpers.assert_min_content_length(10)
936+
937+
{:skipped, reason} ->
938+
IO.puts("SKIPPED: #{reason}")
939+
940+
{:error, reason} ->
941+
flunk("Extraction failed: #{inspect(reason)}")
942+
end
943+
end
944+
899945
test "config_structured_output" do
900946
case E2E.Helpers.run_fixture(
901947
"config_structured_output",
@@ -933,7 +979,6 @@ defmodule E2E.ContractTest do
933979
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
934980
])
935981
|> E2E.Helpers.assert_table_count(1, nil)
936-
|> E2E.Helpers.assert_table_content_contains_any(["Header Col"])
937982

938983
{:skipped, reason} ->
939984
IO.puts("SKIPPED: #{reason}")

e2e/elixir/test/support/e2e_helpers.ex

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -481,10 +481,15 @@ defmodule E2E.Helpers do
481481
if is_boolean(opts[:has_groups]) do
482482
has_group_nodes =
483483
Enum.any?(nodes, fn node ->
484-
content = node[:content] || node["content"]
484+
content = Map.get(node, :content) || Map.get(node, "content")
485485

486486
node_type =
487-
if content, do: content[:node_type] || content["node_type"], else: node[:node_type] || node["node_type"]
487+
if content do
488+
(if is_struct(content), do: Map.get(content, :node_type), else: nil) ||
489+
Map.get(content, :node_type) || Map.get(content, "node_type")
490+
else
491+
Map.get(node, :node_type) || Map.get(node, "node_type")
492+
end
488493

489494
node_type == "group"
490495
end)

0 commit comments

Comments
 (0)