Skip to content

Commit cf76c6c

Browse files
committed
fix: CI failures across Elixir, C#, Go, and musl builds
- Add PaddleOCR skip handling to C# and Go E2E generators - Add missing AssertDocument, AssertOcrElements, SkipIfPaddleOcrUnavailable helpers to C# and Go test templates - Fix Elixir doctest missing include_document_structure field - Fix musl build __GNUC_PREREQ macro in tesseract wrapper - Fix paddle_pdf_scanned.json path typo (pdfs/ -> pdf/) - Revert getrandom to 0.3.4 for WASM compatibility
1 parent 442fcba commit cf76c6c

File tree

12 files changed

+339
-261
lines changed

12 files changed

+339
-261
lines changed

Cargo.lock

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ toml = "0.9.11"
6969
tempfile = "3.24.0"
7070
criterion = { version = "0.8", features = ["html_reports"] }
7171
lzma-rust2 = { version = "0.15.7" }
72-
getrandom = { version = "0.4.1", features = ["wasm_js"] }
72+
getrandom = { version = "0.3.4", features = ["wasm_js"] }
7373

7474
[profile.release]
7575
lto = "thin"

crates/kreuzberg-tesseract/build.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,9 +166,11 @@ mod build_tesseract {
166166
# Prepends musl C headers so they shadow glibc's.\n\
167167
# Defines glibc compat macros as 0 for musl -- handles os_defines.h,\n\
168168
# libc-header-start.h, floatn.h etc. that use __GLIBC_PREREQ().\n\
169+
# Also defines __GNUC_PREREQ for floatn.h which checks compiler version.\n\
169170
exec g++ -isystem \"{musl_include}\" \\\n\
170171
'-D__GLIBC_PREREQ(maj,min)=0' \\\n\
171172
'-D__GLIBC_USE(F)=0' \\\n\
173+
'-D__GNUC_PREREQ(maj,min)=0' \\\n\
172174
\"$@\"\n"
173175
);
174176

crates/kreuzberg-wasm/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ serde-wasm-bindgen = "0.6"
2323
js-sys = "0.3"
2424
web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console"] }
2525
# Use getrandom 0.3 with wasm_js feature for WASM compatibility
26-
getrandom = { version = "0.4", features = ["wasm_js"] }
26+
getrandom = { version = "0.3", features = ["wasm_js"] }
2727
uuid = { version = "1.20", features = ["js"] }
2828
serde = { workspace = true }
2929
serde_json = { workspace = true }

e2e/csharp/Helpers.cs

Lines changed: 55 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ public static void SkipIfLegacyOfficeDisabled(string relativePath)
104104

105105
public static void SkipIfOfficeTestOnWindows(string relativePath)
106106
{
107-
// Office tests timeout on Windows due to LibreOffice conversion delays
107+
// Office tests timeout on Windows
108108
if (OperatingSystem.IsWindows())
109109
{
110110
var ext = Path.GetExtension(relativePath).ToLowerInvariant();
@@ -116,6 +116,15 @@ public static void SkipIfOfficeTestOnWindows(string relativePath)
116116
}
117117
}
118118

119+
public static void SkipIfPaddleOcrUnavailable()
120+
{
121+
var flag = Environment.GetEnvironmentVariable("KREUZBERG_PADDLE_OCR_AVAILABLE");
122+
if (string.IsNullOrWhiteSpace(flag) || flag == "0" || flag.Equals("false", StringComparison.OrdinalIgnoreCase))
123+
{
124+
throw new Xunit.SkipException();
125+
}
126+
}
127+
119128
public static ExtractionConfig? BuildConfig(string? configJson)
120129
{
121130
if (string.IsNullOrWhiteSpace(configJson))
@@ -562,145 +571,95 @@ public static void AssertElements(
562571
public static void AssertOcrElements(
563572
ExtractionResult result,
564573
bool? hasElements,
565-
bool? elementsHaveGeometry,
566-
bool? elementsHaveConfidence,
574+
bool? hasGeometry,
575+
bool? hasConfidence,
567576
int? minCount)
568577
{
569578
var ocrElements = result.OcrElements;
570579
if (hasElements == true)
571580
{
572-
if (ocrElements is null)
573-
{
574-
throw new XunitException("Expected ocr_elements but got null");
575-
}
576-
if (ocrElements.Count == 0)
581+
if (ocrElements is null || ocrElements.Count == 0)
577582
{
578-
throw new XunitException("Expected ocr_elements to be non-empty");
583+
throw new XunitException("Expected OCR elements but none found");
579584
}
580585
}
581-
if (ocrElements is not null && ocrElements.Count > 0)
586+
if (ocrElements is not null)
582587
{
583-
if (minCount.HasValue && ocrElements.Count < minCount.Value)
584-
{
585-
throw new XunitException($"Expected at least {minCount.Value} ocr_elements, found {ocrElements.Count}");
586-
}
587-
if (elementsHaveGeometry == true)
588+
if (hasGeometry == true)
588589
{
589590
for (var i = 0; i < ocrElements.Count; i++)
590591
{
591-
var el = ocrElements[i];
592-
if (el.Geometry is null)
592+
if (ocrElements[i].Geometry is null)
593593
{
594-
throw new XunitException($"OCR element {i} has no geometry");
595-
}
596-
var geomType = el.Geometry.Type.ToString().ToLowerInvariant();
597-
if (geomType != "rectangle" && geomType != "quadrilateral")
598-
{
599-
throw new XunitException($"OCR element {i} has invalid geometry type: {geomType}");
594+
throw new XunitException($"OCR element {i} expected to have geometry");
600595
}
601596
}
602597
}
603-
if (elementsHaveConfidence == true)
598+
if (hasConfidence == true)
604599
{
605600
for (var i = 0; i < ocrElements.Count; i++)
606601
{
607-
var el = ocrElements[i];
608-
if (el.Confidence is null)
602+
if (ocrElements[i].Confidence is null)
609603
{
610-
throw new XunitException($"OCR element {i} has no confidence");
611-
}
612-
if (el.Confidence.Recognition <= 0)
613-
{
614-
throw new XunitException($"OCR element {i} has invalid confidence recognition: {el.Confidence.Recognition}");
604+
throw new XunitException($"OCR element {i} expected to have confidence score");
615605
}
616606
}
617607
}
608+
if (minCount.HasValue && ocrElements.Count < minCount.Value)
609+
{
610+
throw new XunitException($"Expected at least {minCount.Value} OCR elements, found {ocrElements.Count}");
611+
}
618612
}
619613
}
620614

621615
public static void AssertDocument(
622616
ExtractionResult result,
623-
bool hasDocument = false,
624-
int? minNodeCount = null,
625-
IEnumerable<string>? nodeTypesInclude = null,
626-
bool? hasGroups = null)
617+
bool hasDocument,
618+
int? minNodeCount,
619+
IEnumerable<string>? nodeTypesInclude,
620+
bool? hasGroups)
627621
{
628622
var document = result.Document;
629-
if (!hasDocument)
623+
if (hasDocument)
630624
{
631-
if (document is not null)
625+
if (document is null)
632626
{
633-
throw new XunitException($"Expected document to be null but got {document.GetType()}");
627+
throw new XunitException("Expected document but got null");
634628
}
635-
return;
636-
}
637-
if (document is null)
638-
{
639-
throw new XunitException("Expected document but got null");
640-
}
641-
642-
// Extract nodes from document structure
643-
List<DocumentNode>? nodes = null;
644-
if (document is DocumentStructure docStruct)
645-
{
646-
nodes = docStruct.Nodes;
647-
}
648-
else
649-
{
650-
throw new XunitException($"Expected DocumentStructure but got {document.GetType()}");
651-
}
652-
653-
if (nodes is null)
654-
{
655-
throw new XunitException("Expected document.nodes but got null");
656-
}
657-
658-
if (minNodeCount.HasValue && nodes.Count < minNodeCount.Value)
659-
{
660-
throw new XunitException($"Expected at least {minNodeCount.Value} nodes, found {nodes.Count}");
661-
}
662-
663-
if (nodeTypesInclude is not null)
664-
{
665-
var foundTypes = new HashSet<string>();
666-
foreach (var node in nodes)
629+
var nodes = document.Nodes;
630+
if (nodes is null)
667631
{
668-
string? nodeType = node.Content?.NodeType;
669-
if (!string.IsNullOrEmpty(nodeType))
670-
{
671-
foundTypes.Add(nodeType);
672-
}
632+
throw new XunitException("Expected document nodes but got null");
673633
}
674-
675-
foreach (var expectedType in nodeTypesInclude)
634+
if (minNodeCount.HasValue && nodes.Count < minNodeCount.Value)
676635
{
677-
if (!foundTypes.Contains(expectedType))
678-
{
679-
throw new XunitException($"Expected node type '{expectedType}' not found in [{string.Join(", ", foundTypes)}]");
680-
}
636+
throw new XunitException($"Expected at least {minNodeCount.Value} nodes, found {nodes.Count}");
681637
}
682-
}
683-
684-
if (hasGroups.HasValue)
685-
{
686-
bool hasGroupNodes = false;
687-
foreach (var node in nodes)
638+
if (nodeTypesInclude is not null)
688639
{
689-
string? nodeType = node.Content?.NodeType;
690-
if (nodeType == "group")
640+
var foundTypes = nodes.Select(n => n.Content?.NodeType ?? "").ToHashSet();
641+
foreach (var expected in nodeTypesInclude)
691642
{
692-
hasGroupNodes = true;
693-
break;
643+
if (!foundTypes.Any(t => string.Equals(t, expected, StringComparison.OrdinalIgnoreCase)))
644+
{
645+
throw new XunitException($"Expected node type '{expected}' not found in [{string.Join(", ", foundTypes)}]");
646+
}
694647
}
695648
}
696-
697-
if (hasGroups.Value && !hasGroupNodes)
649+
if (hasGroups.HasValue)
698650
{
699-
throw new XunitException("Expected document to have group nodes but found none");
651+
var hasGroupNodes = nodes.Any(n => string.Equals(n.Content?.NodeType, "group", StringComparison.OrdinalIgnoreCase));
652+
if (hasGroupNodes != hasGroups.Value)
653+
{
654+
throw new XunitException($"Expected hasGroups={hasGroups.Value} but got {hasGroupNodes}");
655+
}
700656
}
701-
if (!hasGroups.Value && hasGroupNodes)
657+
}
658+
else
659+
{
660+
if (document is not null)
702661
{
703-
throw new XunitException("Expected document to not have group nodes but found some");
662+
throw new XunitException($"Expected document to be null but got a document");
704663
}
705664
}
706665
}

e2e/csharp/OcrTests.cs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ public void OcrImageNoText()
4040
[SkippableFact]
4141
public void OcrPaddleConfidenceFilter()
4242
{
43+
TestHelpers.SkipIfPaddleOcrUnavailable();
4344
TestHelpers.SkipIfLegacyOfficeDisabled("images/ocr_image.jpg");
4445
TestHelpers.SkipIfOfficeTestOnWindows("images/ocr_image.jpg");
4546
var documentPath = TestHelpers.EnsureDocument("images/ocr_image.jpg", true);
@@ -53,6 +54,7 @@ public void OcrPaddleConfidenceFilter()
5354
[SkippableFact]
5455
public void OcrPaddleImageChinese()
5556
{
57+
TestHelpers.SkipIfPaddleOcrUnavailable();
5658
TestHelpers.SkipIfLegacyOfficeDisabled("images/chi_sim_image.jpeg");
5759
TestHelpers.SkipIfOfficeTestOnWindows("images/chi_sim_image.jpeg");
5860
var documentPath = TestHelpers.EnsureDocument("images/chi_sim_image.jpeg", true);
@@ -66,6 +68,7 @@ public void OcrPaddleImageChinese()
6668
[SkippableFact]
6769
public void OcrPaddleImageEnglish()
6870
{
71+
TestHelpers.SkipIfPaddleOcrUnavailable();
6972
TestHelpers.SkipIfLegacyOfficeDisabled("images/test_hello_world.png");
7073
TestHelpers.SkipIfOfficeTestOnWindows("images/test_hello_world.png");
7174
var documentPath = TestHelpers.EnsureDocument("images/test_hello_world.png", true);
@@ -80,6 +83,7 @@ public void OcrPaddleImageEnglish()
8083
[SkippableFact]
8184
public void OcrPaddleMarkdown()
8285
{
86+
TestHelpers.SkipIfPaddleOcrUnavailable();
8387
TestHelpers.SkipIfLegacyOfficeDisabled("images/test_hello_world.png");
8488
TestHelpers.SkipIfOfficeTestOnWindows("images/test_hello_world.png");
8589
var documentPath = TestHelpers.EnsureDocument("images/test_hello_world.png", true);
@@ -94,9 +98,10 @@ public void OcrPaddleMarkdown()
9498
[SkippableFact]
9599
public void OcrPaddlePdfScanned()
96100
{
97-
TestHelpers.SkipIfLegacyOfficeDisabled("pdfs/ocr_test.pdf");
98-
TestHelpers.SkipIfOfficeTestOnWindows("pdfs/ocr_test.pdf");
99-
var documentPath = TestHelpers.EnsureDocument("pdfs/ocr_test.pdf", true);
101+
TestHelpers.SkipIfPaddleOcrUnavailable();
102+
TestHelpers.SkipIfLegacyOfficeDisabled("pdf/ocr_test.pdf");
103+
TestHelpers.SkipIfOfficeTestOnWindows("pdf/ocr_test.pdf");
104+
var documentPath = TestHelpers.EnsureDocument("pdf/ocr_test.pdf", true);
100105
var config = TestHelpers.BuildConfig("{\"force_ocr\":true,\"ocr\":{\"backend\":\"paddle-ocr\",\"language\":\"en\"}}");
101106

102107
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
@@ -108,6 +113,7 @@ public void OcrPaddlePdfScanned()
108113
[SkippableFact]
109114
public void OcrPaddleStructured()
110115
{
116+
TestHelpers.SkipIfPaddleOcrUnavailable();
111117
TestHelpers.SkipIfLegacyOfficeDisabled("images/test_hello_world.png");
112118
TestHelpers.SkipIfOfficeTestOnWindows("images/test_hello_world.png");
113119
var documentPath = TestHelpers.EnsureDocument("images/test_hello_world.png", true);
@@ -122,6 +128,7 @@ public void OcrPaddleStructured()
122128
[SkippableFact]
123129
public void OcrPaddleTableDetection()
124130
{
131+
TestHelpers.SkipIfPaddleOcrUnavailable();
125132
TestHelpers.SkipIfLegacyOfficeDisabled("images/simple_table.png");
126133
TestHelpers.SkipIfOfficeTestOnWindows("images/simple_table.png");
127134
var documentPath = TestHelpers.EnsureDocument("images/simple_table.png", true);

0 commit comments

Comments
 (0)