Skip to content

Commit dcf7c01

Browse files
committed
Improve test assertions for Camel docling ocr
Note that the footer is not found. Signed-off-by: Aurélien Pupier <apupier@ibm.com>
1 parent 350486b commit dcf7c01

File tree

1 file changed

+30
-4
lines changed
  • components/camel-ai/camel-docling/src/test/java/org/apache/camel/component/docling/integration

1 file changed

+30
-4
lines changed

components/camel-ai/camel-docling/src/test/java/org/apache/camel/component/docling/integration/OcrExtractionIT.java

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
* This test demonstrates how to use Docling's OCR capabilities to extract text from images containing text content.
4949
*/
5050
@DisabledIfSystemProperty(named = "ci.env.name", matches = ".*", disabledReason = "Too much resources on GitHub Actions")
51-
public class OcrExtractionIT extends CamelTestSupport {
51+
class OcrExtractionIT extends CamelTestSupport {
5252

5353
private static final Logger LOG = LoggerFactory.getLogger(OcrExtractionIT.class);
5454

@@ -76,7 +76,7 @@ protected CamelContext createCamelContext() throws Exception {
7676
}
7777

7878
@Test
79-
public void testOcrTextExtractionFromImage() throws Exception {
79+
void testOcrTextExtractionFromImage() throws Exception {
8080
Path testImage = createTestImageWithText();
8181

8282
LOG.info("Created test image at: {}", testImage);
@@ -88,6 +88,12 @@ public void testOcrTextExtractionFromImage() throws Exception {
8888

8989
LOG.info("OCR extraction result:\n{}", result);
9090

91+
checkExtractedText(result);
92+
93+
LOG.info("Successfully extracted text from image using OCR");
94+
}
95+
96+
private void checkExtractedText(String result) {
9197
// Verify that at least some of the expected text was extracted
9298
// Note: OCR may not be 100% accurate, so we check for partial matches
9399
String resultLower = result.toLowerCase();
@@ -97,8 +103,6 @@ public void testOcrTextExtractionFromImage() throws Exception {
97103

98104
assertTrue(foundHello || foundApache || foundOcr,
99105
"OCR should extract at least some of the expected text. Got: " + result);
100-
101-
LOG.info("Successfully extracted text from image using OCR");
102106
}
103107

104108
@Test
@@ -110,6 +114,8 @@ public void testOcrMarkdownConversionFromImage() throws Exception {
110114
assertNotNull(result, "Markdown result should not be null");
111115
assertTrue(result.length() > 0, "Markdown result should not be empty");
112116

117+
checkExtractedText(result);
118+
113119
LOG.info("OCR Markdown conversion result:\n{}", result);
114120
LOG.info("Successfully converted image to Markdown using OCR");
115121
}
@@ -124,6 +130,8 @@ public void testOcrJsonConversionFromImage() throws Exception {
124130
assertTrue(result.length() > 0, "JSON result should not be empty");
125131
assertTrue(result.contains("{") || result.contains("["), "Result should be valid JSON");
126132

133+
checkExtractedText(result);
134+
127135
LOG.info("OCR JSON conversion result:\n{}", result);
128136
LOG.info("Successfully converted image to JSON using OCR");
129137
}
@@ -137,6 +145,8 @@ public void testOcrWithAsyncMode() throws Exception {
137145
assertNotNull(result, "Async OCR result should not be null");
138146
assertTrue(result.length() > 0, "Async OCR result should not be empty");
139147

148+
checkExtractedText(result);
149+
140150
LOG.info("Async OCR extraction result:\n{}", result);
141151
LOG.info("Successfully extracted text from image using async OCR");
142152
}
@@ -150,6 +160,8 @@ public void testOcrFromPngImage() throws Exception {
150160
assertNotNull(result, "OCR result from PNG should not be null");
151161
assertTrue(result.length() > 0, "OCR result from PNG should not be empty");
152162

163+
checkExtractedText(result);
164+
153165
LOG.info("OCR extraction from PNG result:\n{}", result);
154166
LOG.info("Successfully extracted text from PNG image using OCR");
155167
}
@@ -163,6 +175,20 @@ public void testOcrWithMultipleTextBlocks() throws Exception {
163175
assertNotNull(result, "OCR result should not be null");
164176
assertTrue(result.length() > 0, "OCR result should not be empty");
165177

178+
// Verify that both expected text blocks ("first" and "second") were extracted
179+
// Note: OCR may not be 100% accurate, so we check for partial matches
180+
String resultLower = result.toLowerCase();
181+
boolean foundFirst = resultLower.contains("first");
182+
boolean foundSecond = resultLower.contains("second");
183+
184+
assertTrue(foundFirst && foundSecond,
185+
"OCR should extract at least some of the expected text. Got: " + result);
186+
187+
// TODO: the footer text is not detected by the OCR when run through Camel Docling
188+
// boolean foundFooter = resultLower.contains("footer");
189+
// assertTrue(foundFooter,
190+
// "OCR should extract at least some of the expected text from the footer. Got: " + result);
191+
166192
LOG.info("OCR extraction with multiple text blocks result:\n{}", result);
167193
LOG.info("Successfully extracted text from image with multiple text blocks");
168194
}

0 commit comments

Comments
 (0)