4848 * This test demonstrates how to use Docling's OCR capabilities to extract text from images containing text content.
4949 */
5050@ DisabledIfSystemProperty (named = "ci.env.name" , matches = ".*" , disabledReason = "Too much resources on GitHub Actions" )
51- public class OcrExtractionIT extends CamelTestSupport {
51+ class OcrExtractionIT extends CamelTestSupport {
5252
5353 private static final Logger LOG = LoggerFactory .getLogger (OcrExtractionIT .class );
5454
@@ -76,7 +76,7 @@ protected CamelContext createCamelContext() throws Exception {
7676 }
7777
7878 @ Test
79- public void testOcrTextExtractionFromImage () throws Exception {
79+ void testOcrTextExtractionFromImage () throws Exception {
8080 Path testImage = createTestImageWithText ();
8181
8282 LOG .info ("Created test image at: {}" , testImage );
@@ -88,6 +88,12 @@ public void testOcrTextExtractionFromImage() throws Exception {
8888
8989 LOG .info ("OCR extraction result:\n {}" , result );
9090
91+ checkExtractedText (result );
92+
93+ LOG .info ("Successfully extracted text from image using OCR" );
94+ }
95+
96+ private void checkExtractedText (String result ) {
9197 // Verify that at least some of the expected text was extracted
9298 // Note: OCR may not be 100% accurate, so we check for partial matches
9399 String resultLower = result .toLowerCase ();
@@ -97,8 +103,6 @@ public void testOcrTextExtractionFromImage() throws Exception {
97103
98104 assertTrue (foundHello || foundApache || foundOcr ,
99105 "OCR should extract at least some of the expected text. Got: " + result );
100-
101- LOG .info ("Successfully extracted text from image using OCR" );
102106 }
103107
104108 @ Test
@@ -110,6 +114,8 @@ public void testOcrMarkdownConversionFromImage() throws Exception {
110114 assertNotNull (result , "Markdown result should not be null" );
111115 assertTrue (result .length () > 0 , "Markdown result should not be empty" );
112116
117+ checkExtractedText (result );
118+
113119 LOG .info ("OCR Markdown conversion result:\n {}" , result );
114120 LOG .info ("Successfully converted image to Markdown using OCR" );
115121 }
@@ -124,6 +130,8 @@ public void testOcrJsonConversionFromImage() throws Exception {
124130 assertTrue (result .length () > 0 , "JSON result should not be empty" );
125131 assertTrue (result .contains ("{" ) || result .contains ("[" ), "Result should be valid JSON" );
126132
133+ checkExtractedText (result );
134+
127135 LOG .info ("OCR JSON conversion result:\n {}" , result );
128136 LOG .info ("Successfully converted image to JSON using OCR" );
129137 }
@@ -137,6 +145,8 @@ public void testOcrWithAsyncMode() throws Exception {
137145 assertNotNull (result , "Async OCR result should not be null" );
138146 assertTrue (result .length () > 0 , "Async OCR result should not be empty" );
139147
148+ checkExtractedText (result );
149+
140150 LOG .info ("Async OCR extraction result:\n {}" , result );
141151 LOG .info ("Successfully extracted text from image using async OCR" );
142152 }
@@ -150,6 +160,8 @@ public void testOcrFromPngImage() throws Exception {
150160 assertNotNull (result , "OCR result from PNG should not be null" );
151161 assertTrue (result .length () > 0 , "OCR result from PNG should not be empty" );
152162
163+ checkExtractedText (result );
164+
153165 LOG .info ("OCR extraction from PNG result:\n {}" , result );
154166 LOG .info ("Successfully extracted text from PNG image using OCR" );
155167 }
@@ -163,6 +175,20 @@ public void testOcrWithMultipleTextBlocks() throws Exception {
163175 assertNotNull (result , "OCR result should not be null" );
164176 assertTrue (result .length () > 0 , "OCR result should not be empty" );
165177
178+ // Verify that both the "first" and "second" text blocks were extracted
179+ // Note: OCR may not be 100% accurate, so we check for partial matches
180+ String resultLower = result .toLowerCase ();
181+ boolean foundFirst = resultLower .contains ("first" );
182+ boolean foundSecond = resultLower .contains ("second" );
183+
184+ assertTrue (foundFirst && foundSecond ,
185+ "OCR should extract at least some of the expected text. Got: " + result );
186+
187+ // TODO: the footer text is not detected by Camel Docling's OCR
188+ // boolean foundFooter = resultLower.contains("footer");
189+ // assertTrue(foundFooter,
190+ // "OCR should extract at least some of the expected text from the footer. Got: " + result);
191+
166192 LOG .info ("OCR extraction with multiple text blocks result:\n {}" , result );
167193 LOG .info ("Successfully extracted text from image with multiple text blocks" );
168194 }
0 commit comments