maxi07 · Copilot · Aug 28, 2025 · Aug 28, 2025
diff --git a/ocr_service/main.py b/ocr_service/main.py
@@ -1,9 +1,10 @@
 from scansynclib.logging import logger
 from scansynclib.ProcessItem import ProcessItem, ProcessStatus, OCRStatus
 from scansynclib.sqlite_wrapper import update_scanneddata_database
-from scansynclib.helpers import connect_rabbitmq, forward_to_rabbitmq
+from scansynclib.helpers import connect_rabbitmq, forward_to_rabbitmq, extract_text
 import pickle
 import ocrmypdf
+import os
 from datetime import datetime
 import time
 import pika.exceptions
@@ -36,13 +37,26 @@ def start_processing(item: ProcessItem):
 
     try:
         result = ocrmypdf.ocr(item.local_file_path, item.ocr_file, output_type='pdfa', skip_text=True, rotate_pages=True, jpg_quality=80, png_quality=80, optimize=2, language=["eng", "deu"], tesseract_timeout=120)
+        logger.debug(f"OCR exited with code {result}")
+
         if result != 0:
             logger.error(f"OCR exited with code {result}")
             item.ocr_status = OCRStatus.FAILED
         else:
             logger.info(f"OCR processing completed: {item.filename}")
-        logger.debug(f"OCR exited with code {result}")
-        item.ocr_status = OCRStatus.COMPLETED
+
+            # Verify that the OCR file actually contains text
+            if os.path.exists(item.ocr_file):
+                extracted_text = extract_text(item.ocr_file).strip()
+                if extracted_text:
+                    logger.info(f"OCR verification successful: extracted {len(extracted_text)} characters from {item.filename}")
+                    item.ocr_status = OCRStatus.COMPLETED
+                else:
+                    logger.warning(f"OCR verification failed: no text found in OCR output file {item.ocr_file}")
+                    item.ocr_status = OCRStatus.FAILED
+            else:
+                logger.error(f"OCR output file not found: {item.ocr_file}")
+                item.ocr_status = OCRStatus.OUTPUT_ERROR
     except ocrmypdf.UnsupportedImageFormatError:
         logger.error(f"Unsupported image format: {item.local_file_path}")
         item.ocr_status = OCRStatus.UNSUPPORTED

diff --git a/tests/test_ocr_verification.py b/tests/test_ocr_verification.py
@@ -0,0 +1,62 @@
+import pytest
+import os
+import tempfile
+from unittest.mock import Mock, patch, mock_open
+from scansynclib.ProcessItem import ProcessItem, ItemType, OCRStatus
+
+
+class TestOCRTextVerification:
+    """Test OCR text verification functionality without importing the main OCR service."""
+
+    def test_extract_text_returns_empty_string_on_empty_pdf(self):
+        """Test that extract_text returns an empty string for a file holding only a PDF header."""
+        from scansynclib.helpers import extract_text
+
+        # Write a header-only file; delete=False keeps the path valid after the with-block closes it
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+            temp_file.write(b"%PDF-1.4\n")  # Minimal PDF header
+            temp_file_path = temp_file.name
+
+        try:
+            # extract_text should return empty string for malformed/empty PDF
+            result = extract_text(temp_file_path)
+            assert result == ""
+        finally:
+            os.unlink(temp_file_path)
+
+    def test_extract_text_returns_empty_string_on_nonexistent_file(self):
+        """Test that extract_text returns empty string for non-existent file."""
+        from scansynclib.helpers import extract_text
+
+        result = extract_text("/nonexistent/file.pdf")
+        assert result == ""
+
+    @patch('scansynclib.helpers.PdfReader')
+    def test_extract_text_strips_whitespace(self, mock_pdf_reader):
+        """Test that extract_text returns page text verbatim, without stripping surrounding whitespace."""
+        from scansynclib.helpers import extract_text
+
+        # Mock the PDF reader so its single page yields text padded with whitespace
+        mock_page = Mock()
+        mock_page.extract_text.return_value = "  \n\t  Some text  \n\t  "
+        mock_reader = Mock()
+        mock_reader.pages = [mock_page]
+        mock_pdf_reader.return_value = mock_reader
+
+        result = extract_text("dummy_path.pdf")
+        assert result == "  \n\t  Some text  \n\t  "  # Raw text is expected here; this test pins the unstripped value
+
+    def test_process_item_has_ocr_file_attribute(self):
+        """Test that a new ProcessItem has an ocr_file path ending in '_OCR.pdf' and an UNKNOWN OCR status."""
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+            temp_file_path = temp_file.name
+
+        try:
+            item = ProcessItem(temp_file_path, ItemType.PDF)
+
+            # Verify the OCR file path suffix and the initial OCR status
+            assert hasattr(item, 'ocr_file')
+            assert item.ocr_file.endswith('_OCR.pdf')
+            assert item.ocr_status == OCRStatus.UNKNOWN
+        finally:
+            os.unlink(temp_file_path)