Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions ocr_service/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from scansynclib.logging import logger
from scansynclib.ProcessItem import ProcessItem, ProcessStatus, OCRStatus
from scansynclib.sqlite_wrapper import update_scanneddata_database
from scansynclib.helpers import connect_rabbitmq, forward_to_rabbitmq
from scansynclib.helpers import connect_rabbitmq, forward_to_rabbitmq, extract_text
import pickle
import ocrmypdf
import os
from datetime import datetime
import time
import pika.exceptions
Expand Down Expand Up @@ -36,13 +37,26 @@ def start_processing(item: ProcessItem):

try:
result = ocrmypdf.ocr(item.local_file_path, item.ocr_file, output_type='pdfa', skip_text=True, rotate_pages=True, jpg_quality=80, png_quality=80, optimize=2, language=["eng", "deu"], tesseract_timeout=120)
logger.debug(f"OCR exited with code {result}")

if result != 0:
logger.error(f"OCR exited with code {result}")
item.ocr_status = OCRStatus.FAILED
else:
logger.info(f"OCR processing completed: {item.filename}")
logger.debug(f"OCR exited with code {result}")
item.ocr_status = OCRStatus.COMPLETED

# Verify that the OCR file actually contains text
if os.path.exists(item.ocr_file):
extracted_text = extract_text(item.ocr_file).strip()
if extracted_text:
logger.info(f"OCR verification successful: extracted {len(extracted_text)} characters from {item.filename}")
item.ocr_status = OCRStatus.COMPLETED
else:
logger.warning(f"OCR verification failed: no text found in OCR output file {item.ocr_file}")
item.ocr_status = OCRStatus.FAILED
else:
logger.error(f"OCR output file not found: {item.ocr_file}")
item.ocr_status = OCRStatus.OUTPUT_ERROR
except ocrmypdf.UnsupportedImageFormatError:
logger.error(f"Unsupported image format: {item.local_file_path}")
item.ocr_status = OCRStatus.UNSUPPORTED
Expand Down
62 changes: 62 additions & 0 deletions tests/test_ocr_verification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pytest
import os
import tempfile
from unittest.mock import Mock, patch, mock_open
from scansynclib.ProcessItem import ProcessItem, ItemType, OCRStatus


class TestOCRTextVerification:
    """Test OCR text verification building blocks.

    The main OCR service module is deliberately not imported here; the tests
    only touch ``scansynclib.helpers.extract_text`` and ``ProcessItem``.
    Temporary files live under pytest's ``tmp_path`` fixture, so cleanup
    happens automatically even when an assertion fails.
    """

    def test_extract_text_returns_empty_string_on_empty_pdf(self, tmp_path):
        """extract_text returns an empty string for a header-only PDF."""
        from scansynclib.helpers import extract_text

        # Only the version header is written: a malformed PDF with no text.
        pdf_path = tmp_path / "empty.pdf"
        pdf_path.write_bytes(b"%PDF-1.4\n")

        assert extract_text(str(pdf_path)) == ""

    def test_extract_text_returns_empty_string_on_nonexistent_file(self, tmp_path):
        """extract_text returns an empty string when the file does not exist."""
        from scansynclib.helpers import extract_text

        # A name inside a fresh temporary directory is guaranteed to be absent.
        missing_path = tmp_path / "nonexistent" / "file.pdf"

        assert extract_text(str(missing_path)) == ""

    @patch('scansynclib.helpers.PdfReader')
    def test_extract_text_strips_whitespace(self, mock_pdf_reader):
        """extract_text returns page text verbatim, whitespace included.

        Despite this test's name, the helper is expected NOT to strip the
        text; callers that need trimmed text must strip it themselves.
        """
        from scansynclib.helpers import extract_text

        # Fake a single-page document whose text is padded with whitespace.
        padded_text = " \n\t Some text \n\t "
        mock_page = Mock()
        mock_page.extract_text.return_value = padded_text
        mock_reader = Mock()
        mock_reader.pages = [mock_page]
        mock_pdf_reader.return_value = mock_reader

        result = extract_text("dummy_path.pdf")

        # The raw text must come back unchanged, not stripped.
        assert result == padded_text

    def test_process_item_has_ocr_file_attribute(self, tmp_path):
        """A new ProcessItem exposes an ``_OCR.pdf`` path and an UNKNOWN OCR status."""
        # An empty file with a .pdf suffix is all the constructor is given here.
        source_pdf = tmp_path / "scan.pdf"
        source_pdf.touch()

        item = ProcessItem(str(source_pdf), ItemType.PDF)

        # Verify OCR file path is set correctly
        assert hasattr(item, 'ocr_file')
        assert item.ocr_file.endswith('_OCR.pdf')
        assert item.ocr_status == OCRStatus.UNKNOWN