|
6 | 6 | import pandas as pd |
7 | 7 | import pytest |
8 | 8 | import unstructured_pytesseract |
| 9 | +from bs4 import BeautifulSoup, Tag |
9 | 10 | from pdf2image.exceptions import PDFPageCountError |
10 | 11 | from PIL import Image, UnidentifiedImageError |
11 | 12 | from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion |
@@ -71,8 +72,8 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch): |
71 | 72 |
|
72 | 73 | def test_get_ocr_layout_from_image_tesseract(monkeypatch): |
73 | 74 | monkeypatch.setattr( |
74 | | - unstructured_pytesseract, |
75 | | - "image_to_data", |
| 75 | + OCRAgentTesseract, |
| 76 | + "image_to_data_with_character_confidence_filter", |
76 | 77 | lambda *args, **kwargs: pd.DataFrame( |
77 | 78 | { |
78 | 79 | "left": [10, 20, 30, 0], |
@@ -445,8 +446,8 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch): |
445 | 446 | monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000") |
446 | 447 | monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000") |
447 | 448 | monkeypatch.setattr( |
448 | | - unstructured_pytesseract, |
449 | | - "image_to_data", |
| 449 | + OCRAgentTesseract, |
| 450 | + "image_to_data_with_character_confidence_filter", |
450 | 451 | lambda *args, **kwargs: pd.DataFrame( |
451 | 452 | { |
452 | 453 | "left": [10, 20, 30, 0], |
@@ -484,3 +485,80 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions): |
484 | 485 | # Check if the final layout contains both original elements and OCR-derived elements |
485 | 486 | assert all(element in final_layout for element in mock_out_layout) |
486 | 487 | assert any(element in final_layout for element in ocr_elements) |
| 488 | + |
| 489 | + |
def _create_hocr_word_span(
    characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
) -> Tag:
    """Build a synthetic hOCR word element for use in tests.

    The result is a ``<span class='ocrx_word'>`` whose ``title`` carries the four
    ``word_bbox`` values and a fixed ``x_wconf 64``. One ``<span class='ocrx_cinfo'>``
    child is appended per entry of ``characters``.

    Args:
        characters: ``(char, x_conf)`` pairs; ``char`` becomes the child span's text and
            ``x_conf`` is written verbatim into that span's ``title``.
        word_bbox: the four values written after ``bbox`` in the word span's ``title``.

    Returns:
        The word-level tag with its character-level children attached.
    """
    x0, y0, x1, y1 = word_bbox[0], word_bbox[1], word_bbox[2], word_bbox[3]
    word_markup = (
        f"<span class='ocrx_word' title='"
        f"bbox {x0} {y0} {x1} {y1}"
        f"; x_wconf 64'></span>"
    )
    word_tag = BeautifulSoup(word_markup, "html.parser").span

    for char, x_conf in characters:
        # Character boxes are irrelevant to these tests, so they are all zeros.
        char_markup = f"""
            <span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>
            """  # noqa: E501
        char_tag = BeautifulSoup(char_markup, "html.parser").span
        word_tag.append(char_tag)

    return word_tag
| 508 | + |
| 509 | + |
def test_extract_word_from_hocr():
    """Check ``extract_word_from_hocr`` output at increasing confidence thresholds.

    The fixture word has per-character ``x_conf`` values from 99.0 down to 45.0; each
    case pairs a threshold with the text the extractor is expected to return for it.
    """
    word_span = _create_hocr_word_span(
        [
            ("w", "99.0"),
            ("o", "98.5"),
            ("r", "97.5"),
            ("d", "96.0"),
            ("!", "50.0"),
            ("@", "45.0"),
        ],
        (10, 9, 70, 22),
    )

    # (character confidence threshold, expected extracted text), checked in order.
    cases = (
        (0.0, "word!@"),
        (0.960, "word"),
        (0.990, "w"),
        (0.999, ""),
    )
    for threshold, expected_text in cases:
        text = OCRAgentTesseract.extract_word_from_hocr(word_span, threshold)
        assert text == expected_text
| 533 | + |
| 534 | + |
def test_hocr_to_dataframe():
    """Check that one hOCR word yields a single, correctly populated dataframe row.

    With the bbox ``(10, 9, 70, 22)`` the expected ``width`` and ``height`` are
    ``70 - 10`` and ``22 - 9``; at threshold 0.960 the expected text is ``"word"``.
    """
    word_bbox = (10, 9, 70, 22)
    characters = [
        ("w", "99.0"),
        ("o", "98.5"),
        ("r", "97.5"),
        ("d", "96.0"),
        ("!", "50.0"),
        ("@", "45.0"),
    ]
    word_markup = str(_create_hocr_word_span(characters, word_bbox))

    df = OCRAgentTesseract().hocr_to_dataframe(
        hocr=word_markup, character_confidence_threshold=0.960
    )

    assert df.shape == (1, 5)

    expected_first_row = {
        "left": 10,
        "top": 9,
        "width": 60,
        "height": 13,
        "text": "word",
    }
    for column, expected_value in expected_first_row.items():
        assert df[column].iloc[0] == expected_value
| 554 | + |
| 555 | + |
def test_hocr_to_dataframe_when_no_prediction_empty_df():
    """Check that empty hOCR input yields an empty dataframe that still has its columns.

    The frame must have zero rows and five columns, and each of the columns read by
    the other hOCR tests (``left``, ``top``, ``width``, ``height``, ``text``) must be
    present.
    """
    df = OCRAgentTesseract().hocr_to_dataframe(hocr="")

    assert df.shape == (0, 5)
    # Previously "text" was asserted twice and "height" was never checked.
    for column in ("left", "top", "width", "height", "text"):
        assert column in df.columns
0 commit comments