Skip to content

Commit f4236c8

Browse files
authored
Fix/pdf miner source property (#228)
This PR adds three possible values for the `source` field: * `pdfminer` as the source for elements obtained directly from PDFs. * `OCR-tesseract` and `OCR-paddle` for elements obtained with the respective OCR engines. All of these new values are stored in a new class `Source` in `unstructured_inference/constants.py`. This helps users filter elements depending on how they were obtained.
1 parent c4d3e8b commit f4236c8

File tree

10 files changed

+62
-16
lines changed

10 files changed

+62
-16
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.6.5-dev0
2+
3+
* Fix `source` property for elements generated by pdfminer.
4+
* Add 'OCR-tesseract' and 'OCR-paddle' as sources for elements generated by OCR.
5+
16
## 0.6.4
27

38
* add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task

test_unstructured_inference/inference/test_layout.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@
1010
from PIL import Image
1111

1212
import unstructured_inference.models.base as models
13-
from unstructured_inference.constants import OCRMode
13+
from unstructured_inference.constants import OCRMode, Source
1414
from unstructured_inference.inference import elements, layout, layoutelement
1515
from unstructured_inference.models import chipper, detectron2, tesseract
16+
from unstructured_inference.models.base import get_model
1617
from unstructured_inference.models.unstructuredmodel import (
1718
UnstructuredElementExtractionModel,
1819
UnstructuredObjectDetectionModel,
@@ -117,6 +118,19 @@ def detect(self, *args):
117118
assert elements.ocr(text_block, image=image) == ""
118119

119120

121+
def test_ocr_source():
122+
file = "sample-docs/loremipsum-flat.pdf"
123+
model = get_model("yolox_tiny")
124+
doc = layout.DocumentLayout.from_file(
125+
file,
126+
model,
127+
ocr_mode=OCRMode.FULL_PAGE.value,
128+
supplement_with_ocr_elements=True,
129+
ocr_strategy="force",
130+
)
131+
assert Source.OCR_TESSERACT in {e.source for e in doc.pages[0].elements}
132+
133+
120134
class MockLayoutModel:
121135
def __init__(self, layout):
122136
self.layout_return = layout
@@ -678,6 +692,7 @@ def test_ocr_image(region, objects, ocr_strategy, expected):
678692
@pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"])
679693
def test_load_pdf(filename):
680694
layouts, images = layout.load_pdf(f"sample-docs/{filename}")
695+
assert Source.PDFMINER in {e.source for e in layouts[0]}
681696
assert len(layouts)
682697
for lo in layouts:
683698
assert len(lo)

test_unstructured_inference/inference/test_layout_element.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from layoutparser.elements import TextBlock
33
from layoutparser.elements.layout_elements import Rectangle as LPRectangle
44

5-
from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR
5+
from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR, Source
66
from unstructured_inference.inference.elements import TextRegion
77
from unstructured_inference.inference.layoutelement import (
88
LayoutElement,
@@ -166,7 +166,7 @@ def test_layout_element_from_lp_textblock():
166166
300,
167167
300,
168168
text="Sample Text",
169-
source="detectron2_lp",
169+
source=Source.DETECTRON2_LP,
170170
type="Text",
171171
prob=0.99,
172172
)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.4" # pragma: no cover
1+
__version__ = "0.6.5-dev0" # pragma: no cover

unstructured_inference/constants.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,15 @@ class AnnotationResult(Enum):
1111
PLOT = "plot"
1212

1313

14+
class Source(Enum):
15+
YOLOX = "yolox"
16+
DETECTRON2_ONNX = "detectron2_onnx"
17+
DETECTRON2_LP = "detectron2_lp"
18+
OCR_TESSERACT = "OCR-tesseract"
19+
OCR_PADDLE = "OCR-paddle"
20+
PDFMINER = "pdfminer"
21+
MERGED = "merged"
22+
23+
1424
SUBREGION_THRESHOLD_FOR_OCR = 0.5
1525
FULL_PAGE_REGION_THRESHOLD = 0.99

unstructured_inference/inference/elements.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from scipy.sparse.csgraph import connected_components
1313

1414
from unstructured_inference.config import inference_config
15+
from unstructured_inference.constants import Source
1516
from unstructured_inference.logger import logger
1617
from unstructured_inference.math import safe_division
1718
from unstructured_inference.models import tesseract
@@ -197,7 +198,7 @@ def intersections(*rects: Rectangle):
197198
@dataclass
198199
class TextRegion(Rectangle):
199200
text: Optional[str] = None
200-
source: Optional[str] = None
201+
source: Optional[Source] = None
201202

202203
def __str__(self) -> str:
203204
return str(self.text)

unstructured_inference/inference/layout.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from PIL import Image, ImageSequence
1414
from pytesseract import Output
1515

16-
from unstructured_inference.constants import OCRMode
16+
from unstructured_inference.constants import OCRMode, Source
1717
from unstructured_inference.inference.elements import (
1818
EmbeddedTextRegion,
1919
ImageTextRegion,
@@ -677,7 +677,14 @@ def load_pdf(
677677
else:
678678
continue
679679

680-
text_region = element_class(x1 * coef, y1 * coef, x2 * coef, y2 * coef, text=_text)
680+
text_region = element_class(
681+
x1 * coef,
682+
y1 * coef,
683+
x2 * coef,
684+
y2 * coef,
685+
text=_text,
686+
source=Source.PDFMINER,
687+
)
681688

682689
if text_region.area > 0:
683690
layout.append(text_region)
@@ -738,7 +745,7 @@ def parse_ocr_data_tesseract(ocr_data: dict) -> List[TextRegion]:
738745
(x1, y1, x2, y2) = l, t, l + w, t + h
739746
text = ocr_data["text"][i]
740747
if text:
741-
text_region = TextRegion(x1, y1, x2, y2, text=text, source="OCR")
748+
text_region = TextRegion(x1, y1, x2, y2, text=text, source=Source.OCR_TESSERACT)
742749
text_regions.append(text_region)
743750

744751
return text_regions
@@ -774,7 +781,7 @@ def parse_ocr_data_paddle(ocr_data: list) -> List[TextRegion]:
774781
y2 = max([i[1] for i in line[0]])
775782
text = line[1][0]
776783
if text:
777-
text_region = TextRegion(x1, y1, x2, y2, text)
784+
text_region = TextRegion(x1, y1, x2, y2, text, source=Source.OCR_PADDLE)
778785
text_regions.append(text_region)
779786

780787
return text_regions

unstructured_inference/inference/layoutelement.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
from PIL import Image
1010

1111
from unstructured_inference.config import inference_config
12-
from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, SUBREGION_THRESHOLD_FOR_OCR
12+
from unstructured_inference.constants import (
13+
FULL_PAGE_REGION_THRESHOLD,
14+
SUBREGION_THRESHOLD_FOR_OCR,
15+
Source,
16+
)
1317
from unstructured_inference.inference.elements import (
1418
ImageTextRegion,
1519
Rectangle,
@@ -74,7 +78,7 @@ def from_lp_textblock(cls, textblock: TextBlock):
7478
text = textblock.text
7579
type = textblock.type
7680
prob = textblock.score
77-
return cls(x1, y1, x2, y2, text=text, source="detectron2_lp", type=type, prob=prob)
81+
return cls(x1, y1, x2, y2, text=text, source=Source.DETECTRON2_LP, type=type, prob=prob)
7882

7983

8084
def interpret_table_block(text_block: TextRegion, image: Image.Image) -> str:
@@ -311,8 +315,10 @@ def merge_text_regions(regions: List[TextRegion]) -> TextRegion:
311315

312316
merged_text = " ".join([tr.text for tr in regions if tr.text])
313317
sources = [*{tr.source for tr in regions}]
314-
source = sources.pop() if len(sources) == 1 else "merged:".join(sources) # type:ignore
315-
return TextRegion(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text)
318+
source = sources.pop() if len(sources) == 1 else Source.MERGED
319+
element = TextRegion(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text)
320+
setattr(element, "merged_sources", sources)
321+
return element
316322

317323

318324
def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutElement]:
@@ -332,7 +338,7 @@ def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutE
332338
r.x2,
333339
r.y2,
334340
text=r.text,
335-
source=None,
341+
source=r.source,
336342
type="UncategorizedText",
337343
)
338344
for r in merged_regions

unstructured_inference/models/detectron2onnx.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from onnxruntime.quantization import QuantType, quantize_dynamic
1010
from PIL import Image
1111

12+
from unstructured_inference.constants import Source
1213
from unstructured_inference.inference.layoutelement import LayoutElement
1314
from unstructured_inference.logger import logger, logger_onnx
1415
from unstructured_inference.models.unstructuredmodel import (
@@ -158,7 +159,7 @@ def postprocess(
158159
text=None,
159160
type=detected_class,
160161
prob=conf,
161-
source="detectron2_onnx",
162+
source=Source.DETECTRON2_ONNX,
162163
)
163164

164165
regions.append(region)

unstructured_inference/models/yolox.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from onnxruntime.quantization import QuantType, quantize_dynamic
1515
from PIL import Image
1616

17+
from unstructured_inference.constants import Source
1718
from unstructured_inference.inference.layoutelement import LayoutElement
1819
from unstructured_inference.logger import logger
1920
from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
@@ -149,7 +150,7 @@ def image_processing(
149150
text=None,
150151
type=detected_class,
151152
prob=prob,
152-
source="yolox",
153+
source=Source.YOLOX,
153154
)
154155

155156
regions.append(region)

0 commit comments

Comments
 (0)