Skip to content

Commit 7cd724a

Browse files
authored
remove cv2 preprocessing (#204)
This PR removes `cv2` preprocessing before the OCR step in the table transformer. In practice, the preprocessing tends to lead to lower recall from OCR results. For example, this image:

![table-multi-row-column-cells](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/baffd3bd-d34d-404f-83f2-b29f3c42fb6e)

with the current code:

```python
from unstructured_inference.models.tables import UnstructuredTableTransformerModel
from PIL import Image

model = UnstructuredTableTransformerModel()
model.initialize("microsoft/table-transformer-structure-recognition")
prediction = model.predict(Image.open('table.png'))
```

produces

![Screenshot 2023-09-08 at 9 39 07 AM](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/00857b02-c75e-446c-8c34-0b6e2d42b0f7)

after removing the preprocessing (this PR):

![Screenshot 2023-09-08 at 12 15 13 PM](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/2171570c-641a-46a1-9be7-4664497d7727)
1 parent bdee102 commit 7cd724a

File tree

5 files changed

+92
-129
lines changed

5 files changed

+92
-129
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.5.24
2+
3+
* remove `cv2` preprocessing step before OCR step in table transformer
4+
15
## 0.5.23
26

37
* Add functionality to bring back embedded images in PDF
77.9 KB
Loading

test_unstructured_inference/models/test_tables.py

Lines changed: 82 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -34,102 +34,6 @@ def test_load_donut_model(model_path):
3434
assert type(table_model.model.model.decoder) is TableTransformerDecoder
3535

3636

37-
@pytest.fixture()
38-
def sample_table_transcript(platform_type):
39-
if platform_type == "x86_64":
40-
out = (
41-
'<table><thead><th colspan="6">About these Coverage Examples:</th></thead><thead><th>'
42-
'</th><th colspan="5">This is not a cost estimator. Treatments shown are just examples '
43-
"of how this plan might cover medical care. Your actual costs will be different "
44-
"depending on the actual care you receive, the prices your providers charge, and many "
45-
"other factors. Focus on the cost sharing amounts (deductibles, copayments and "
46-
"coinsurance) and excluded services under the plan. Use this information to compare "
47-
"the portion of costs you might pay under different health plans. Please note these "
48-
'coverage examples are based on self-only coverage</th></thead><thead><th colspan="2">'
49-
"Peg is Having a Baby (9 months of in-network pre-natal care and a hospital delivery)</"
50-
"th><th>Managing Joe's type 2 Diabetes (a year of routine in-network care of a well- "
51-
'controlled condition)</th><th></th><th colspan="2">Mia\'s Simple Fracture (in-network '
52-
'emergency room visit and follow up care)</th></thead><tr><td colspan="2">The plan\'s '
53-
"overall deductible $750 Specialist copayment $50 Hospital (facility) coinsurance 10"
54-
"% Other coinsurance 10%</td><td>The plan's overall deductible Specialist copayment "
55-
r"Hospital (facility) coinsurance Other coinsurance</td><td>$750 $50 10% 10%</td><td>"
56-
"The plan's overall deductible Specialist copayment Hospital (facility) coinsurance "
57-
r'Other coinsurance</td><td>$750 $50 10% 10%</td></tr><tr><td colspan="2" rowspan="2">'
58-
"This EXAMPLE event includes services like: Specialist office visits (prenatal care) "
59-
"Childbirth/Delivery Professional Services Childbirth/Delivery Facility Services</td><"
60-
'td colspan="2" rowspan="2">This EXAMPLE event includes services like: Primary care '
61-
"physician office visits (including disease education) Diagnostic tests (blood work) "
62-
'Prescription drugs Durable medical equipment (glucose meter)</td><td colspan="2" '
63-
'rowspan="2">This EXAMPLE event includes services like: Emergency room care (including '
64-
"medical Diagnostic test (x-ray) Durable medical equipment (crutches) Rehabilitation "
65-
'services (physical therapy)</td></tr><tr><td colspan="2">Diagnostic tests ('
66-
"ultrasounds and blood work) Specialist visit (anesthesia)</td></tr><tr><td>Total "
67-
"Example Cost</td><td>$12,700</td><td>Total Example Cost</td><td>$5,600</td><td>Total "
68-
'Example Cost</td><td>$2,800</td></tr><tr><td colspan="2">In this example, Peg would '
69-
'pay:</td><td>In this example, Joe would pay:</td><td colspan="3">In this example, Mia '
70-
'would pay:</td></tr><tr><td>Cost Sharing</td><td></td><td colspan="2">Cost Sharing</td'
71-
"><td>Cost Sharing</td><td></td></tr><tr><td>Deductibles</td><td>$750</td><td>"
72-
"Deductibles</td><td>$120</td><td>Deductibles</td><td>$750</td></tr><tr><td>Copayments"
73-
'</td><td>$30</td><td>Copayments</td><td>$700</td><td colspan="2" rowspan="2">'
74-
'Copayments $400 Coinsurance $30</td></tr><tr><td colspan="2" rowspan="2">Coinsurance $'
75-
"1,200 What isn't covered</td><td>Coinsurance</td><td>$0</td></tr><tr><td>What isn't "
76-
"covered</td><td></td><td>What isn't covered</td><td></td></tr><tr><td>Limits or "
77-
"exclusions</td><td>$20</td><td>Limits or exclusions</td><td>$20</td><td>Limits or "
78-
"exclusions</td><td>$0</td></tr><tr><td>The total Peg would pay is</td><td>$2,000</td><"
79-
'td>The total Joe would pay is</td><td>$840</td><td colspan="2">The total Mia would '
80-
"pay is $1,180</td></tr><tr><td>Plan Name: NVIDIA PPO PlanPIan ID: 14603022</td><td></"
81-
"td><td>The plan would be responsible for the other costs of these EXAMPLE covered "
82-
"services</td><td></td><td></td><td>Page 8 of 8</td></tr></table>"
83-
)
84-
else:
85-
out = (
86-
'<table><thead><th colspan="6">About these Coverage Examples:</th></thead><thead><th>'
87-
"This is not a cost depending on the (deductibles, pay under different</th><th "
88-
'colspan="5">estimator. |reatments shown are just examples of how this plan might '
89-
"cover medical care. Your actual costs will be different actual care you receive, the "
90-
"prices your providers charge, and many other factors. Focus on the cost sharing "
91-
"amounts copayments and coinsurance) and excluded services under the plan. Use this "
92-
"information to compare the portion of costs you might health plans. Please note these "
93-
'coverage examples are based on self-only coverage.</th></thead><thead><th colspan="2">'
94-
"Peg is Having a Baby (9 months of in-network pre-natal care and a hospital delivery)</"
95-
"th><th>Managing Joe's type 2 (a year of routine in-network care controlled conaition"
96-
')</th><th>Diabetes of a well-</th><th colspan="2">Mia\'s Simple Fracture (in-network '
97-
'emergency room visit and follow up care)</th></thead><tr><td colspan="2">= The plan'
98-
"'s overall deductible $750 = Specialist copayment $50 = Hospital (facility) "
99-
"coinsurance 10% = Other coinsurance 10%</td><td>= The plan's overall deductible = "
100-
"Specialist copayment = Hospital (facility) coinsurance = Other coinsurance</td><td>$"
101-
r"750 $50 10% 10%</td><td>= The plan's overall deductible = Specialist copayment = "
102-
r"Hospital (facility) coinsurance = Other coinsurance</td><td>$750 $50 10% 10%</td></tr"
103-
'><tr><td colspan="2" rowspan="2">This EXAMPLE event includes services like: '
104-
"specialist office visits (prenatal care) Childbirth/Delivery Professional Services "
105-
'Childbirth/Delivery Facility Services</td><td colspan="2" rowspan="2">This EXAMPLE '
106-
"event includes services like: Primary care physician office visits (including aisease "
107-
"education) Diagnostic tests (b/ood work) Prescription drugs Durable medical equipment "
108-
'(/g/ucose meter)</td><td colspan="2" rowspan="2">This EXAMPLE event includes services '
109-
"like: Emergency room care (including meaical suoplies) Diagnostic test (x-ray) "
110-
"Durable medical equipment (crutches) Rehabilitation services (o/hysical therapy)</td"
111-
'></tr><tr><td colspan="2">Diagnostic tests (u/trasounas and blood work) specialist '
112-
"visit (anesthesia)</td></tr><tr><td>Total Example Cost</td><td>| $12,700</td><td>"
113-
"Total Example Cost |</td><td>$5,600</td><td>Total Example Cost</td><td>| $2,800</td></"
114-
'tr><tr><td colspan="2">In this example, Peg would pay:</td><td>In this example, Joe '
115-
'would pay:</td><td colspan="3">In this example, Mia would pay:</td></tr><tr><td>Cost '
116-
'Sharing</td><td></td><td colspan="2">Cost Sharing</td><td>Cost Sharing</td><td></td></'
117-
"tr><tr><td>Deductibles</td><td>$/50</td><td>Deductibles</td><td>$120</td><td>"
118-
"Deductibles</td><td>$/50</td></tr><tr><td>Copayments</td><td>$30</td><td>Copayments</"
119-
'td><td>$/00</td><td colspan="2" rowspan="2">Copayments $400 Coinsurance $30</td></tr><'
120-
'tr><td colspan="2" rowspan="2">Coinsurance $1,200 What isn t covered</td><td>'
121-
"Coinsurance</td><td></td></tr><tr><td>What isnt covered</td><td></td><td>What isnt "
122-
"covered</td><td></td></tr><tr><td>Limits or exclusions</td><td>$20</td><td>Limits or "
123-
"exclusions |</td><td>$20</td><td>Limits or exclusions</td><td></td></tr><tr><td>The "
124-
"total Peg would pay is</td><td>$2,000</td><td>The total Joe would pay is</td><td>9840"
125-
'</td><td colspan="2">The total Mia would pay is $1,180</td></tr><tr><td>Plan Name: '
126-
"NVIDIA PPO Plan</td><td>The plan would Plan ID: 14603022</td><td>be responsible for "
127-
"the other costs of these</td><td>EXAMPLE</td><td>covered services.</td><td>Page 8 of 8"
128-
"</td></tr></table>"
129-
)
130-
return out
131-
132-
13337
@pytest.mark.parametrize(
13438
("input_test", "output_test"),
13539
[
@@ -185,7 +89,12 @@ def test_nms(input_test, output_test):
18589
{
18690
"label": "table spanning cell",
18791
"score": 0.526617169380188,
188-
"bbox": [1446.2801513671875, 1023.817138671875, 2114.3525390625, 1099.20166015625],
92+
"bbox": [
93+
1446.2801513671875,
94+
1023.817138671875,
95+
2114.3525390625,
96+
1099.20166015625,
97+
],
18998
"projected row header": False,
19099
"header": False,
191100
"row_numbers": [3, 4],
@@ -210,7 +119,12 @@ def test_nms(input_test, output_test):
210119
{
211120
"label": "table spanning cell",
212121
"score": 0.526617169380188,
213-
"bbox": [1446.2801513671875, 1023.817138671875, 2114.3525390625, 1099.20166015625],
122+
"bbox": [
123+
1446.2801513671875,
124+
1023.817138671875,
125+
2114.3525390625,
126+
1099.20166015625,
127+
],
214128
"projected row header": False,
215129
"header": False,
216130
"row_numbers": [3, 4],
@@ -235,7 +149,12 @@ def test_nms(input_test, output_test):
235149
{
236150
"label": "table spanning cell",
237151
"score": 0.526617169380188,
238-
"bbox": [1446.2801513671875, 1023.817138671875, 2114.3525390625, 1099.20166015625],
152+
"bbox": [
153+
1446.2801513671875,
154+
1023.817138671875,
155+
2114.3525390625,
156+
1099.20166015625,
157+
],
239158
"projected row header": False,
240159
"header": False,
241160
"row_numbers": [3, 4],
@@ -260,7 +179,12 @@ def test_nms(input_test, output_test):
260179
{
261180
"label": "table spanning cell",
262181
"score": 0.526617169380188,
263-
"bbox": [1446.2801513671875, 1023.817138671875, 2114.3525390625, 1099.20166015625],
182+
"bbox": [
183+
1446.2801513671875,
184+
1023.817138671875,
185+
2114.3525390625,
186+
1099.20166015625,
187+
],
264188
"projected row header": False,
265189
"header": False,
266190
"row_numbers": [3, 4],
@@ -402,22 +326,35 @@ def test_align_rows(rows, bbox, output):
402326
assert postprocess.align_rows(rows, bbox) == output
403327

404328

329+
# TODO: break this test down so it doesn't account for nearly 8% of test coverage
405330
@pytest.mark.parametrize(
406331
("model_path", "platform_type"),
407332
[
408333
("microsoft/table-transformer-structure-recognition", "arm64"),
409334
("microsoft/table-transformer-structure-recognition", "x86_64"),
410335
],
411336
)
412-
def test_table_prediction(model_path, sample_table_transcript, platform_type):
337+
def test_table_prediction(model_path, platform_type):
413338
with patch("platform.machine", return_value=platform_type):
414339
table_model = tables.UnstructuredTableTransformerModel()
415340
from PIL import Image
416341

417342
table_model.initialize(model=model_path)
418-
img = Image.open("./sample-docs/example_table.jpg").convert("RGB")
343+
img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB")
419344
prediction = table_model.predict(img)
420-
assert prediction.strip() == sample_table_transcript.strip()
345+
# assert that a header cell spanning two rows is detected
346+
assert '<table><thead><th rowspan="2">' in prediction
347+
# one of the safest rows to detect should be present
348+
assert (
349+
"<tr>"
350+
"<td>Blind</td>"
351+
"<td>5</td>"
352+
"<td>1</td>"
353+
"<td>4</td>"
354+
"<td>34.5%, n=1</td>"
355+
"<td>1199 sec, n=1</td>"
356+
"</tr>"
357+
) in prediction
421358

422359

423360
def test_intersect():
@@ -551,18 +488,53 @@ def test_extract_text_from_spans(spans, join_with_space, expected):
551488
"column_numbers": [1, 2, 3],
552489
"score": 0.9,
553490
},
554-
{"header": "hi", "row_numbers": [1], "column_numbers": [1], "score": 0.9},
555-
{"header": "hi", "row_numbers": [1], "column_numbers": [2], "score": 0.9},
556-
{"header": "hi", "row_numbers": [1], "column_numbers": [3], "score": 0.9},
491+
{
492+
"header": "hi",
493+
"row_numbers": [1],
494+
"column_numbers": [1],
495+
"score": 0.9,
496+
},
497+
{
498+
"header": "hi",
499+
"row_numbers": [1],
500+
"column_numbers": [2],
501+
"score": 0.9,
502+
},
503+
{
504+
"header": "hi",
505+
"row_numbers": [1],
506+
"column_numbers": [3],
507+
"score": 0.9,
508+
},
557509
],
558510
4,
559511
),
560512
(
561513
[
562-
{"header": "hi", "row_numbers": [0], "column_numbers": [0], "score": 0.9},
563-
{"header": "hi", "row_numbers": [1], "column_numbers": [0], "score": 0.9},
564-
{"header": "hi", "row_numbers": [1, 2], "column_numbers": [0], "score": 0.9},
565-
{"header": "hi", "row_numbers": [3], "column_numbers": [0], "score": 0.9},
514+
{
515+
"header": "hi",
516+
"row_numbers": [0],
517+
"column_numbers": [0],
518+
"score": 0.9,
519+
},
520+
{
521+
"header": "hi",
522+
"row_numbers": [1],
523+
"column_numbers": [0],
524+
"score": 0.9,
525+
},
526+
{
527+
"header": "hi",
528+
"row_numbers": [1, 2],
529+
"column_numbers": [0],
530+
"score": 0.9,
531+
},
532+
{
533+
"header": "hi",
534+
"row_numbers": [3],
535+
"column_numbers": [0],
536+
"score": 0.9,
537+
},
566538
],
567539
3,
568540
),
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.23" # pragma: no cover
1+
__version__ = "0.5.24" # pragma: no cover

unstructured_inference/models/tables.py

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from pathlib import Path
88
from typing import Optional, Union
99

10-
import cv2
1110
import numpy as np
1211
import pandas as pd
1312
import pytesseract
@@ -78,21 +77,9 @@ def get_tokens(self, x: Image):
7877
"No module named 'unstructured_paddleocr', falling back to tesseract",
7978
)
8079
pass
81-
zoom = 6
82-
img = cv2.resize(
83-
cv2.cvtColor(np.array(x), cv2.COLOR_RGB2BGR),
84-
None,
85-
fx=zoom,
86-
fy=zoom,
87-
interpolation=cv2.INTER_CUBIC,
88-
)
89-
90-
kernel = np.ones((1, 1), np.uint8)
91-
img = cv2.dilate(img, kernel, iterations=1)
92-
img = cv2.erode(img, kernel, iterations=1)
9380

9481
ocr_df: pd.DataFrame = pytesseract.image_to_data(
95-
Image.fromarray(img),
82+
x,
9683
output_type="data.frame",
9784
)
9885

@@ -103,10 +90,10 @@ def get_tokens(self, x: Image):
10390
tokens.append(
10491
{
10592
"bbox": [
106-
idtx.left / zoom,
107-
idtx.top / zoom,
108-
(idtx.left + idtx.width) / zoom,
109-
(idtx.top + idtx.height) / zoom,
93+
idtx.left,
94+
idtx.top,
95+
idtx.left + idtx.width,
96+
idtx.top + idtx.height,
11097
],
11198
"text": idtx.text,
11299
},

0 commit comments

Comments
 (0)