Skip to content

Commit 15d4e66

Browse files
authored
Feat: add pdf_infer_table_structure parameter to api (#139)
* change and tidy notebook * changelog version bump * gernate api * friendly input * adds test * stick with valid input * move param location * Revert "adds test" This reverts commit 5d8296a. * Revert "change and tidy notebook" This reverts commit 56e2bf1. * move to unittest * add readme * Revert "move to unittest" This reverts commit 4319718. * Revert "Revert "adds test"" This reverts commit 54b55ab. * stage * bump ust 0.7.10->11 * ah readme * tidy * remove content type param * Revert "remove content type param" This reverts commit 74d9040. * note to content type
1 parent 611f3ba commit 15d4e66

File tree

7 files changed

+101
-12
lines changed

7 files changed

+101
-12
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
## 0.0.30
22

3+
* Add table extraction support for hi_res strategy
34
* Add support for `encoding` parameter
45
* Add support for `xml_keep_tags` parameter
56
* Add env variables for additional parallel mode tweaking

README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,21 @@ When elements are extracted from PDFs or images, it may be useful to get their b
9393
| jq -C . | less -R
9494
```
9595

96+
#### PDF Table Extraction
97+
98+
To extract table structure from PDF files, use the `hi_res` strategy and set the `pdf_infer_table_structure` parameter to `true`. When enabled, the response includes each table's text content. The parameter has no effect with other strategies, and it defaults to `false` because table extraction is computationally expensive.
99+
100+
```
101+
curl -X 'POST' \
102+
'https://api.unstructured.io/general/v0/general' \
103+
-H 'accept: application/json' \
104+
-H 'Content-Type: multipart/form-data' \
105+
-F 'files=@sample-docs/layout-parser-paper.pdf' \
106+
-F 'strategy=hi_res' \
107+
-F 'pdf_infer_table_structure=true' \
108+
| jq -C . | less -R
109+
```
110+
96111
#### Encoding
97112

98113
You can specify the encoding to use to decode the text input. If no value is provided, utf-8 will be used.

pipeline-notebooks/pipeline-general.ipynb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -726,6 +726,7 @@
726726
" m_ocr_languages=[],\n",
727727
" m_encoding=[],\n",
728728
" m_xml_keep_tags=[],\n",
729+
" m_pdf_infer_table_structure = [],\n",
729730
" file_content_type=None,\n",
730731
" response_type=\"application/json\"\n",
731732
"):\n",
@@ -756,6 +757,14 @@
756757
" xml_keep_tags_str = (m_xml_keep_tags[0] if len(m_xml_keep_tags) else \"false\").lower()\n",
757758
" xml_keep_tags = xml_keep_tags_str == \"true\"\n",
758759
" \n",
760+
" pdf_infer_table_structure = (\n",
761+
" m_pdf_infer_table_structure[0] if len(m_pdf_infer_table_structure) else \"false\"\n",
762+
" ).lower()\n",
763+
" if strategy == \"hi_res\" and pdf_infer_table_structure == \"true\":\n",
764+
" pdf_infer_table_structure = True\n",
765+
" else:\n",
766+
" pdf_infer_table_structure = False\n",
767+
" \n",
759768
" try:\n",
760769
" if file_content_type == \"application/pdf\" and pdf_parallel_mode_enabled:\n",
761770
" elements = partition_pdf_splits(\n",
@@ -766,6 +775,7 @@
766775
" strategy=strategy,\n",
767776
" ocr_languages=ocr_languages,\n",
768777
" coordinates=show_coordinates,\n",
778+
" pdf_infer_table_structure=pdf_infer_table_structure,\n",
769779
" encoding=encoding,\n",
770780
" )\n",
771781
" else:\n",
@@ -775,6 +785,7 @@
775785
" content_type=file_content_type,\n",
776786
" strategy=strategy,\n",
777787
" ocr_languages=ocr_languages,\n",
788+
" pdf_infer_table_structure=pdf_infer_table_structure,\n",
778789
" encoding=encoding,\n",
779790
" xml_keep_tags=xml_keep_tags,\n",
780791
" )\n",

prepline_general/api/general.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ def pipeline_api(
184184
m_ocr_languages=[],
185185
m_encoding=[],
186186
m_xml_keep_tags=[],
187+
m_pdf_infer_table_structure=[],
187188
file_content_type=None,
188189
response_type="application/json",
189190
):
@@ -213,6 +214,14 @@ def pipeline_api(
213214
xml_keep_tags_str = (m_xml_keep_tags[0] if len(m_xml_keep_tags) else "false").lower()
214215
xml_keep_tags = xml_keep_tags_str == "true"
215216

217+
pdf_infer_table_structure = (
218+
m_pdf_infer_table_structure[0] if len(m_pdf_infer_table_structure) else "false"
219+
).lower()
220+
if strategy == "hi_res" and pdf_infer_table_structure == "true":
221+
pdf_infer_table_structure = True
222+
else:
223+
pdf_infer_table_structure = False
224+
216225
try:
217226
if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
218227
elements = partition_pdf_splits(
@@ -223,6 +232,7 @@ def pipeline_api(
223232
strategy=strategy,
224233
ocr_languages=ocr_languages,
225234
coordinates=show_coordinates,
235+
pdf_infer_table_structure=pdf_infer_table_structure,
226236
encoding=encoding,
227237
)
228238
else:
@@ -232,6 +242,7 @@ def pipeline_api(
232242
content_type=file_content_type,
233243
strategy=strategy,
234244
ocr_languages=ocr_languages,
245+
pdf_infer_table_structure=pdf_infer_table_structure,
235246
encoding=encoding,
236247
xml_keep_tags=xml_keep_tags,
237248
)
@@ -378,6 +389,7 @@ def pipeline_1(
378389
ocr_languages: List[str] = Form(default=[]),
379390
encoding: List[str] = Form(default=[]),
380391
xml_keep_tags: List[str] = Form(default=[]),
392+
pdf_infer_table_structure: List[str] = Form(default=[]),
381393
):
382394
if files:
383395
for file_index in range(len(files)):
@@ -417,6 +429,7 @@ def response_generator(is_multipart):
417429
m_ocr_languages=ocr_languages,
418430
m_encoding=encoding,
419431
m_xml_keep_tags=xml_keep_tags,
432+
m_pdf_infer_table_structure=pdf_infer_table_structure,
420433
response_type=media_type,
421434
filename=file.filename,
422435
file_content_type=file_content_type,

requirements/base.txt

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ et-xmlfile==1.1.0
6060
# via openpyxl
6161
exceptiongroup==1.1.1
6262
# via anyio
63-
fastapi==0.98.0
63+
fastapi==0.99.0
6464
# via unstructured-api-tools
6565
fastjsonschema==2.17.1
6666
# via nbformat
@@ -189,7 +189,7 @@ omegaconf==2.3.0
189189
# via effdet
190190
onnxruntime==1.15.1
191191
# via unstructured-inference
192-
opencv-python==4.7.0.72
192+
opencv-python==4.8.0.74
193193
# via
194194
# layoutparser
195195
# unstructured-inference
@@ -243,7 +243,7 @@ pycocotools==2.0.6
243243
# via effdet
244244
pycparser==2.21
245245
# via cffi
246-
pydantic==1.10.9
246+
pydantic==1.10.10
247247
# via
248248
# argilla
249249
# fastapi
@@ -379,6 +379,7 @@ types-urllib3==1.26.25.13
379379
# via types-requests
380380
typing-extensions==4.7.0
381381
# via
382+
# fastapi
382383
# huggingface-hub
383384
# iopath
384385
# mypy
@@ -387,11 +388,11 @@ typing-extensions==4.7.0
387388
# rich
388389
# starlette
389390
# torch
390-
unstructured[local-inference]==0.7.10
391+
unstructured[local-inference]==0.7.11
391392
# via -r requirements/base.in
392393
unstructured-api-tools==0.10.7
393394
# via -r requirements/base.in
394-
unstructured-inference==0.5.1
395+
unstructured-inference==0.5.4
395396
# via unstructured
396397
urllib3==2.0.3
397398
# via requests

requirements/test.txt

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ execnb==0.1.5
140140
# via nbdev
141141
executing==1.2.0
142142
# via stack-data
143-
fastapi==0.98.0
143+
fastapi==0.99.0
144144
# via
145145
# -r requirements/base.txt
146146
# unstructured-api-tools
@@ -444,7 +444,7 @@ onnxruntime==1.15.1
444444
# via
445445
# -r requirements/base.txt
446446
# unstructured-inference
447-
opencv-python==4.7.0.72
447+
opencv-python==4.8.0.74
448448
# via
449449
# -r requirements/base.txt
450450
# layoutparser
@@ -562,7 +562,7 @@ pycparser==2.21
562562
# via
563563
# -r requirements/base.txt
564564
# cffi
565-
pydantic==1.10.9
565+
pydantic==1.10.10
566566
# via
567567
# -r requirements/base.txt
568568
# argilla
@@ -832,6 +832,7 @@ typing-extensions==4.7.0
832832
# via
833833
# -r requirements/base.txt
834834
# black
835+
# fastapi
835836
# huggingface-hub
836837
# iopath
837838
# ipython
@@ -841,11 +842,11 @@ typing-extensions==4.7.0
841842
# rich
842843
# starlette
843844
# torch
844-
unstructured[local-inference]==0.7.10
845+
unstructured[local-inference]==0.7.11
845846
# via -r requirements/base.txt
846847
unstructured-api-tools==0.10.7
847848
# via -r requirements/base.txt
848-
unstructured-inference==0.5.1
849+
unstructured-inference==0.5.4
849850
# via
850851
# -r requirements/base.txt
851852
# unstructured

scripts/smoketest.py

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,23 @@
1313
skip_inference_tests = os.getenv("SKIP_INFERENCE_TESTS", "").lower() in {"true", "yes", "y", "1"}
1414

1515

16-
def send_document(filename, content_type, strategy="fast", output_format="application/json"):
16+
def send_document(
17+
filename,
18+
content_type,
19+
strategy="fast",
20+
output_format="application/json",
21+
pdf_infer_table_structure="false",
22+
):
23+
# Note: `content_type` is not passed into the request since FastAPI will overwrite it.
1724
files = {"files": (str(filename), open(filename, "rb"))}
1825
return requests.post(
19-
API_URL, files=files, data={"strategy": strategy, "output_format": output_format}
26+
API_URL,
27+
files=files,
28+
data={
29+
"strategy": strategy,
30+
"output_format": output_format,
31+
"pdf_infer_table_structure": pdf_infer_table_structure,
32+
},
2033
)
2134

2235

@@ -107,3 +120,37 @@ def test_strategy_performance():
107120
assert response.status_code == 200
108121

109122
assert hi_res_time > performance_ratio * fast_time
123+
124+
125+
@pytest.mark.skipif(skip_inference_tests, reason="emulated architecture")
126+
@pytest.mark.parametrize(
127+
"strategy, pdf_infer_table_structure, expected_table_num",
128+
[
129+
("fast", "True", 0),
130+
("fast", "False", 0),
131+
("hi_res", "True", 2),
132+
("hi_res", "False", 0),
133+
],
134+
)
135+
def test_table_support(strategy, pdf_infer_table_structure, expected_table_num):
136+
"""
137+
Test that table extraction works on hi_res strategy
138+
"""
139+
test_file = Path("sample-docs") / "layout-parser-paper.pdf"
140+
response = send_document(
141+
test_file,
142+
"application/pdf",
143+
strategy=strategy,
144+
pdf_infer_table_structure=pdf_infer_table_structure,
145+
)
146+
147+
assert response.status_code == 200
148+
extracted_tables = [
149+
el["metadata"]["text_as_html"]
150+
for el in response.json()
151+
if "text_as_html" in el["metadata"].keys()
152+
]
153+
assert len(extracted_tables) == expected_table_num
154+
if expected_table_num > 0:
155+
# Test that text from a table is extracted
156+
assert "Layouts of scanned modern magazines and scientific reports" in extracted_tables[0]

0 commit comments

Comments
 (0)