Skip to content

Commit 75eef77

Browse files
Chore: add image table support (#167)
* version and changelog * stage test * version bump * stage notbook * pip compile * add skip support param * OCD refactor parameters * add one more todo * note ticket number * nvm no need a ticket * Revert "OCD refactor parameters" This reverts commit 3d66c6a. * json looks good need to remove field for csv * Revert "Revert "OCD refactor parameters"" This reverts commit ddd953b. * sus notebook output * okay to have file * use index * unit test for param * readme * note nit * no need to add all param in parallel test * missing dot * new make pip compile * Update test_general/api/test_app.py Co-authored-by: shreyanid <[email protected]> * pass empty param in readme example * lets bump to a new version --------- Co-authored-by: shreyanid <[email protected]>
1 parent d751633 commit 75eef77

File tree

9 files changed

+220
-158
lines changed

9 files changed

+220
-158
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
## 0.0.34-dev0
2+
3+
* Add table support for images with the `skip_infer_table_types` parameter
14
## 0.0.33
25

36
* Image tweak, move application entrypoint to scripts/app-start.sh

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,22 @@ To extract the table structure from PDF files using the `hi_res` strategy, ensur
111111
| jq -C . | less -R
112112
```
113113

114+
#### Skip Table Extraction
115+
116+
Currently, we provide support for enabling and disabling table extraction for file types other than PDF files. Set the `skip_infer_table_types` parameter to specify the document types for which you want to skip table extraction. By default, we skip table extraction
117+
for PDFs and images, i.e. the types `pdf`, `jpg` and `png`. Again, please note that table extraction only works with the `hi_res` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to `skip_infer_table_types` with:
118+
119+
```
120+
curl -X 'POST' \
121+
'https://api.unstructured.io/general/v0/general' \
122+
-H 'accept: application/json' \
123+
-H 'Content-Type: multipart/form-data' \
124+
-F 'files=@sample-docs/layout-parser-paper-with-table.jpg' \
125+
-F 'strategy=hi_res' \
126+
-F 'skip_infer_table_types=' \
127+
| jq -C . | less -R
128+
```
129+
114130
#### Encoding
115131

116132
You can specify the encoding to use to decode the text input. If no value is provided, utf-8 will be used.

pipeline-notebooks/pipeline-general.ipynb

Lines changed: 73 additions & 71 deletions
Large diffs are not rendered by default.

prepline_general/api/general.py

Lines changed: 59 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -198,33 +198,35 @@ def pipeline_api(
198198
file,
199199
request=None,
200200
filename="",
201-
m_strategy=[],
201+
file_content_type=None,
202+
response_type="application/json",
202203
m_coordinates=[],
203-
m_ocr_languages=[],
204-
m_include_page_breaks=[],
205204
m_encoding=[],
206-
m_xml_keep_tags=[],
207-
m_pdf_infer_table_structure=[],
208205
m_hi_res_model_name=[],
209-
file_content_type=None,
210-
response_type="application/json",
206+
m_include_page_breaks=[],
207+
m_ocr_languages=[],
208+
m_pdf_infer_table_structure=[],
209+
m_skip_infer_table_types=[],
210+
m_strategy=[],
211+
m_xml_keep_tags=[],
211212
):
212213
logger.debug(
213214
"pipeline_api input params: {}".format(
214215
json.dumps(
215216
{
216217
"request": request,
217218
"filename": filename,
218-
"m_strategy": m_strategy,
219+
"file_content_type": file_content_type,
220+
"response_type": response_type,
219221
"m_coordinates": m_coordinates,
220-
"m_ocr_languages": m_ocr_languages,
221-
"m_include_page_breaks": m_include_page_breaks,
222222
"m_encoding": m_encoding,
223-
"m_xml_keep_tags": m_xml_keep_tags,
224-
"m_pdf_infer_table_structure": m_pdf_infer_table_structure,
225223
"m_hi_res_model_name": m_hi_res_model_name,
226-
"file_content_type": file_content_type,
227-
"response_type": response_type,
224+
"m_include_page_breaks": m_include_page_breaks,
225+
"m_ocr_languages": m_ocr_languages,
226+
"m_pdf_infer_table_structure": m_pdf_infer_table_structure,
227+
"m_skip_infer_table_types": m_skip_infer_table_types,
228+
"m_strategy": m_strategy,
229+
"m_xml_keep_tags": m_xml_keep_tags,
228230
},
229231
default=str,
230232
)
@@ -290,6 +292,10 @@ def pipeline_api(
290292
else:
291293
pdf_infer_table_structure = False
292294

295+
skip_infer_table_types = (
296+
m_skip_infer_table_types if len(m_skip_infer_table_types) else ["pdf", "jpg", "png"]
297+
)
298+
293299
try:
294300
logger.debug(
295301
"partition input data: {}".format(
@@ -304,6 +310,7 @@ def pipeline_api(
304310
"encoding": encoding,
305311
"model_name": hi_res_model_name,
306312
"xml_keep_tags": xml_keep_tags,
313+
"skip_infer_table_types": skip_infer_table_types,
307314
},
308315
default=str,
309316
)
@@ -317,26 +324,31 @@ def pipeline_api(
317324
file=file,
318325
file_filename=filename,
319326
content_type=file_content_type,
320-
strategy=strategy,
321-
ocr_languages=ocr_languages,
322327
coordinates=show_coordinates,
323-
pdf_infer_table_structure=pdf_infer_table_structure,
324-
include_page_breaks=include_page_breaks,
328+
# partition_kwargs
325329
encoding=encoding,
330+
include_page_breaks=include_page_breaks,
326331
model_name=hi_res_model_name,
332+
ocr_languages=ocr_languages,
333+
pdf_infer_table_structure=pdf_infer_table_structure,
334+
skip_infer_table_types=skip_infer_table_types,
335+
strategy=strategy,
336+
xml_keep_tags=xml_keep_tags,
327337
)
328338
else:
329339
elements = partition(
330340
file=file,
331341
file_filename=filename,
332342
content_type=file_content_type,
333-
strategy=strategy,
343+
# partition_kwargs
344+
encoding=encoding,
345+
include_page_breaks=include_page_breaks,
346+
model_name=hi_res_model_name,
334347
ocr_languages=ocr_languages,
335348
pdf_infer_table_structure=pdf_infer_table_structure,
336-
include_page_breaks=include_page_breaks,
337-
encoding=encoding,
349+
skip_infer_table_types=skip_infer_table_types,
350+
strategy=strategy,
338351
xml_keep_tags=xml_keep_tags,
339-
model_name=hi_res_model_name,
340352
)
341353
except ValueError as e:
342354
if "Invalid file" in e.args[0]:
@@ -345,31 +357,23 @@ def pipeline_api(
345357
)
346358
raise e
347359

360+
# Clean up returned elements
361+
for i, element in enumerate(elements):
362+
elements[i].metadata.filename = os.path.basename(filename)
363+
364+
if not show_coordinates and element.metadata.coordinates:
365+
elements[i].metadata.coordinates = None
366+
367+
# Note(yuming): currently removing date from metadata
368+
# since it should be fixed in the core library
369+
if element.metadata.date:
370+
elements[i].metadata.date = None
371+
348372
if response_type == "text/csv":
349373
df = convert_to_dataframe(elements)
350-
df["filename"] = os.path.basename(filename)
351-
if not show_coordinates:
352-
columns_to_drop = [
353-
col
354-
for col in [
355-
"coordinates_points",
356-
"coordinates_system",
357-
"coordinates_layout_width",
358-
"coordinates_layout_height",
359-
]
360-
if col in df.columns
361-
]
362-
if columns_to_drop:
363-
df.drop(columns=columns_to_drop, inplace=True)
364-
365374
return df.to_csv(index=False)
366375

367376
result = convert_to_isd(elements)
368-
for element in result:
369-
element["metadata"]["filename"] = os.path.basename(filename)
370-
371-
if not show_coordinates and "coordinates" in element["metadata"]:
372-
del element["metadata"]["coordinates"]
373377

374378
return result
375379

@@ -479,20 +483,21 @@ def return_content_type(filename):
479483

480484

481485
@router.post("/general/v0/general")
482-
@router.post("/general/v0.0.33/general")
486+
@router.post("/general/v0.0.34/general")
483487
def pipeline_1(
484488
request: Request,
485489
gz_uncompressed_content_type: Optional[str] = Form(default=None),
486490
files: Union[List[UploadFile], None] = File(default=None),
487491
output_format: Union[str, None] = Form(default=None),
488-
strategy: List[str] = Form(default=[]),
489492
coordinates: List[str] = Form(default=[]),
490-
ocr_languages: List[str] = Form(default=[]),
491-
include_page_breaks: List[str] = Form(default=[]),
492493
encoding: List[str] = Form(default=[]),
493-
xml_keep_tags: List[str] = Form(default=[]),
494-
pdf_infer_table_structure: List[str] = Form(default=[]),
495494
hi_res_model_name: List[str] = Form(default=[]),
495+
include_page_breaks: List[str] = Form(default=[]),
496+
ocr_languages: List[str] = Form(default=[]),
497+
pdf_infer_table_structure: List[str] = Form(default=[]),
498+
skip_infer_table_types: List[str] = Form(default=[]),
499+
strategy: List[str] = Form(default=[]),
500+
xml_keep_tags: List[str] = Form(default=[]),
496501
):
497502
if files:
498503
for file_index in range(len(files)):
@@ -532,14 +537,15 @@ def response_generator(is_multipart):
532537
response = pipeline_api(
533538
_file,
534539
request=request,
535-
m_strategy=strategy,
536540
m_coordinates=coordinates,
537-
m_ocr_languages=ocr_languages,
538-
m_include_page_breaks=include_page_breaks,
539541
m_encoding=encoding,
540-
m_xml_keep_tags=xml_keep_tags,
541-
m_pdf_infer_table_structure=pdf_infer_table_structure,
542542
m_hi_res_model_name=hi_res_model_name,
543+
m_include_page_breaks=include_page_breaks,
544+
m_ocr_languages=ocr_languages,
545+
m_pdf_infer_table_structure=pdf_infer_table_structure,
546+
m_skip_infer_table_types=skip_infer_table_types,
547+
m_strategy=strategy,
548+
m_xml_keep_tags=xml_keep_tags,
543549
response_type=media_type,
544550
filename=file.filename,
545551
file_content_type=file_content_type,

preprocessing-pipeline-family.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.33
2+
version: 0.0.34

requirements/base.txt

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ et-xmlfile==1.1.0
5252
# via openpyxl
5353
exceptiongroup==1.1.2
5454
# via anyio
55-
fastapi==0.100.0
55+
fastapi==0.100.1
5656
# via unstructured-api-tools
5757
fastjsonschema==2.18.0
5858
# via nbformat
@@ -126,7 +126,7 @@ lxml==4.9.3
126126
# python-docx
127127
# python-pptx
128128
# unstructured
129-
markdown==3.4.3
129+
markdown==3.4.4
130130
# via unstructured
131131
markupsafe==2.1.3
132132
# via
@@ -146,9 +146,9 @@ mypy-extensions==1.0.0
146146
# via mypy
147147
nbclient==0.8.0
148148
# via nbconvert
149-
nbconvert==7.7.2
149+
nbconvert==7.7.3
150150
# via unstructured-api-tools
151-
nbformat==5.9.1
151+
nbformat==5.9.2
152152
# via
153153
# nbclient
154154
# nbconvert
@@ -203,7 +203,7 @@ pdfminer-six==20221105
203203
# via
204204
# pdfplumber
205205
# unstructured
206-
pdfplumber==0.10.1
206+
pdfplumber==0.10.2
207207
# via layoutparser
208208
pillow==10.0.0
209209
# via
@@ -217,7 +217,7 @@ pillow==10.0.0
217217
# unstructured
218218
pkgutil-resolve-name==1.3.10
219219
# via jsonschema
220-
platformdirs==3.9.1
220+
platformdirs==3.10.0
221221
# via jupyter-core
222222
portalocker==2.7.0
223223
# via iopath
@@ -229,19 +229,19 @@ pycparser==2.21
229229
# via cffi
230230
pycryptodome==3.18.0
231231
# via -r requirements/base.in
232-
pydantic==1.10.11
232+
pydantic==1.10.12
233233
# via
234234
# -r requirements/base.in
235235
# fastapi
236-
pyflakes==3.0.1
236+
pyflakes==3.1.0
237237
# via autoflake
238238
pygments==2.15.1
239239
# via nbconvert
240240
pypandoc==1.11
241241
# via unstructured
242242
pyparsing==3.0.9
243243
# via matplotlib
244-
pypdf==3.13.0
244+
pypdf==3.14.0
245245
# via -r requirements/base.in
246246
pypdfium2==4.18.0
247247
# via pdfplumber
@@ -365,26 +365,28 @@ types-urllib3==1.26.25.14
365365
# via types-requests
366366
typing-extensions==4.7.1
367367
# via
368+
# annotated-types
368369
# fastapi
369370
# huggingface-hub
370371
# iopath
371372
# mypy
372373
# pydantic
374+
# pydantic-core
373375
# pypdf
374376
# starlette
375377
# torch
376378
# uvicorn
377379
tzdata==2023.3
378380
# via pandas
379-
unstructured[local-inference]==0.8.1
381+
unstructured[local-inference]==0.8.7
380382
# via -r requirements/base.in
381383
unstructured-api-tools==0.10.10
382384
# via -r requirements/base.in
383-
unstructured-inference==0.5.5
385+
unstructured-inference==0.5.7
384386
# via unstructured
385387
urllib3==2.0.4
386388
# via requests
387-
uvicorn[standard]==0.23.1
389+
uvicorn[standard]==0.23.2
388390
# via unstructured-api-tools
389391
uvloop==0.17.0
390392
# via uvicorn

0 commit comments

Comments
 (0)