Skip to content

Commit 5e11501

Browse files
authored
Add optional param for model name when partitioning pdfs (#890)
* Add optional param for model name when partitioning pdfs
* Pull in latest inference changes
* Fix linting
1 parent 47bc400 commit 5e11501

File tree

7 files changed

+42
-7
lines changed

7 files changed

+42
-7
lines changed

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1-
## 0.8.0-dev2
1+
## 0.8.0
22

33
### Enhancements
44

5+
* Allow the model used for the hi_res PDF partition strategy to be chosen at call time via the `model_name` parameter.
6+
* Updated inference package
7+
58
### Features
69

710
* Add metadata_filename parameter across all partition functions

requirements/local-inference.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
-c constraints.in
22
-c base.txt
3-
unstructured-inference==0.5.4
3+
unstructured-inference==0.5.5

requirements/local-inference.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ typing-extensions==4.7.0
211211
# huggingface-hub
212212
# iopath
213213
# torch
214-
unstructured-inference==0.5.4
214+
unstructured-inference==0.5.5
215215
# via -r requirements/local-inference.in
216216
urllib3==1.26.16
217217
# via

test_unstructured/partition/test_pdf.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def test_partition_pdf_with_spooled_file(
129129

130130

131131
@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
132-
def test_partition_pdf_with_model_name(
132+
def test_partition_pdf_with_model_name_env_var(
133133
monkeypatch,
134134
filename="example-docs/layout-parser-paper-fast.pdf",
135135
):
@@ -152,6 +152,26 @@ def test_partition_pdf_with_model_name(
152152
)
153153

154154

155+
def test_partition_pdf_with_model_name(
156+
monkeypatch,
157+
filename="example-docs/layout-parser-paper-fast.pdf",
158+
):
159+
monkeypatch.setattr(
160+
strategies,
161+
"is_pdf_text_extractable",
162+
lambda *args, **kwargs: True,
163+
)
164+
with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
165+
pdf.partition_pdf(filename=filename, strategy="hi_res", model_name="checkbox")
166+
mock_process.assert_called_once_with(
167+
filename,
168+
is_image=False,
169+
ocr_languages="eng",
170+
extract_tables=False,
171+
model_name="checkbox",
172+
)
173+
174+
155175
def test_partition_pdf_with_auto_strategy(
156176
filename="example-docs/layout-parser-paper-fast.pdf",
157177
):

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.8.0-dev2" # pragma: no cover
1+
__version__ = "0.8.0" # pragma: no cover

unstructured/ingest/doc_processor/generalized.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Process aribritrary files with the Unstructured library"""
1+
"""Process arbitrary files with the Unstructured library"""
22

33
import os
44
from typing import Any, Dict, List, Optional
@@ -12,6 +12,13 @@
1212
def initialize():
1313
"""Download default model or model specified by UNSTRUCTURED_HI_RES_MODEL_NAME environment
1414
variable (avoids subprocesses all doing the same)"""
15+
16+
# If several models are supported and the choice is left to the user, pre-download each model named in the comma-separated UNSTRUCTURED_HI_RES_SUPPORTED_MODEL variable
17+
supported_model = os.environ.get("UNSTRUCTURED_HI_RES_SUPPORTED_MODEL", "")
18+
if supported_model:
19+
for model_name in supported_model.split(","):
20+
get_model(model_name=model_name)
21+
1522
get_model(os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME"))
1623

1724

unstructured/partition/pdf.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ def partition_pdf(
9090
infer_table_structure=infer_table_structure,
9191
ocr_languages=ocr_languages,
9292
max_partition=max_partition,
93+
**kwargs,
9394
)
9495

9596

@@ -102,6 +103,7 @@ def partition_pdf_or_image(
102103
infer_table_structure: bool = False,
103104
ocr_languages: str = "eng",
104105
max_partition: Optional[int] = 1500,
106+
**kwargs,
105107
) -> List[Element]:
106108
"""Parses a pdf or image document into a list of interpreted elements."""
107109
# TODO(alan): Extract information about the filetype to be processed from the template
@@ -128,6 +130,7 @@ def partition_pdf_or_image(
128130
infer_table_structure=infer_table_structure,
129131
include_page_breaks=include_page_breaks,
130132
ocr_languages=ocr_languages,
133+
**kwargs,
131134
)
132135

133136
elif strategy == "fast":
@@ -160,6 +163,8 @@ def _partition_pdf_or_image_local(
160163
infer_table_structure: bool = False,
161164
include_page_breaks: bool = False,
162165
ocr_languages: str = "eng",
166+
model_name: Optional[str] = None,
167+
**kwargs,
163168
) -> List[Element]:
164169
"""Partition using package installed locally."""
165170
try:
@@ -182,7 +187,7 @@ def _partition_pdf_or_image_local(
182187
"running make install-local-inference from the root directory of the repository.",
183188
) from e
184189

185-
model_name = os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")
190+
model_name = model_name if model_name else os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")
186191
if file is None:
187192
layout = process_file_with_model(
188193
filename,

0 commit comments

Comments
 (0)