Skip to content

Commit 66590e3

Browse files
authored
enhancement: expose dpi (#151)
Exposed the PDF-to-image conversion DPI parameter to higher-level functions so that it can be set externally.
1 parent ffed03b commit 66590e3

File tree

4 files changed

+17
-3
lines changed

4 files changed

+17
-3
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
## 0.5.6-dev0
1+
## 0.5.6-dev1
22

33
* Warns users that Chipper is a beta model.
4+
* Exposed control over dpi when converting PDF to an image.
45

56
## 0.5.5
67

test_unstructured_inference/inference/test_layout.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -739,4 +739,12 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
739739
ocr_languages="eng",
740740
fixed_layouts=None,
741741
extract_tables=False,
742+
pdf_image_dpi=200,
742743
)
744+
745+
746+
@pytest.mark.parametrize(("pdf_image_dpi", "expected"), [(200, 2200), (100, 1100)])
747+
def test_exposed_pdf_image_dpi(pdf_image_dpi, expected, monkeypatch):
748+
with patch.object(layout.PageLayout, "from_image") as mock_from_image:
749+
layout.DocumentLayout.from_file("sample-docs/loremipsum.pdf", pdf_image_dpi=pdf_image_dpi)
750+
assert mock_from_image.call_args[0][0].height == expected
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.6-dev0" # pragma: no cover
1+
__version__ = "0.5.6-dev1" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,11 @@ def from_file(
7777
ocr_strategy: str = "auto",
7878
ocr_languages: str = "eng",
7979
extract_tables: bool = False,
80+
pdf_image_dpi: int = 200,
8081
) -> DocumentLayout:
8182
"""Creates a DocumentLayout from a pdf file."""
8283
logger.info(f"Reading PDF for file: {filename} ...")
83-
layouts, images = load_pdf(filename)
84+
layouts, images = load_pdf(filename, pdf_image_dpi)
8485
if len(layouts) > len(images):
8586
raise RuntimeError(
8687
"Some images were not loaded. Check that poppler is installed and in your $PATH.",
@@ -297,6 +298,7 @@ def process_data_with_model(
297298
ocr_languages: str = "eng",
298299
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
299300
extract_tables: bool = False,
301+
pdf_image_dpi: int = 200,
300302
) -> DocumentLayout:
301303
"""Processes pdf file in the form of a file handler (supporting a read method) into a
302304
DocumentLayout by using a model identified by model_name."""
@@ -310,6 +312,7 @@ def process_data_with_model(
310312
ocr_languages=ocr_languages,
311313
fixed_layouts=fixed_layouts,
312314
extract_tables=extract_tables,
315+
pdf_image_dpi=pdf_image_dpi,
313316
)
314317

315318
return layout
@@ -323,6 +326,7 @@ def process_file_with_model(
323326
ocr_languages: str = "eng",
324327
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
325328
extract_tables: bool = False,
329+
pdf_image_dpi: int = 200,
326330
) -> DocumentLayout:
327331
"""Processes pdf file with name filename into a DocumentLayout by using a model identified by
328332
model_name."""
@@ -353,6 +357,7 @@ def process_file_with_model(
353357
ocr_languages=ocr_languages,
354358
fixed_layouts=fixed_layouts,
355359
extract_tables=extract_tables,
360+
pdf_image_dpi=pdf_image_dpi,
356361
)
357362
)
358363
return layout

0 commit comments

Comments
 (0)