Skip to content

Commit ff96a97

Browse files
authored
fix: store page no (#118)
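Fixes page numbers not being stored correctly in PageLayout objects: PageLayout.from_image previously hard-coded number=0 for every page; DocumentLayout.from_file now passes a 1-based page number for each page.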
1 parent 4dcbd9b commit ff96a97

File tree

4 files changed

+16
-4
lines changed

4 files changed

+16
-4
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
## 0.5.1-dev0
1+
## 0.5.1-dev1
22

3+
* Store page numbers when processing PDFs
34
* Hotfix to handle inference of blank pages using ONNX detectron2
45

56
## 0.5.0

test_unstructured_inference/inference/test_layout.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,15 @@ def test_get_elements_from_layout(mock_page_layout, idx):
308308
assert elements[0].text == block.text
309309

310310

311+
def test_page_numbers_in_page_objects():
312+
with patch(
313+
"unstructured_inference.inference.layout.PageLayout.get_elements_with_model"
314+
) as mock_get_elements:
315+
doc = layout.DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf")
316+
mock_get_elements.assert_called()
317+
assert [page.number for page in doc.pages] == list(range(1, len(doc.pages) + 1))
318+
319+
311320
@pytest.mark.parametrize(
312321
"fixed_layouts, called_method, not_called_method",
313322
[
unstructured_inference/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.1-dev0" # pragma: no cover
1+
__version__ = "0.5.1-dev1" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,12 @@ def from_file(
7676
pages: List[PageLayout] = list()
7777
if fixed_layouts is None:
7878
fixed_layouts = [None for _ in layouts]
79-
for image, layout, fixed_layout in zip(images, layouts, fixed_layouts):
79+
for i, (image, layout, fixed_layout) in enumerate(zip(images, layouts, fixed_layouts)):
8080
# NOTE(robinson) - In the future, maybe we detect the page number and default
8181
# to the index if it is not detected
8282
page = PageLayout.from_image(
8383
image,
84+
number=i + 1,
8485
model=model,
8586
layout=layout,
8687
ocr_strategy=ocr_strategy,
@@ -195,6 +196,7 @@ def _get_image_array(self) -> Union[np.ndarray, None]:
195196
def from_image(
196197
cls,
197198
image,
199+
number=1,
198200
model: Optional[UnstructuredModel] = None,
199201
layout: Optional[List[TextRegion]] = None,
200202
ocr_strategy: str = "auto",
@@ -204,7 +206,7 @@ def from_image(
204206
):
205207
"""Creates a PageLayout from an already-loaded PIL Image."""
206208
page = cls(
207-
number=0,
209+
number=number,
208210
image=image,
209211
layout=layout,
210212
model=model,

0 commit comments

Comments
 (0)