Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
## 0.8.8-dev0
## 0.8.8-dev1

* fix: pdfminer-six dependencies
* feat: `PageLayout.elements` is now a `cached_property` to reduce unnecessary memory and CPU costs

## 0.8.7

Expand Down
2 changes: 1 addition & 1 deletion test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def test_get_page_elements(monkeypatch, mock_final_layout):
)
elements = page.get_elements_with_detection_model(inplace=False)
page.get_elements_with_detection_model(inplace=True)
assert elements == page.elements
assert elements == page.elements_array


class MockPool:
Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.8-dev0" # pragma: no cover
__version__ = "0.8.8-dev1" # pragma: no cover
19 changes: 12 additions & 7 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
import tempfile
from functools import cached_property
from pathlib import PurePath
from typing import Any, BinaryIO, Collection, List, Optional, Union, cast

Expand Down Expand Up @@ -149,7 +150,6 @@ def __init__(
self.number = number
self.detection_model = detection_model
self.element_extraction_model = element_extraction_model
self.elements: Collection[LayoutElement] = []
self.elements_array: LayoutElements | None = None
self.password = password
# NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has
Expand All @@ -159,10 +159,18 @@ def __init__(
def __str__(self) -> str:
    """Render the page as text: each element's string form, separated by a blank line."""
    rendered = map(str, self.elements)
    return "\n\n".join(rendered)

@cached_property
def elements(self) -> Collection[LayoutElement]:
    """Backward-compatible list view of the page's layout elements.

    Converts ``self.elements_array`` to a list via its ``as_list()`` method, or yields an
    empty list when no array has been set. Because this is a ``cached_property``, the value
    is computed on first access and reused afterwards; a later reassignment of
    ``elements_array`` is not reflected unless the cached value is deleted first.
    """
    array = self.elements_array
    return [] if array is None else array.as_list()

def get_elements_using_image_extraction(
self,
inplace=True,
) -> Optional[List[LayoutElement]]:
) -> Optional[list[LayoutElement]]:
"""Uses end-to-end text element extraction model to extract the elements on the page."""
if self.element_extraction_model is None:
raise ValueError(
Expand All @@ -178,8 +186,7 @@ def get_elements_using_image_extraction(
def get_elements_with_detection_model(
self,
inplace: bool = True,
array_only: bool = False,
) -> Optional[List[LayoutElement]]:
) -> Optional[LayoutElements]:
"""Uses specified model to detect the elements on the page."""
if self.detection_model is None:
model = get_model()
Expand All @@ -198,11 +205,9 @@ def get_elements_with_detection_model(

if inplace:
self.elements_array = inferred_layout
if not array_only:
self.elements = inferred_layout.as_list()
return None

return inferred_layout.as_list()
return inferred_layout

def _get_image_array(self) -> Union[np.ndarray[Any, Any], None]:
"""Converts the raw image into a numpy array."""
Expand Down