diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d510a7e..846085b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ -## 0.8.8-dev0 +## 0.8.8-dev1 * fix: pdfminer-six dependencies +* feat: `PageLayout.elements` is now a `cached_property` to reduce unnecessary memory and cpu costs ## 0.8.7 diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index be0e8769..192bf8ed 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -116,7 +116,7 @@ def test_get_page_elements(monkeypatch, mock_final_layout): ) elements = page.get_elements_with_detection_model(inplace=False) page.get_elements_with_detection_model(inplace=True) - assert elements == page.elements + assert elements == page.elements_array class MockPool: diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 584832f5..0929fcec 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.8.8-dev0" # pragma: no cover +__version__ = "0.8.8-dev1" # pragma: no cover diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 9b6897d3..57e17a08 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -2,6 +2,7 @@ import os import tempfile +from functools import cached_property from pathlib import PurePath from typing import Any, BinaryIO, Collection, List, Optional, Union, cast @@ -149,7 +150,6 @@ def __init__( self.number = number self.detection_model = detection_model self.element_extraction_model = element_extraction_model - self.elements: Collection[LayoutElement] = [] self.elements_array: LayoutElements | None = None self.password = password # NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has @@ -159,10 +159,18 @@ def __init__( def 
__str__(self) -> str: return "\n\n".join([str(element) for element in self.elements]) + @cached_property + def elements(self) -> Collection[LayoutElement]: + """return a list of layout elements from the array data structure; intended for backward + compatibility""" + if self.elements_array is None: + return [] + return self.elements_array.as_list() + def get_elements_using_image_extraction( self, inplace=True, - ) -> Optional[List[LayoutElement]]: + ) -> Optional[list[LayoutElement]]: """Uses end-to-end text element extraction model to extract the elements on the page.""" if self.element_extraction_model is None: raise ValueError( @@ -178,8 +186,7 @@ def get_elements_using_image_extraction( def get_elements_with_detection_model( self, inplace: bool = True, - array_only: bool = False, - ) -> Optional[List[LayoutElement]]: + ) -> Optional[LayoutElements]: """Uses specified model to detect the elements on the page.""" if self.detection_model is None: model = get_model() @@ -198,11 +205,9 @@ def get_elements_with_detection_model( if inplace: self.elements_array = inferred_layout - if not array_only: - self.elements = inferred_layout.as_list() return None - return inferred_layout.as_list() + return inferred_layout def _get_image_array(self) -> Union[np.ndarray[Any, Any], None]: """Converts the raw image into a numpy array."""