From 4a53674a27af1a69034e3a0310e5a1a0cb532b62 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Tue, 3 Jun 2025 14:40:47 -0500 Subject: [PATCH 1/6] Test slicing works like numpy --- test_unstructured_inference/test_elements.py | 24 ++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py index 89efd870..3f5d6336 100644 --- a/test_unstructured_inference/test_elements.py +++ b/test_unstructured_inference/test_elements.py @@ -472,3 +472,27 @@ def test_layoutelements_concatenate(): assert joint.sources.tolist() == ["yolox", "yolox", "ocr", "ocr"] assert joint.element_class_ids.tolist() == [0, 1, 1, 2] assert joint.element_class_id_map == {0: "type0", 1: "type1", 2: "type2"} + + +def test_textregions_support_numpy_slicing(): + trs = TextRegions( + element_coords=np.array( + [ + [0.0, 0.0, 1.0, 1.0], + [1.0, 0.0, 1.5, 1.0], + [2.0, 0.0, 2.5, 1.0], + [3.0, 0.0, 4.0, 1.0], + [4.0, 0.0, 5.0, 1.0], + ] + ), + texts=np.array(["0", "1", "2", "3", "4"]), + sources=np.array(["foo", "foo", "foo", "foo", "foo"], dtype=" Date: Tue, 3 Jun 2025 14:56:37 -0500 Subject: [PATCH 2/6] add tests for slicing behavior --- test_unstructured_inference/test_elements.py | 60 +++++++++++++------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py index 3f5d6336..81db5f2f 100644 --- a/test_unstructured_inference/test_elements.py +++ b/test_unstructured_inference/test_elements.py @@ -474,25 +474,47 @@ def test_layoutelements_concatenate(): assert joint.element_class_id_map == {0: "type0", 1: "type1", 2: "type2"} -def test_textregions_support_numpy_slicing(): - trs = TextRegions( - element_coords=np.array( - [ - [0.0, 0.0, 1.0, 1.0], - [1.0, 0.0, 1.5, 1.0], - [2.0, 0.0, 2.5, 1.0], - [3.0, 0.0, 4.0, 1.0], - [4.0, 0.0, 5.0, 1.0], - ] +@pytest.mark.parametrize( + "test_elements", + [ + TextRegions( + element_coords=np.array( + [ + [0.0, 0.0, 1.0, 1.0], + [1.0, 0.0, 1.5, 1.0], + [2.0, 0.0, 2.5, 1.0], + [3.0, 0.0, 4.0, 1.0], + [4.0, 0.0, 5.0, 1.0], + ] + ), + texts=np.array(["0", "1", "2", "3", "4"]), + sources=np.array(["foo", "foo", "foo", "foo", "foo"], dtype=" Date: Tue, 3 Jun 2025 14:57:19 -0500 Subject: [PATCH 3/6] update textregion --- unstructured_inference/inference/elements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 0c9b1a18..e77bfba8 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -226,7 +226,7 @@ def __post_init__(self): # we convert to float so data type is more consistent (e.g., None will be np.nan) self.element_coords = self.element_coords.astype(float) - def slice(self, indices) -> TextRegions: + def __getitem__(self, indices) -> TextRegions: """slice text regions based on indices""" return TextRegions( element_coords=self.element_coords[indices], From c2910ef6ee0833db507566b4d383212c680b6c17 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Tue, 3 Jun 2025 14:57:33 -0500 Subject: [PATCH 4/6] update layoutelement --- unstructured_inference/inference/layoutelement.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index b70ad651..37db5f23 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -75,7 +75,7 @@ def __eq__(self, other: object) -> bool: and np.array_equal(self.table_as_cells[mask], other.table_as_cells[mask]) ) - def slice(self, indices) -> LayoutElements: + def __getitem__(self, indices) -> LayoutElements: """slice and return only selected indices""" return LayoutElements( element_coords=self.element_coords[indices], From bde1fdad3d286d9b86a59c9274b1dfb65aa8dc6f Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Tue, 3 Jun 2025 15:07:17 -0500 Subject: [PATCH 5/6] Preserve backward compat --- unstructured_inference/inference/elements.py | 3 +++ unstructured_inference/inference/layoutelement.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index e77bfba8..81647ced 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -227,6 +227,9 @@ def __post_init__(self): self.element_coords = self.element_coords.astype(float) def __getitem__(self, indices) -> TextRegions: + return self.slice(indices) + + def slice(self, indices) -> TextRegions: """slice text regions based on indices""" return TextRegions( element_coords=self.element_coords[indices], diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index 37db5f23..5b4c6fda 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -75,7 +75,10 @@ def __eq__(self, other: object) -> bool: and np.array_equal(self.table_as_cells[mask], other.table_as_cells[mask]) ) - def __getitem__(self, indices) -> LayoutElements: + def __getitem__(self, indices): + return self.slice(indices) + + def slice(self, indices) -> LayoutElements: """slice and return only selected indices""" return LayoutElements( element_coords=self.element_coords[indices], From 1f6ee495e7edcbc97cf20e86c7cbedecb26ae4fa Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Tue, 3 Jun 2025 15:13:46 -0500 Subject: [PATCH 6/6] bump changelog --- CHANGELOG.md | 4 ++++ unstructured_inference/__version__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b11d8ec8..262ef73b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 1.0.4 + +* Add slicing through indexing for vectorized elements + ## 1.0.3 * setting longest_edge=1333 to the table image processor diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 6520c47e..18934c58 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "1.0.3" # pragma: no cover +__version__ = "1.0.4" # pragma: no cover