From 2b0f2d17a1d5373c07164826e2a6b737277aef63 Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 15 Oct 2024 13:29:11 -0500 Subject: [PATCH 1/7] fix: fix element class id 0 becomes None bug --- test_unstructured_inference/test_elements.py | 11 ++++++ .../inference/layoutelement.py | 36 ++++++++++++++++--- unstructured_inference/models/yolox.py | 2 +- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py index b99a55b1..3cf97197 100644 --- a/test_unstructured_inference/test_elements.py +++ b/test_unstructured_inference/test_elements.py @@ -421,3 +421,14 @@ def test_clean_layoutelements_for_class( elements = clean_layoutelements_for_class(elements, element_class=class_to_filter) np.testing.assert_array_equal(elements.element_coords, expected_coords) np.testing.assert_array_equal(elements.element_class_ids, expected_ids) + + +def test_layoutelements_to_list_and_back(test_layoutelements): + back = LayoutElements.from_list(test_layoutelements.as_list()) + np.testing.assert_array_equal(test_layoutelements.element_coords, back.element_coords) + np.testing.assert_array_equal(test_layoutelements.texts, back.texts) + assert all(np.isnan(back.element_probs)) + assert [ + test_layoutelements.element_class_id_map[idx] + for idx in test_layoutelements.element_class_ids + ] == [back.element_class_id_map[idx] for idx in back.element_class_ids] diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index 9341ab2d..11746fb2 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -35,12 +35,11 @@ class LayoutElements(TextRegions): element_class_id_map: dict[int, str] | None = None def __post_init__(self): - if self.element_probs is not None: - self.element_probs = self.element_probs.astype(float) element_size = self.element_coords.shape[0] for attr in 
("element_probs", "element_class_ids", "texts"): if getattr(self, attr).size == 0 and element_size: setattr(self, attr, np.array([None] * element_size)) + self.element_probs = self.element_probs.astype(float) def slice(self, indices) -> LayoutElements: """slice and return only selected indices""" @@ -85,7 +84,7 @@ def as_list(self): text=text, type=( self.element_class_id_map[class_id] - if class_id and self.element_class_id_map + if class_id is not None and self.element_class_id_map else None ), prob=prob, @@ -99,6 +98,35 @@ def as_list(self): ) ] + @classmethod + def from_list(cls, elements: list[LayoutElement]): + """create LayoutElements from a list of LayoutElement objects; the objects must have the same + source""" + len_ele = len(elements) + coords = np.empty((len_ele, 4), dtype=float) + # text and probs can be Nones so use lists first then convert into array to avoid them being + # filled as nan + texts = [] + class_probs = [] + class_types = np.empty((len_ele,), dtype="object") + + for i, element in enumerate(elements): + coords[i] = [element.bbox.x1, element.bbox.y1, element.bbox.x2, element.bbox.y2] + texts.append(element.text) + class_probs.append(element.prob) + class_types[i] = element.type + + unique_ids, class_ids = np.unique(class_types, return_inverse=True) + + return cls( + element_coords=coords, + texts=np.array(texts), + element_probs=np.array(class_probs), + element_class_ids=class_ids, + element_class_id_map=dict(zip(range(len(unique_ids)), unique_ids)), + source=elements[0].source, + ) + @dataclass class LayoutElement(TextRegion): @@ -315,7 +343,7 @@ def partition_groups_from_regions(regions: TextRegions) -> List[TextRegions]: regions, each list corresponding with a group""" if len(regions) == 0: return [] - padded_coords = regions.element_coords.copy() + padded_coords = regions.element_coords.copy().astype(float) v_pad = (regions.y2 - regions.y1) * inference_config.ELEMENTS_V_PADDING_COEF h_pad = (regions.x2 - regions.x1) * 
inference_config.ELEMENTS_H_PADDING_COEF padded_coords[:, 0] -= h_pad diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py index 031ac2b2..8e57843d 100644 --- a/unstructured_inference/models/yolox.py +++ b/unstructured_inference/models/yolox.py @@ -136,7 +136,7 @@ def image_processing( sorted_dets = dets[order] return LayoutElements( - element_coords=sorted_dets[:, :4], + element_coords=sorted_dets[:, :4].astype(float), element_probs=sorted_dets[:, 4].astype(float), element_class_ids=sorted_dets[:, 5].astype(int), element_class_id_map=self.layout_classes, From 74bb42b8414101d63c758617c5f8934f0fa7382e Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 15 Oct 2024 13:42:25 -0500 Subject: [PATCH 2/7] update changelog and bump version --- CHANGELOG.md | 5 +++++ unstructured_inference/__version__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2526031e..38ea33db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.7.41 + +* fix: fix incorrect type casting with higher versions of `numpy` when subtracting a `float` from an `int` array +* fix: fix a bug where class id 0 becomes class type `None` when calling `LayoutElements.as_list()` + ## 0.7.40 * fix: store probabilities with `float` data type instead of `int` diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 53ea3558..bb85a995 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.40" # pragma: no cover +__version__ = "0.7.41" # pragma: no cover From e97dd4a4918bf4a6bb6e90bb57d4e56df2b9ec35 Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 15 Oct 2024 13:43:31 -0500 Subject: [PATCH 3/7] fix line length --- unstructured_inference/inference/layoutelement.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unstructured_inference/inference/layoutelement.py
b/unstructured_inference/inference/layoutelement.py index 11746fb2..2a2c3c00 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -100,8 +100,8 @@ def as_list(self): @classmethod def from_list(cls, elements: list[LayoutElement]): - """create LayoutElements from a list of LayoutElement objects; the objects must have the same - source""" + """create LayoutElements from a list of LayoutElement objects; the objects must have the + same source""" len_ele = len(elements) coords = np.empty((len_ele, 4), dtype=float) # text and probs can be Nones so use lists first then convert into array to avoid them being From df1c82f8d283d6fd10063ebc2951bd0dab784c51 Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 15 Oct 2024 16:18:59 -0500 Subject: [PATCH 4/7] fix: lint --- unstructured_inference/inference/elements.py | 2 +- unstructured_inference/inference/layoutelement.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 8e6596af..939ea0cc 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -237,7 +237,7 @@ def as_list(self): ] @classmethod - def from_list(cls, regions: list[TextRegion]): + def from_list(cls, regions: list): """create TextRegions from a list of TextRegion objects; the objects must have the same source""" coords, texts = [], [] diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index 2a2c3c00..b07749b8 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -99,7 +99,7 @@ def as_list(self): ] @classmethod - def from_list(cls, elements: list[LayoutElement]): + def from_list(cls, elements: list): """create LayoutElements from a list of LayoutElement objects; the objects must have the same source""" 
len_ele = len(elements) From 933dfd4bb1364c400776dff85749809b145c347e Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 15 Oct 2024 17:48:52 -0500 Subject: [PATCH 5/7] fix: more type dealing with none vs nan --- .../inference/layoutelement.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index b07749b8..436d3539 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -39,8 +39,24 @@ def __post_init__(self): for attr in ("element_probs", "element_class_ids", "texts"): if getattr(self, attr).size == 0 and element_size: setattr(self, attr, np.array([None] * element_size)) + self.element_probs = self.element_probs.astype(float) + def __eq__(self, other: LayoutElements) -> bool: + mask = ~np.isnan(self.element_probs) + other_mask = ~np.isnan(other.element_probs) + return ( + np.array_equal(self.element_coords, other.element_coords) + and np.array_equal(self.texts, other.texts) + and np.array_equal(mask, other_mask) + and np.array_equal(self.element_probs[mask], other.element_probs[mask]) + and ( + [self.element_class_id_map[idx] for idx in self.element_class_ids] + == [other.element_class_id_map[idx] for idx in other.element_class_ids] + ) + and self.source == other.source + ) + def slice(self, indices) -> LayoutElements: """slice and return only selected indices""" return LayoutElements( @@ -87,7 +103,7 @@ def as_list(self): if class_id is not None and self.element_class_id_map else None ), - prob=prob, + prob=None if np.isnan(prob) else prob, source=self.source, ) for (x1, y1, x2, y2), text, prob, class_id in zip( @@ -114,9 +130,10 @@ def from_list(cls, elements: list): coords[i] = [element.bbox.x1, element.bbox.y1, element.bbox.x2, element.bbox.y2] texts.append(element.text) class_probs.append(element.prob) - class_types[i] = element.type + class_types[i] = 
element.type or "None" unique_ids, class_ids = np.unique(class_types, return_inverse=True) + unique_ids[unique_ids == "None"] = None return cls( element_coords=coords, From d7527554677baa48ed7ae5721147ddbba0a2deb6 Mon Sep 17 00:00:00 2001 From: Yao You Date: Wed, 16 Oct 2024 09:18:23 -0500 Subject: [PATCH 6/7] fix lint - fix eq override - use default factory to ensure element id mapping is always dict --- unstructured_inference/inference/layoutelement.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index 436d3539..1d20d498 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -32,7 +32,7 @@ class LayoutElements(TextRegions): element_probs: np.ndarray = field(default_factory=lambda: np.array([])) element_class_ids: np.ndarray = field(default_factory=lambda: np.array([])) - element_class_id_map: dict[int, str] | None = None + element_class_id_map: dict[int, str] = field(default_factory=dict) def __post_init__(self): element_size = self.element_coords.shape[0] @@ -42,7 +42,10 @@ def __post_init__(self): self.element_probs = self.element_probs.astype(float) - def __eq__(self, other: LayoutElements) -> bool: + def __eq__(self, other: object) -> bool: + if not isinstance(other, LayoutElements): + return NotImplemented + mask = ~np.isnan(self.element_probs) other_mask = ~np.isnan(other.element_probs) return ( From 9a2706630e2252459a4f56eceea19be698e7b6a2 Mon Sep 17 00:00:00 2001 From: Yao You Date: Wed, 16 Oct 2024 10:55:19 -0500 Subject: [PATCH 7/7] test: add test to different coord data types --- test_unstructured_inference/test_elements.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py index 3cf97197..6627e205 100644 --- 
a/test_unstructured_inference/test_elements.py +++ b/test_unstructured_inference/test_elements.py @@ -143,8 +143,10 @@ def test_minimal_containing_rect(): assert rect2.is_in(big_rect) -def test_partition_groups_from_regions(mock_embedded_text_regions): +@pytest.mark.parametrize("coord_type", [int, float]) +def test_partition_groups_from_regions(mock_embedded_text_regions, coord_type): words = TextRegions.from_list(mock_embedded_text_regions) + words.element_coords = words.element_coords.astype(coord_type) groups = partition_groups_from_regions(words) assert len(groups) == 1 text = "".join(groups[-1].texts)