Feat/add more attributes to layoutelements (#404)

badGarnet · web-flow · commit 4d0c20aab181 · 2025-01-20T18:29:41.000-06:00
* feat: add `text_as_html` and `table_as_cells` to `LayoutElements`
class as new attributes
* feat: replace the single-valued `source` attribute from `TextRegions`
and `LayoutElements` with an array attribute `sources`
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.8.4
+
+* feat: add `text_as_html` and `table_as_cells` to `LayoutElements` class as new attributes
+* feat: replace the single-valued `source` attribute from `TextRegions` and `LayoutElements` with an array attribute `sources`
+
 ## 0.8.3
 
 * fix: removed `layoutelement.from_lp_textblock()` and related tests as it's not used
diff --git a/test_unstructured_inference/models/test_yolox.py b/test_unstructured_inference/models/test_yolox.py
@@ -32,8 +32,6 @@ def test_layout_yolox_local_parsing_image():
 def test_layout_yolox_local_parsing_pdf():
     filename = os.path.join("sample-docs", "loremipsum.pdf")
     document_layout = process_file_with_model(filename, model_name="yolox")
-    content = str(document_layout)
-    assert "libero fringilla" in content
     assert len(document_layout.pages) == 1
     # NOTE(benjamin) The example sent to the test contains 5 text detections
     text_elements = [e for e in document_layout.pages[0].elements if e.type == "Text"]
diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py
@@ -61,7 +61,7 @@ def test_layoutelements():
         element_coords=coords,
         element_class_ids=element_class_ids,
         element_class_id_map=class_map,
-        source="yolox",
+        sources=np.array(["yolox"] * len(element_class_ids)),
     )
 
 
@@ -440,32 +440,33 @@ def test_layoutelements_to_list_and_back(test_layoutelements):
 
 def test_layoutelements_from_list_no_elements():
     back = LayoutElements.from_list(elements=[])
-    assert back.source is None
+    assert back.sources.size == 0
     assert back.element_coords.size == 0
 
 
 def test_textregions_from_list_no_elements():
     back = TextRegions.from_list(regions=[])
-    assert back.source is None
+    assert back.sources.size == 0
     assert back.element_coords.size == 0
 
 
 def test_layoutelements_concatenate():
     layout1 = LayoutElements(
         element_coords=np.array([[0, 0, 1, 1], [1, 1, 2, 2]]),
         texts=np.array(["a", "two"]),
-        source=None,
+        sources=np.array(["yolox", "yolox"]),
         element_class_ids=np.array([0, 1]),
         element_class_id_map={0: "type0", 1: "type1"},
     )
     layout2 = LayoutElements(
         element_coords=np.array([[10, 10, 2, 2], [20, 20, 1, 1]]),
         texts=np.array(["three", "4"]),
-        source=None,
+        sources=np.array(["ocr", "ocr"]),
         element_class_ids=np.array([0, 1]),
         element_class_id_map={0: "type1", 1: "type2"},
     )
     joint = LayoutElements.concatenate([layout1, layout2])
     assert joint.texts.tolist() == ["a", "two", "three", "4"]
+    assert joint.sources.tolist() == ["yolox", "yolox", "ocr", "ocr"]
     assert joint.element_class_ids.tolist() == [0, 1, 1, 2]
     assert joint.element_class_id_map == {0: "type0", 1: "type1", 2: "type2"}
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.8.3"  # pragma: no cover
+__version__ = "0.8.4"  # pragma: no cover
diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py
@@ -210,7 +210,7 @@ def from_coords(
 class TextRegions:
     element_coords: np.ndarray
     texts: np.ndarray = field(default_factory=lambda: np.array([]))
-    source: Source | None = None
+    sources: np.ndarray = field(default_factory=lambda: np.array([]))
 
     def __post_init__(self):
         if self.texts.size == 0 and self.element_coords.size > 0:
@@ -221,31 +221,37 @@ def slice(self, indices) -> TextRegions:
         return TextRegions(
             element_coords=self.element_coords[indices],
             texts=self.texts[indices],
-            source=self.source,
+            sources=self.sources[indices],
         )
 
+    def iter_elements(self):
+        """iter text regions as one TextRegion per iteration; this returns a generator and has less
+        memory impact than the as_list method"""
+        for (x1, y1, x2, y2), text, source in zip(
+            self.element_coords,
+            self.texts,
+            self.sources,
+        ):
+            yield TextRegion.from_coords(x1, y1, x2, y2, text, source)
+
     def as_list(self):
-        """return a list of TextRegion objects representing the data"""
-        if self.texts is None:
-            return [
-                TextRegion.from_coords(x1, y1, x2, y2, None, self.source)
-                for (x1, y1, x2, y2) in self.element_coords
-            ]
-        return [
-            TextRegion.from_coords(x1, y1, x2, y2, text, self.source)
-            for (x1, y1, x2, y2), text in zip(self.element_coords, self.texts)
-        ]
+        """return a list of TextRegion objects for backward compatibility"""
+        return list(self.iter_elements())
 
     @classmethod
     def from_list(cls, regions: list):
         """create TextRegions from a list of TextRegion objects; the objects must have the same
         source"""
-        coords, texts = [], []
+        coords, texts, sources = [], [], []
         for region in regions:
             coords.append((region.bbox.x1, region.bbox.y1, region.bbox.x2, region.bbox.y2))
             texts.append(region.text)
-        source = regions[0].source if regions else None
-        return cls(element_coords=np.array(coords), texts=np.array(texts), source=source)
+            sources.append(region.source)
+        return cls(
+            element_coords=np.array(coords),
+            texts=np.array(texts),
+            sources=np.array(sources),
+        )
 
     def __len__(self):
         return self.element_coords.shape[0]
diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py
@@ -30,10 +30,21 @@ class LayoutElements(TextRegions):
     element_probs: np.ndarray = field(default_factory=lambda: np.array([]))
     element_class_ids: np.ndarray = field(default_factory=lambda: np.array([]))
     element_class_id_map: dict[int, str] = field(default_factory=dict)
+    text_as_html: np.ndarray = field(default_factory=lambda: np.array([]))
+    table_as_cells: np.ndarray = field(default_factory=lambda: np.array([]))
 
     def __post_init__(self):
         element_size = self.element_coords.shape[0]
-        for attr in ("element_probs", "element_class_ids", "texts"):
+        # NOTE: maybe we should create an attribute _optional_attributes: list[str] to store this
+        # list
+        for attr in (
+            "element_probs",
+            "element_class_ids",
+            "texts",
+            "sources",
+            "text_as_html",
+            "table_as_cells",
+        ):
             if getattr(self, attr).size == 0 and element_size:
                 setattr(self, attr, np.array([None] * element_size))
 
@@ -54,31 +65,37 @@ def __eq__(self, other: object) -> bool:
                 [self.element_class_id_map[idx] for idx in self.element_class_ids]
                 == [other.element_class_id_map[idx] for idx in other.element_class_ids]
             )
-            and self.source == other.source
+            and np.array_equal(self.sources[mask], other.sources[mask])
+            and np.array_equal(self.text_as_html[mask], other.text_as_html[mask])
+            and np.array_equal(self.table_as_cells[mask], other.table_as_cells[mask])
         )
 
     def slice(self, indices) -> LayoutElements:
         """slice and return only selected indices"""
         return LayoutElements(
             element_coords=self.element_coords[indices],
             texts=self.texts[indices],
-            source=self.source,
+            sources=self.sources[indices],
             element_probs=self.element_probs[indices],
             element_class_ids=self.element_class_ids[indices],
             element_class_id_map=self.element_class_id_map,
+            text_as_html=self.text_as_html[indices],
+            table_as_cells=self.table_as_cells[indices],
         )
 
     @classmethod
     def concatenate(cls, groups: Iterable[LayoutElements]) -> LayoutElements:
         """concatenate a sequence of LayoutElements in order as one LayoutElements"""
         coords, texts, probs, class_ids, sources = [], [], [], [], []
+        text_as_html, table_as_cells = [], []
         class_id_reverse_map: dict[str, int] = {}
         for group in groups:
             coords.append(group.element_coords)
             texts.append(group.texts)
             probs.append(group.element_probs)
-            if group.source:
-                sources.append(group.source)
+            sources.append(group.sources)
+            text_as_html.append(group.text_as_html)
+            table_as_cells.append(group.table_as_cells)
 
             idx = group.element_class_ids.copy()
             if group.element_class_id_map:
@@ -97,13 +114,24 @@ def concatenate(cls, groups: Iterable[LayoutElements]) -> LayoutElements:
             element_probs=np.concatenate(probs),
             element_class_ids=np.concatenate(class_ids),
             element_class_id_map={v: k for k, v in class_id_reverse_map.items()},
-            source=sources[0] if sources else None,
+            sources=np.concatenate(sources),
+            text_as_html=np.concatenate(text_as_html),
+            table_as_cells=np.concatenate(table_as_cells),
         )
 
-    def as_list(self):
-        """return a list of LayoutElement for backward compatibility"""
-        return [
-            LayoutElement.from_coords(
+    def iter_elements(self):
+        """iter elements as one LayoutElement per iteration; this returns a generator and has less
+        memory impact than the as_list method"""
+        for (x1, y1, x2, y2), text, prob, class_id, source, text_as_html, table_as_cells in zip(
+            self.element_coords,
+            self.texts,
+            self.element_probs,
+            self.element_class_ids,
+            self.sources,
+            self.text_as_html,
+            self.table_as_cells,
+        ):
+            yield LayoutElement.from_coords(
                 x1,
                 y1,
                 x2,
@@ -115,15 +143,10 @@ def as_list(self):
                     else None
                 ),
                 prob=None if np.isnan(prob) else prob,
-                source=self.source,
+                source=source,
+                text_as_html=text_as_html,
+                table_as_cells=table_as_cells,
             )
-            for (x1, y1, x2, y2), text, prob, class_id in zip(
-                self.element_coords,
-                self.texts,
-                self.element_probs,
-                self.element_class_ids,
-            )
-        ]
 
     @classmethod
     def from_list(cls, elements: list):
@@ -133,13 +156,15 @@ def from_list(cls, elements: list):
         coords = np.empty((len_ele, 4), dtype=float)
         # text and probs can be Nones so use lists first then convert into array to avoid them being
         # filled as nan
-        texts = []
-        class_probs = []
+        texts, text_as_html, table_as_cells, sources, class_probs = [], [], [], [], []
         class_types = np.empty((len_ele,), dtype="object")
 
         for i, element in enumerate(elements):
             coords[i] = [element.bbox.x1, element.bbox.y1, element.bbox.x2, element.bbox.y2]
             texts.append(element.text)
+            sources.append(element.source)
+            text_as_html.append(element.text_as_html)
+            table_as_cells.append(element.table_as_cells)
             class_probs.append(element.prob)
             class_types[i] = element.type or "None"
 
@@ -152,7 +177,9 @@ def from_list(cls, elements: list):
             element_probs=np.array(class_probs),
             element_class_ids=class_ids,
             element_class_id_map=dict(zip(range(len(unique_ids)), unique_ids)),
-            source=elements[0].source if len_ele else None,
+            sources=np.array(sources),
+            text_as_html=np.array(text_as_html),
+            table_as_cells=np.array(table_as_cells),
         )
 
 
@@ -162,6 +189,8 @@ class LayoutElement(TextRegion):
     prob: Optional[float] = None
     image_path: Optional[str] = None
     parent: Optional[LayoutElement] = None
+    text_as_html: Optional[str] = None
+    table_as_cells: Optional[str] = None
 
     def to_dict(self) -> dict:
         """Converts the class instance to dictionary form."""
@@ -432,9 +461,8 @@ def clean_layoutelements(elements: LayoutElements, subregion_threshold: float =
 
     final_attrs: dict[str, Any] = {
         "element_class_id_map": elements.element_class_id_map,
-        "source": elements.source,
     }
-    for attr in ("element_class_ids", "element_probs", "texts"):
+    for attr in ("element_class_ids", "element_probs", "texts", "sources"):
         if (original_attr := getattr(elements, attr)) is None:
             continue
         final_attrs[attr] = original_attr[sorted_by_area][mask][sorted_by_y1]
@@ -510,7 +538,7 @@ def clean_layoutelements_for_class(
 
     final_coords = np.vstack([target_coords[mask], other_coords[other_mask]])
     final_attrs: dict[str, Any] = {"element_class_id_map": elements.element_class_id_map}
-    for attr in ("element_class_ids", "element_probs", "texts"):
+    for attr in ("element_class_ids", "element_probs", "texts", "sources"):
         if (original_attr := getattr(elements, attr)) is None:
             continue
         final_attrs[attr] = np.concatenate(
diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py
@@ -140,7 +140,7 @@ def image_processing(
             element_probs=sorted_dets[:, 4].astype(float),
             element_class_ids=sorted_dets[:, 5].astype(int),
             element_class_id_map=self.layout_classes,
-            source=Source.YOLOX,
+            sources=np.array([Source.YOLOX] * sorted_dets.shape[0]),
         )
 
 

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.8.3" # pragma: no cover`
	`1`	`+__version__ = "0.8.4" # pragma: no cover`
Original file line number	Diff line number	Diff line change
`@@ -140,7 +140,7 @@ def image_processing(`
`140`	`140`	`element_probs=sorted_dets[:, 4].astype(float),`
`141`	`141`	`element_class_ids=sorted_dets[:, 5].astype(int),`
`142`	`142`	`element_class_id_map=self.layout_classes,`
`143`		`- source=Source.YOLOX,`
	`143`	`+ sources=np.array([Source.YOLOX] * sorted_dets.shape[0]),`
`144`	`144`	`)`
`145`	`145`
`146`	`146`