diff --git a/CHANGELOG.md b/CHANGELOG.md index 8595d70a..6a83327c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.8.4 + +* feat: add `text_as_html` and `table_as_cells` to `LayoutElements` class as new attributes +* feat: replace the single-valued `source` attribute of `TextRegions` and `LayoutElements` with an array attribute `sources` + ## 0.8.3 * fix: removed `layoutelement.from_lp_textblock()` and related tests as it's not used diff --git a/test_unstructured_inference/models/test_yolox.py b/test_unstructured_inference/models/test_yolox.py index 323be239..e8e996d0 100644 --- a/test_unstructured_inference/models/test_yolox.py +++ b/test_unstructured_inference/models/test_yolox.py @@ -32,8 +32,6 @@ def test_layout_yolox_local_parsing_image(): def test_layout_yolox_local_parsing_pdf(): filename = os.path.join("sample-docs", "loremipsum.pdf") document_layout = process_file_with_model(filename, model_name="yolox") - content = str(document_layout) - assert "libero fringilla" in content assert len(document_layout.pages) == 1 # NOTE(benjamin) The example sent to the test contains 5 text detections text_elements = [e for e in document_layout.pages[0].elements if e.type == "Text"] diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py index 071c0840..d5aa5873 100644 --- a/test_unstructured_inference/test_elements.py +++ b/test_unstructured_inference/test_elements.py @@ -61,7 +61,7 @@ def test_layoutelements(): element_coords=coords, element_class_ids=element_class_ids, element_class_id_map=class_map, - source="yolox", + sources=np.array(["yolox"] * len(element_class_ids)), ) @@ -440,13 +440,13 @@ def test_layoutelements_to_list_and_back(test_layoutelements): def test_layoutelements_from_list_no_elements(): back = LayoutElements.from_list(elements=[]) - assert back.source is None + assert back.sources.size == 0 assert back.element_coords.size == 0 def test_textregions_from_list_no_elements(): back = 
TextRegions.from_list(regions=[]) - assert back.source is None + assert back.sources.size == 0 assert back.element_coords.size == 0 @@ -454,18 +454,19 @@ def test_layoutelements_concatenate(): layout1 = LayoutElements( element_coords=np.array([[0, 0, 1, 1], [1, 1, 2, 2]]), texts=np.array(["a", "two"]), - source=None, + sources=np.array(["yolox", "yolox"]), element_class_ids=np.array([0, 1]), element_class_id_map={0: "type0", 1: "type1"}, ) layout2 = LayoutElements( element_coords=np.array([[10, 10, 2, 2], [20, 20, 1, 1]]), texts=np.array(["three", "4"]), - source=None, + sources=np.array(["ocr", "ocr"]), element_class_ids=np.array([0, 1]), element_class_id_map={0: "type1", 1: "type2"}, ) joint = LayoutElements.concatenate([layout1, layout2]) assert joint.texts.tolist() == ["a", "two", "three", "4"] + assert joint.sources.tolist() == ["yolox", "yolox", "ocr", "ocr"] assert joint.element_class_ids.tolist() == [0, 1, 1, 2] assert joint.element_class_id_map == {0: "type0", 1: "type1", 2: "type2"} diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 54090120..37769adf 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.8.3" # pragma: no cover +__version__ = "0.8.4" # pragma: no cover diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 19fa02ff..c4de7be6 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -210,7 +210,7 @@ def from_coords( class TextRegions: element_coords: np.ndarray texts: np.ndarray = field(default_factory=lambda: np.array([])) - source: Source | None = None + sources: np.ndarray = field(default_factory=lambda: np.array([])) def __post_init__(self): if self.texts.size == 0 and self.element_coords.size > 0: @@ -221,31 +221,37 @@ def slice(self, indices) -> TextRegions: return TextRegions( 
element_coords=self.element_coords[indices], texts=self.texts[indices], - source=self.source, + sources=self.sources[indices], ) + def iter_elements(self): + """iter text regions as one TextRegion per iteration; this returns a generator and has less + memory impact than the as_list method""" + for (x1, y1, x2, y2), text, source in zip( + self.element_coords, + self.texts, + self.sources, + ): + yield TextRegion.from_coords(x1, y1, x2, y2, text, source) + def as_list(self): - """return a list of TextRegion objects representing the data""" - if self.texts is None: - return [ - TextRegion.from_coords(x1, y1, x2, y2, None, self.source) - for (x1, y1, x2, y2) in self.element_coords - ] - return [ - TextRegion.from_coords(x1, y1, x2, y2, text, self.source) - for (x1, y1, x2, y2), text in zip(self.element_coords, self.texts) - ] + """return the regions as a list of TextRegion (LayoutElement for LayoutElements)""" + return list(self.iter_elements()) @classmethod def from_list(cls, regions: list): """create TextRegions from a list of TextRegion objects; the objects must have the same source""" - coords, texts = [], [] + coords, texts, sources = [], [], [] for region in regions: coords.append((region.bbox.x1, region.bbox.y1, region.bbox.x2, region.bbox.y2)) texts.append(region.text) - source = regions[0].source if regions else None - return cls(element_coords=np.array(coords), texts=np.array(texts), source=source) + sources.append(region.source) + return cls( + element_coords=np.array(coords), + texts=np.array(texts), + sources=np.array(sources), + ) def __len__(self): return self.element_coords.shape[0] diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index 40cd0820..44af28fc 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -30,10 +30,21 @@ class LayoutElements(TextRegions): element_probs: np.ndarray = field(default_factory=lambda: np.array([])) 
element_class_ids: np.ndarray = field(default_factory=lambda: np.array([])) element_class_id_map: dict[int, str] = field(default_factory=dict) + text_as_html: np.ndarray = field(default_factory=lambda: np.array([])) + table_as_cells: np.ndarray = field(default_factory=lambda: np.array([])) def __post_init__(self): element_size = self.element_coords.shape[0] - for attr in ("element_probs", "element_class_ids", "texts"): + # NOTE: maybe we should create an attribute _optional_attributes: list[str] to store this + # list + for attr in ( + "element_probs", + "element_class_ids", + "texts", + "sources", + "text_as_html", + "table_as_cells", + ): if getattr(self, attr).size == 0 and element_size: setattr(self, attr, np.array([None] * element_size)) @@ -54,7 +65,9 @@ def __eq__(self, other: object) -> bool: [self.element_class_id_map[idx] for idx in self.element_class_ids] == [other.element_class_id_map[idx] for idx in other.element_class_ids] ) - and self.source == other.source + and np.array_equal(self.sources[mask], other.sources[mask]) + and np.array_equal(self.text_as_html[mask], other.text_as_html[mask]) + and np.array_equal(self.table_as_cells[mask], other.table_as_cells[mask]) ) def slice(self, indices) -> LayoutElements: @@ -62,23 +75,27 @@ def slice(self, indices) -> LayoutElements: return LayoutElements( element_coords=self.element_coords[indices], texts=self.texts[indices], - source=self.source, + sources=self.sources[indices], element_probs=self.element_probs[indices], element_class_ids=self.element_class_ids[indices], element_class_id_map=self.element_class_id_map, + text_as_html=self.text_as_html[indices], + table_as_cells=self.table_as_cells[indices], ) @classmethod def concatenate(cls, groups: Iterable[LayoutElements]) -> LayoutElements: """concatenate a sequence of LayoutElements in order as one LayoutElements""" coords, texts, probs, class_ids, sources = [], [], [], [], [] + text_as_html, table_as_cells = [], [] class_id_reverse_map: dict[str, int] = {} 
for group in groups: coords.append(group.element_coords) texts.append(group.texts) probs.append(group.element_probs) - if group.source: - sources.append(group.source) + sources.append(group.sources) + text_as_html.append(group.text_as_html) + table_as_cells.append(group.table_as_cells) idx = group.element_class_ids.copy() if group.element_class_id_map: @@ -97,13 +114,24 @@ def concatenate(cls, groups: Iterable[LayoutElements]) -> LayoutElements: element_probs=np.concatenate(probs), element_class_ids=np.concatenate(class_ids), element_class_id_map={v: k for k, v in class_id_reverse_map.items()}, - source=sources[0] if sources else None, + sources=np.concatenate(sources), + text_as_html=np.concatenate(text_as_html), + table_as_cells=np.concatenate(table_as_cells), ) - def as_list(self): - """return a list of LayoutElement for backward compatibility""" - return [ - LayoutElement.from_coords( + def iter_elements(self): + """iter elements as one LayoutElement per iteration; this returns a generator and has less + memory impact than the as_list method""" + for (x1, y1, x2, y2), text, prob, class_id, source, text_as_html, table_as_cells in zip( + self.element_coords, + self.texts, + self.element_probs, + self.element_class_ids, + self.sources, + self.text_as_html, + self.table_as_cells, + ): + yield LayoutElement.from_coords( x1, y1, x2, @@ -115,15 +143,10 @@ def as_list(self): else None ), prob=None if np.isnan(prob) else prob, - source=self.source, + source=source, + text_as_html=text_as_html, + table_as_cells=table_as_cells, ) - for (x1, y1, x2, y2), text, prob, class_id in zip( - self.element_coords, - self.texts, - self.element_probs, - self.element_class_ids, - ) - ] @classmethod def from_list(cls, elements: list): @@ -133,13 +156,15 @@ def from_list(cls, elements: list): coords = np.empty((len_ele, 4), dtype=float) # text and probs can be Nones so use lists first then convert into array to avoid them being # filled as nan - texts = [] - class_probs = [] + texts, 
text_as_html, table_as_cells, sources, class_probs = [], [], [], [], [] class_types = np.empty((len_ele,), dtype="object") for i, element in enumerate(elements): coords[i] = [element.bbox.x1, element.bbox.y1, element.bbox.x2, element.bbox.y2] texts.append(element.text) + sources.append(element.source) + text_as_html.append(element.text_as_html) + table_as_cells.append(element.table_as_cells) class_probs.append(element.prob) class_types[i] = element.type or "None" @@ -152,7 +177,9 @@ def from_list(cls, elements: list): element_probs=np.array(class_probs), element_class_ids=class_ids, element_class_id_map=dict(zip(range(len(unique_ids)), unique_ids)), - source=elements[0].source if len_ele else None, + sources=np.array(sources), + text_as_html=np.array(text_as_html), + table_as_cells=np.array(table_as_cells), ) @@ -162,6 +189,8 @@ class LayoutElement(TextRegion): prob: Optional[float] = None image_path: Optional[str] = None parent: Optional[LayoutElement] = None + text_as_html: Optional[str] = None + table_as_cells: Optional[str] = None def to_dict(self) -> dict: """Converts the class instance to dictionary form.""" @@ -432,9 +461,8 @@ def clean_layoutelements(elements: LayoutElements, subregion_threshold: float = final_attrs: dict[str, Any] = { "element_class_id_map": elements.element_class_id_map, - "source": elements.source, } - for attr in ("element_class_ids", "element_probs", "texts"): + for attr in ("element_class_ids", "element_probs", "texts", "sources"): if (original_attr := getattr(elements, attr)) is None: continue final_attrs[attr] = original_attr[sorted_by_area][mask][sorted_by_y1] @@ -510,7 +538,7 @@ def clean_layoutelements_for_class( final_coords = np.vstack([target_coords[mask], other_coords[other_mask]]) final_attrs: dict[str, Any] = {"element_class_id_map": elements.element_class_id_map} - for attr in ("element_class_ids", "element_probs", "texts"): + for attr in ("element_class_ids", "element_probs", "texts", "sources"): if (original_attr := 
getattr(elements, attr)) is None: continue final_attrs[attr] = np.concatenate( diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py index 8e57843d..932242ec 100644 --- a/unstructured_inference/models/yolox.py +++ b/unstructured_inference/models/yolox.py @@ -140,7 +140,7 @@ def image_processing( element_probs=sorted_dets[:, 4].astype(float), element_class_ids=sorted_dets[:, 5].astype(int), element_class_id_map=self.layout_classes, - source=Source.YOLOX, + sources=np.array([Source.YOLOX] * sorted_dets.shape[0]), )