Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.8.4

* feat: add `text_as_html` and `table_as_cells` to `LayoutElements` class as new attributes
* feat: replace the single-valued `source` attribute of `TextRegions` and `LayoutElements` with an array attribute `sources`

## 0.8.3

* fix: removed `layoutelement.from_lp_textblock()` and related tests as it's not used
Expand Down
2 changes: 0 additions & 2 deletions test_unstructured_inference/models/test_yolox.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,6 @@ def test_layout_yolox_local_parsing_image():
def test_layout_yolox_local_parsing_pdf():
filename = os.path.join("sample-docs", "loremipsum.pdf")
document_layout = process_file_with_model(filename, model_name="yolox")
content = str(document_layout)
assert "libero fringilla" in content
assert len(document_layout.pages) == 1
# NOTE(benjamin) The example sent to the test contains 5 text detections
text_elements = [e for e in document_layout.pages[0].elements if e.type == "Text"]
Expand Down
11 changes: 6 additions & 5 deletions test_unstructured_inference/test_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_layoutelements():
element_coords=coords,
element_class_ids=element_class_ids,
element_class_id_map=class_map,
source="yolox",
sources=np.array(["yolox"] * len(element_class_ids)),
)


Expand Down Expand Up @@ -440,32 +440,33 @@ def test_layoutelements_to_list_and_back(test_layoutelements):

def test_layoutelements_from_list_no_elements():
back = LayoutElements.from_list(elements=[])
assert back.source is None
assert back.sources.size == 0
assert back.element_coords.size == 0


def test_textregions_from_list_no_elements():
back = TextRegions.from_list(regions=[])
assert back.source is None
assert back.sources.size == 0
assert back.element_coords.size == 0


def test_layoutelements_concatenate():
layout1 = LayoutElements(
element_coords=np.array([[0, 0, 1, 1], [1, 1, 2, 2]]),
texts=np.array(["a", "two"]),
source=None,
sources=np.array(["yolox", "yolox"]),
element_class_ids=np.array([0, 1]),
element_class_id_map={0: "type0", 1: "type1"},
)
layout2 = LayoutElements(
element_coords=np.array([[10, 10, 2, 2], [20, 20, 1, 1]]),
texts=np.array(["three", "4"]),
source=None,
sources=np.array(["ocr", "ocr"]),
element_class_ids=np.array([0, 1]),
element_class_id_map={0: "type1", 1: "type2"},
)
joint = LayoutElements.concatenate([layout1, layout2])
assert joint.texts.tolist() == ["a", "two", "three", "4"]
assert joint.sources.tolist() == ["yolox", "yolox", "ocr", "ocr"]
assert joint.element_class_ids.tolist() == [0, 1, 1, 2]
assert joint.element_class_id_map == {0: "type0", 1: "type1", 2: "type2"}
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.3" # pragma: no cover
__version__ = "0.8.4" # pragma: no cover
36 changes: 21 additions & 15 deletions unstructured_inference/inference/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def from_coords(
class TextRegions:
element_coords: np.ndarray
texts: np.ndarray = field(default_factory=lambda: np.array([]))
source: Source | None = None
sources: np.ndarray = field(default_factory=lambda: np.array([]))

def __post_init__(self):
if self.texts.size == 0 and self.element_coords.size > 0:
Expand All @@ -221,31 +221,37 @@ def slice(self, indices) -> TextRegions:
return TextRegions(
element_coords=self.element_coords[indices],
texts=self.texts[indices],
source=self.source,
sources=self.sources[indices],
)

def iter_elements(self):
"""iter text regions as one TextRegion per iteration; this returns a generator and has less
memory impact than the as_list method"""
for (x1, y1, x2, y2), text, source in zip(
self.element_coords,
self.texts,
self.sources,
):
yield TextRegion.from_coords(x1, y1, x2, y2, text, source)

def as_list(self):
"""return a list of TextRegion objects representing the data"""
if self.texts is None:
return [
TextRegion.from_coords(x1, y1, x2, y2, None, self.source)
for (x1, y1, x2, y2) in self.element_coords
]
return [
TextRegion.from_coords(x1, y1, x2, y2, text, self.source)
for (x1, y1, x2, y2), text in zip(self.element_coords, self.texts)
]
"""return a list of TextRegion objects for backward compatibility"""
return list(self.iter_elements())

@classmethod
def from_list(cls, regions: list):
"""create TextRegions from a list of TextRegion objects; each object's source is stored in the
per-region `sources` array"""
coords, texts = [], []
coords, texts, sources = [], [], []
for region in regions:
coords.append((region.bbox.x1, region.bbox.y1, region.bbox.x2, region.bbox.y2))
texts.append(region.text)
source = regions[0].source if regions else None
return cls(element_coords=np.array(coords), texts=np.array(texts), source=source)
sources.append(region.source)
return cls(
element_coords=np.array(coords),
texts=np.array(texts),
sources=np.array(sources),
)

def __len__(self):
return self.element_coords.shape[0]
Expand Down
76 changes: 52 additions & 24 deletions unstructured_inference/inference/layoutelement.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,21 @@ class LayoutElements(TextRegions):
element_probs: np.ndarray = field(default_factory=lambda: np.array([]))
element_class_ids: np.ndarray = field(default_factory=lambda: np.array([]))
element_class_id_map: dict[int, str] = field(default_factory=dict)
text_as_html: np.ndarray = field(default_factory=lambda: np.array([]))
table_as_cells: np.ndarray = field(default_factory=lambda: np.array([]))

def __post_init__(self):
element_size = self.element_coords.shape[0]
for attr in ("element_probs", "element_class_ids", "texts"):
# NOTE: maybe we should create an attribute _optional_attributes: list[str] to store this
# list
for attr in (
"element_probs",
"element_class_ids",
"texts",
"sources",
"text_as_html",
"table_as_cells",
):
if getattr(self, attr).size == 0 and element_size:
setattr(self, attr, np.array([None] * element_size))

Expand All @@ -54,31 +65,37 @@ def __eq__(self, other: object) -> bool:
[self.element_class_id_map[idx] for idx in self.element_class_ids]
== [other.element_class_id_map[idx] for idx in other.element_class_ids]
)
and self.source == other.source
and np.array_equal(self.sources[mask], other.sources[mask])
and np.array_equal(self.text_as_html[mask], other.text_as_html[mask])
and np.array_equal(self.table_as_cells[mask], other.table_as_cells[mask])
)

def slice(self, indices) -> LayoutElements:
"""slice and return only selected indices"""
return LayoutElements(
element_coords=self.element_coords[indices],
texts=self.texts[indices],
source=self.source,
sources=self.sources[indices],
element_probs=self.element_probs[indices],
element_class_ids=self.element_class_ids[indices],
element_class_id_map=self.element_class_id_map,
text_as_html=self.text_as_html[indices],
table_as_cells=self.table_as_cells[indices],
)

@classmethod
def concatenate(cls, groups: Iterable[LayoutElements]) -> LayoutElements:
"""concatenate a sequence of LayoutElements in order as one LayoutElements"""
coords, texts, probs, class_ids, sources = [], [], [], [], []
text_as_html, table_as_cells = [], []
class_id_reverse_map: dict[str, int] = {}
for group in groups:
coords.append(group.element_coords)
texts.append(group.texts)
probs.append(group.element_probs)
if group.source:
sources.append(group.source)
sources.append(group.sources)
text_as_html.append(group.text_as_html)
table_as_cells.append(group.table_as_cells)

idx = group.element_class_ids.copy()
if group.element_class_id_map:
Expand All @@ -97,13 +114,24 @@ def concatenate(cls, groups: Iterable[LayoutElements]) -> LayoutElements:
element_probs=np.concatenate(probs),
element_class_ids=np.concatenate(class_ids),
element_class_id_map={v: k for k, v in class_id_reverse_map.items()},
source=sources[0] if sources else None,
sources=np.concatenate(sources),
text_as_html=np.concatenate(text_as_html),
table_as_cells=np.concatenate(table_as_cells),
)

def as_list(self):
"""return a list of LayoutElement for backward compatibility"""
return [
LayoutElement.from_coords(
def iter_elements(self):
"""iter elements as one LayoutElement per iteration; this returns a generator and has less
memory impact than the as_list method"""
for (x1, y1, x2, y2), text, prob, class_id, source, text_as_html, table_as_cells in zip(
self.element_coords,
self.texts,
self.element_probs,
self.element_class_ids,
self.sources,
self.text_as_html,
self.table_as_cells,
):
yield LayoutElement.from_coords(
x1,
y1,
x2,
Expand All @@ -115,15 +143,10 @@ def as_list(self):
else None
),
prob=None if np.isnan(prob) else prob,
source=self.source,
source=source,
text_as_html=text_as_html,
table_as_cells=table_as_cells,
)
for (x1, y1, x2, y2), text, prob, class_id in zip(
self.element_coords,
self.texts,
self.element_probs,
self.element_class_ids,
)
]

@classmethod
def from_list(cls, elements: list):
Expand All @@ -133,13 +156,15 @@ def from_list(cls, elements: list):
coords = np.empty((len_ele, 4), dtype=float)
# text and probs can be Nones so use lists first then convert into array to avoid them being
# filled as nan
texts = []
class_probs = []
texts, text_as_html, table_as_cells, sources, class_probs = [], [], [], [], []
class_types = np.empty((len_ele,), dtype="object")

for i, element in enumerate(elements):
coords[i] = [element.bbox.x1, element.bbox.y1, element.bbox.x2, element.bbox.y2]
texts.append(element.text)
sources.append(element.source)
text_as_html.append(element.text_as_html)
table_as_cells.append(element.table_as_cells)
class_probs.append(element.prob)
class_types[i] = element.type or "None"

Expand All @@ -152,7 +177,9 @@ def from_list(cls, elements: list):
element_probs=np.array(class_probs),
element_class_ids=class_ids,
element_class_id_map=dict(zip(range(len(unique_ids)), unique_ids)),
source=elements[0].source if len_ele else None,
sources=np.array(sources),
text_as_html=np.array(text_as_html),
table_as_cells=np.array(table_as_cells),
)


Expand All @@ -162,6 +189,8 @@ class LayoutElement(TextRegion):
prob: Optional[float] = None
image_path: Optional[str] = None
parent: Optional[LayoutElement] = None
text_as_html: Optional[str] = None
table_as_cells: Optional[str] = None

def to_dict(self) -> dict:
"""Converts the class instance to dictionary form."""
Expand Down Expand Up @@ -432,9 +461,8 @@ def clean_layoutelements(elements: LayoutElements, subregion_threshold: float =

final_attrs: dict[str, Any] = {
"element_class_id_map": elements.element_class_id_map,
"source": elements.source,
}
for attr in ("element_class_ids", "element_probs", "texts"):
for attr in ("element_class_ids", "element_probs", "texts", "sources"):
if (original_attr := getattr(elements, attr)) is None:
continue
final_attrs[attr] = original_attr[sorted_by_area][mask][sorted_by_y1]
Expand Down Expand Up @@ -510,7 +538,7 @@ def clean_layoutelements_for_class(

final_coords = np.vstack([target_coords[mask], other_coords[other_mask]])
final_attrs: dict[str, Any] = {"element_class_id_map": elements.element_class_id_map}
for attr in ("element_class_ids", "element_probs", "texts"):
for attr in ("element_class_ids", "element_probs", "texts", "sources"):
if (original_attr := getattr(elements, attr)) is None:
continue
final_attrs[attr] = np.concatenate(
Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/models/yolox.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def image_processing(
element_probs=sorted_dets[:, 4].astype(float),
element_class_ids=sorted_dets[:, 5].astype(int),
element_class_id_map=self.layout_classes,
source=Source.YOLOX,
sources=np.array([Source.YOLOX] * sorted_dets.shape[0]),
)


Expand Down
Loading