Skip to content

Commit 86bbb37

Browse files
authored
feat: move table extraction output (#88)
Changed the table extraction interface. Instead of writing the HTML table structure to the `text` property, the HTML structure is now stored in the `text_as_html` property, while `text` holds the normal result of text extraction from the element.
1 parent b97e406 commit 86bbb37

File tree

4 files changed

+20
-17
lines changed

4 files changed

+20
-17
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.4.2
2+
3+
* Output of table extraction is now stored in `text_as_html` property rather than `text` property
4+
15
## 0.4.1
26

37
* Added the ability to pass `ocr_languages` to the OCR agent for users who need
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.1" # pragma: no cover
1+
__version__ = "0.4.2" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -279,8 +279,11 @@ def get_element_from_block(
279279
) -> LayoutElement:
280280
"""Creates a LayoutElement from a given layout or image by finding all the text that lies within
281281
a given block."""
282-
element = LayoutElement.from_region(block)
283-
element.text = block.extract_text(
282+
if isinstance(block, LayoutElement):
283+
element = block
284+
else:
285+
element = LayoutElement.from_region(block)
286+
element.text = element.extract_text(
284287
objects=pdf_objects,
285288
image=image,
286289
extract_tables=extract_tables,

unstructured_inference/inference/layoutelement.py

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,15 @@ def extract_text(
2222
ocr_languages: str = "eng",
2323
):
2424
"""Extracts text contained in region"""
25-
if self.text is not None:
26-
# If block text is already populated, we'll assume it's correct
27-
text = self.text
28-
elif extract_tables and isinstance(self, LayoutElement) and self.type == "Table":
29-
text = interprete_table_block(self, image)
30-
else:
31-
text = super().extract_text(
32-
objects=objects,
33-
image=image,
34-
extract_tables=extract_tables,
35-
ocr_strategy=ocr_strategy,
36-
ocr_languages=ocr_languages,
37-
)
25+
text = super().extract_text(
26+
objects=objects,
27+
image=image,
28+
extract_tables=extract_tables,
29+
ocr_strategy=ocr_strategy,
30+
ocr_languages=ocr_languages,
31+
)
32+
if extract_tables and self.type == "Table":
33+
self.text_as_html = interpret_table_block(self, image)
3834
return text
3935

4036
def to_dict(self) -> dict:
@@ -63,7 +59,7 @@ def from_lp_textblock(cls, textblock: TextBlock):
6359
return cls(x1, y1, x2, y2, text, type)
6460

6561

66-
def interprete_table_block(text_block: TextRegion, image: Image.Image) -> str:
62+
def interpret_table_block(text_block: TextRegion, image: Image.Image) -> str:
6763
"""Extract the contents of a table."""
6864
tables.load_agent()
6965
if tables.tables_agent is None:

0 commit comments

Comments
 (0)