feat: move table extraction output (#88)

qued · web-flow · commit 86bbb37af35c · 2023-04-20T23:03:15.000-05:00
Changed the table extraction interface. Instead of writing the HTML table structure to the `text` property, the HTML structure is now stored in the `text_as_html` property, while `text` holds the normal result of text extraction from the element.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.4.2
+
+* Output of table extraction is now stored in `text_as_html` property rather than `text` property
+
 ## 0.4.1
 
 * Added the ability to pass `ocr_languages` to the OCR agent for users who need
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.1"  # pragma: no cover
+__version__ = "0.4.2"  # pragma: no cover
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -279,8 +279,11 @@ def get_element_from_block(
 ) -> LayoutElement:
     """Creates a LayoutElement from a given layout or image by finding all the text that lies within
     a given block."""
-    element = LayoutElement.from_region(block)
-    element.text = block.extract_text(
+    if isinstance(block, LayoutElement):
+        element = block
+    else:
+        element = LayoutElement.from_region(block)
+    element.text = element.extract_text(
         objects=pdf_objects,
         image=image,
         extract_tables=extract_tables,
diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py
@@ -22,19 +22,15 @@ def extract_text(
         ocr_languages: str = "eng",
     ):
         """Extracts text contained in region"""
-        if self.text is not None:
-            # If block text is already populated, we'll assume it's correct
-            text = self.text
-        elif extract_tables and isinstance(self, LayoutElement) and self.type == "Table":
-            text = interprete_table_block(self, image)
-        else:
-            text = super().extract_text(
-                objects=objects,
-                image=image,
-                extract_tables=extract_tables,
-                ocr_strategy=ocr_strategy,
-                ocr_languages=ocr_languages,
-            )
+        text = super().extract_text(
+            objects=objects,
+            image=image,
+            extract_tables=extract_tables,
+            ocr_strategy=ocr_strategy,
+            ocr_languages=ocr_languages,
+        )
+        if extract_tables and self.type == "Table":
+            self.text_as_html = interpret_table_block(self, image)
         return text
 
     def to_dict(self) -> dict:
@@ -63,7 +59,7 @@ def from_lp_textblock(cls, textblock: TextBlock):
         return cls(x1, y1, x2, y2, text, type)
 
 
-def interprete_table_block(text_block: TextRegion, image: Image.Image) -> str:
+def interpret_table_block(text_block: TextRegion, image: Image.Image) -> str:
     """Extract the contents of a table."""
     tables.load_agent()
     if tables.tables_agent is None:

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.4.1" # pragma: no cover`
	`1`	`+__version__ = "0.4.2" # pragma: no cover`