fix table to html bug (#203)

badGarnet · Cloud User · web-flow · commit 52c5bea07685 · 2023-09-08T20:23:43.000-05:00
This PR fixes a bug where tables with cells that span multiple rows can cause the rows below or aligned with them to be positioned incorrectly. The original code assumes the first cell from a row (i.e., the cell that increments the row number) is also in the first column. That assumption fails for tables like the one below (also added in this PR as a new sample doc): ![table-multi-row-column-cells](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/2a664afd-58c5-4a34-afb3-fbb3f1305e43) The first 4 columns have headers spanning two rows, but the last two columns have headers made up of two rows (a header with subheaders beneath it). With the current code the second row (the subheaders) of the last two columns is treated as the subheaders for the first two columns: ![Screenshot 2023-09-08 at 11 59 32 AM](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/186ec1d2-8621-4a3d-9025-fe8ca05876e7) This PR fixes this bug by filling in cells with empty text to occupy the cells at row 2, columns 1 through 4. Those are the cells that are merged into the header. The resulting HTML represents the table structure better: ![Screenshot 2023-09-08 at 12 01 26 PM](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/aaa872bd-9e5a-4b25-8973-29335ec60af5) --------- Co-authored-by: Cloud User <rocky@ip-10-4-124-58.us-east-2.compute.internal>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.5.25
+
+* fix a bug where `cells_to_html` doesn't handle cells spanning multiple rows properly
+
 ## 0.5.24
 
 * remove `cv2` preprocessing step before OCR step in table transformer
diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py
@@ -543,3 +543,23 @@ def test_extract_text_from_spans(spans, join_with_space, expected):
 def test_header_supercell_tree(supercells, expected_len):
     postprocess.header_supercell_tree(supercells)
     assert len(supercells) == expected_len
+
+
+def test_cells_to_html():
+    """cells_to_html emits an empty padding cell in the second row under a two-row cell."""
+    # example table: the first column is one cell spanning rows 0 and 1; the second header cell
+    # spans columns 1 and 2 and has two sub cells beneath it in row 1
+    # +----------+---------------------+
+    # |          |      two cols       |
+    # | two row  |----------+----------|
+    # |          |sub cell 1|sub cell 2|
+    # +----------+----------+----------+
+    cells = [
+        {"row_nums": [0, 1], "column_nums": [0], "cell text": "two row", "column header": False},
+        {"row_nums": [0], "column_nums": [1, 2], "cell text": "two cols", "column header": False},
+        {"row_nums": [1], "column_nums": [1], "cell text": "sub cell 1", "column header": False},
+        {"row_nums": [1], "column_nums": [2], "cell text": "sub cell 2", "column header": False},
+    ]
+    # the second <tr> starts with an empty <td>: the padding for the cell that spans both rows
+    expected = (
+        '<table><tr><td rowspan="2">two row</td><td colspan="2">two '
+        "cols</td></tr><tr><td></td><td>sub cell 1</td><td>sub cell 2</td></tr></table>"
+    )
+    assert tables.cells_to_html(cells) == expected
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.24"  # pragma: no cover
+__version__ = "0.5.25"  # pragma: no cover
diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py
@@ -5,7 +5,7 @@
 import xml.etree.ElementTree as ET
 from collections import defaultdict
 from pathlib import Path
-from typing import Optional, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -108,8 +108,6 @@ def run_prediction(self, x: Image):
 
         tokens = self.get_tokens(x=x)
 
-        sorted(tokens, key=lambda x: x["bbox"][1] * 10000 + x["bbox"][0])
-
         # 'tokens' is a list of tokens
         # Need to be in a relative reading order
         # If no order is provided, use current order
@@ -595,10 +593,37 @@ def structure_to_cells(table_structure, tokens):
     return cells, confidence_score
 
 
+def fill_cells(cells: List[dict]) -> List[dict]:
+    """Add empty cells to pad cells that span multiple rows for html conversion.
+
+    For example, if a cell takes rows 0 and 1 and column 0, i.e. its data is
+    {"row_nums": [0, 1], "column_nums": [0], ...}, a new empty cell is added at row 1 and
+    column 0. This padding ensures the structure of the output table is intact.
+
+    A cell contains the following keys relevant to the html conversion:
+    row_nums: List[int]
+        the row numbers this cell belongs to; a cell spanning multiple rows has more than one
+        number
+    column_nums: List[int]
+        the column numbers this cell belongs to; a cell spanning multiple columns has more than
+        one number
+    cell text: str
+        the text in this cell
+
+    Returns a new list holding the original cells followed by the padding cells. The input list
+    and its cell dicts are not modified; each padding cell is a shallow copy of the cell it pads,
+    with "row_nums" reduced to a single row and "cell text" set to "".
+    """
+    new_cells = cells.copy()
+    for cell in cells:
+        # Sort before slicing so the row left unpadded is always the smallest row number, which is
+        # the row `cells_to_html` anchors the cell to via `min(k["row_nums"])`; slicing first would
+        # pad the wrong row whenever "row_nums" is not already in ascending order.
+        for extra_row in sorted(cell["row_nums"])[1:]:
+            new_cell = cell.copy()
+            new_cell["row_nums"] = [extra_row]
+            new_cell["cell text"] = ""
+            new_cells.append(new_cell)
+    return new_cells
+
+
 def cells_to_html(cells):
     """Convert table structure to html format."""
-    cells = sorted(cells, key=lambda k: min(k["column_nums"]))
-    cells = sorted(cells, key=lambda k: min(k["row_nums"]))
+    cells = sorted(fill_cells(cells), key=lambda k: (min(k["row_nums"]), min(k["column_nums"])))
 
     table = ET.Element("table")
     current_row = -1

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.5.24" # pragma: no cover`
	`1`	`+__version__ = "0.5.25" # pragma: no cover`