Skip to content

Commit 52c5bea

Browse files
badGarnetCloud User
andauthored
fix table to html bug (#203)
This PR fixes a bug where tables with cells that span multiple rows can cause the rows below/aligned with them to be positioned incorrectly. The original code assumes the first cell from a row (i.e., the cell that increments the row number) is also in the first column. This assumption fails for tables like the one below (also added in this PR as a new sample doc): ![table-multi-row-column-cells](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/2a664afd-58c5-4a34-afb3-fbb3f1305e43) The first 4 columns have headers spanning two rows, but the last two columns have headers that are split into two rows (a header and its subheaders). With the current code the second row (the subheaders) of the last two columns is treated as the subheaders for the first two columns: ![Screenshot 2023-09-08 at 11 59 32 AM](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/186ec1d2-8621-4a3d-9025-fe8ca05876e7) This PR fixes this bug by filling in cells with empty text to occupy the cells at row 2, columns 1 through 4. Those are the cells that are compounded into the header. The resulting html represents the table structure better: ![Screenshot 2023-09-08 at 12 01 26 PM](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/aaa872bd-9e5a-4b25-8973-29335ec60af5) --------- Co-authored-by: Cloud User <[email protected]>
1 parent 7cd724a commit 52c5bea

File tree

4 files changed

+55
-6
lines changed

4 files changed

+55
-6
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.5.25
2+
3+
* fix a bug where `cells_to_html` doesn't handle cells spanning multiple rows properly
4+
15
## 0.5.24
26

37
* remove `cv2` preprocessing step before OCR step in table transformer

test_unstructured_inference/models/test_tables.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,3 +543,23 @@ def test_extract_text_from_spans(spans, join_with_space, expected):
543543
def test_header_supercell_tree(supercells, expected_len):
544544
postprocess.header_supercell_tree(supercells)
545545
assert len(supercells) == expected_len
546+
547+
548+
def test_cells_to_html():
    """Check the html produced for a table with a two-row cell next to a two-column cell.

    The expected html contains an empty ``<td></td>`` at the start of the second row, i.e. the
    position covered by the cell that spans rows 0 and 1.
    """
    # Table under test:
    # +----------+---------------------+
    # |    two   |      two columns    |
    # |          |----------+----------|
    # |    rows  |sub cell 1|sub cell 2|
    # +----------+----------+----------+
    # Each entry is (row numbers, column numbers, text) for one cell.
    layout = [
        ([0, 1], [0], "two row"),
        ([0], [1, 2], "two cols"),
        ([1], [1], "sub cell 1"),
        ([1], [2], "sub cell 2"),
    ]
    cells = [
        {"row_nums": rows, "column_nums": columns, "cell text": text, "column header": False}
        for rows, columns, text in layout
    ]
    expected = (
        '<table><tr><td rowspan="2">two row</td><td colspan="2">two '
        "cols</td></tr><tr><td></td><td>sub cell 1</td><td>sub cell 2</td></tr></table>"
    )
    assert tables.cells_to_html(cells) == expected
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.24" # pragma: no cover
1+
__version__ = "0.5.25" # pragma: no cover

unstructured_inference/models/tables.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import xml.etree.ElementTree as ET
66
from collections import defaultdict
77
from pathlib import Path
8-
from typing import Optional, Union
8+
from typing import List, Optional, Union
99

1010
import numpy as np
1111
import pandas as pd
@@ -108,8 +108,6 @@ def run_prediction(self, x: Image):
108108

109109
tokens = self.get_tokens(x=x)
110110

111-
sorted(tokens, key=lambda x: x["bbox"][1] * 10000 + x["bbox"][0])
112-
113111
# 'tokens' is a list of tokens
114112
# Need to be in a relative reading order
115113
# If no order is provided, use current order
@@ -595,10 +593,37 @@ def structure_to_cells(table_structure, tokens):
595593
return cells, confidence_score
596594

597595

596+
def fill_cells(cells: List[dict]) -> List[dict]:
    """Add empty cells to pad cells that span multiple rows for html conversion.

    For example, if a cell takes rows 0 and 1 and column 0, we add a new empty cell at row 1 and
    column 0. This padding ensures the structure of the output table is intact. In this example the
    cell data is {"row_nums": [0, 1], "column_nums": [0], ...}

    A cell contains the following keys relevant to the html conversion:
    row_nums: List[int]
        the row numbers this cell belongs to; for cells spanning multiple rows there is more than
        one number
    column_nums: List[int]
        the column numbers this cell belongs to; for cells spanning multiple columns there is more
        than one number
    cell text: str
        the text in this cell

    Returns a new list holding the original cells followed by the added empty cells. Neither the
    input list nor the input cell dicts are modified.
    """
    new_cells = cells.copy()
    for cell in cells:
        # `cells_to_html` places a cell at its smallest row number, so every row except the
        # smallest one needs a filler. Sort before slicing so this holds even when `row_nums`
        # is not listed in ascending order.
        for extra_row in sorted(cell["row_nums"])[1:]:
            # Shallow copy keeps the remaining keys (e.g. column_nums) of the spanning cell.
            new_cell = cell.copy()
            new_cell["row_nums"] = [extra_row]
            new_cell["cell text"] = ""
            new_cells.append(new_cell)
    return new_cells
622+
623+
598624
def cells_to_html(cells):
599625
"""Convert table structure to html format."""
600-
cells = sorted(cells, key=lambda k: min(k["column_nums"]))
601-
cells = sorted(cells, key=lambda k: min(k["row_nums"]))
626+
cells = sorted(fill_cells(cells), key=lambda k: (min(k["row_nums"]), min(k["column_nums"])))
602627

603628
table = ET.Element("table")
604629
current_row = -1

0 commit comments

Comments
 (0)