|
5 | 5 | import xml.etree.ElementTree as ET |
6 | 6 | from collections import defaultdict |
7 | 7 | from pathlib import Path |
8 | | -from typing import Optional, Union |
| 8 | +from typing import List, Optional, Union |
9 | 9 |
|
10 | 10 | import numpy as np |
11 | 11 | import pandas as pd |
@@ -108,8 +108,6 @@ def run_prediction(self, x: Image): |
108 | 108 |
|
109 | 109 | tokens = self.get_tokens(x=x) |
110 | 110 |
|
111 | | - sorted(tokens, key=lambda x: x["bbox"][1] * 10000 + x["bbox"][0]) |
112 | | - |
113 | 111 | # 'tokens' is a list of tokens |
114 | 112 | # Need to be in a relative reading order |
115 | 113 | # If no order is provided, use current order |
@@ -595,10 +593,37 @@ def structure_to_cells(table_structure, tokens): |
595 | 593 | return cells, confidence_score |
596 | 594 |
|
597 | 595 |
|
def fill_cells(cells: List[dict]) -> List[dict]:
    """Add empty cells to pad cells that span multiple rows for html conversion.

    For example, if a cell takes rows 0 and 1 and column 0, a new empty cell is added at row 1
    and column 0. This padding keeps the structure of the output table intact. In this example
    the cell data is {"row_nums": [0, 1], "column_nums": [0], ...}.

    A cell contains the following keys relevant to the html conversion:
        row_nums: List[int]
            the row numbers this cell belongs to; a cell spanning multiple rows has more than
            one number
        column_nums: List[int]
            the column numbers this cell belongs to; a cell spanning multiple columns has more
            than one number
        cell text: str
            the text in this cell

    Args:
        cells: the cells of one table. Neither the list nor its cell dicts are modified.

    Returns:
        A new list holding the original cells followed by one empty filler cell for every row
        of a cell beyond its smallest row. Each filler is a shallow copy of its source cell
        with "row_nums" narrowed to that single row and "cell text" set to "".
    """
    new_cells = cells.copy()
    for cell in cells:
        # Sort BEFORE dropping the first entry: the html conversion anchors a cell at
        # min(row_nums), so the row to skip is the smallest one, not whichever happens to be
        # listed first. Slicing first would pad the anchor row itself when row_nums is
        # unordered (e.g. [2, 1]) and leave the real extra row unpadded.
        for extra_row in sorted(cell["row_nums"])[1:]:
            # Shallow copy: every other key (including "column_nums") is carried over as-is.
            new_cell = cell.copy()
            new_cell["row_nums"] = [extra_row]
            new_cell["cell text"] = ""
            new_cells.append(new_cell)
    return new_cells
| 622 | + |
| 623 | + |
598 | 624 | def cells_to_html(cells): |
599 | 625 | """Convert table structure to html format.""" |
600 | | - cells = sorted(cells, key=lambda k: min(k["column_nums"])) |
601 | | - cells = sorted(cells, key=lambda k: min(k["row_nums"])) |
| 626 | + cells = sorted(fill_cells(cells), key=lambda k: (min(k["row_nums"]), min(k["column_nums"]))) |
602 | 627 |
|
603 | 628 | table = ET.Element("table") |
604 | 629 | current_row = -1 |
|
0 commit comments