Skip to content

Commit 9383bac

Browse files
qued, codeflash-ai[bot], aseembits93
authored
enhancement: optimize cells to html (#444)
<!-- CODEFLASH_OPTIMIZATION: {"function":"cells_to_html","file":"unstructured_inference/models/tables.py","speedup_pct":"8%","speedup_x":"0.08x","original_runtime":"14.0 milliseconds","best_runtime":"13.0 milliseconds","optimization_type":"loop","timestamp":"2025-08-27T02:04:29.140Z","version":"1.0"} --> ### 📄 8% (0.08x) speedup for ***`cells_to_html` in `unstructured_inference/models/tables.py`*** ⏱️ Runtime : **`14.0 milliseconds`** **→** **`13.0 milliseconds`** (best of `193` runs) ### 📝 Explanation and details The optimized code achieves a 7% speedup through three key optimizations in the `fill_cells` function: **1. Replaced NumPy with native Python data structures:** - Removed `np.zeros()` for creating a boolean grid and `np.where()` for finding empty cells - Used a Python `set()` to track filled positions with `filled.add((row, col))` instead of `filled[row, col] = True` - This eliminates NumPy import overhead and array allocation costs, while providing O(1) membership checks **2. Optimized header row detection:** - Replaced set comprehension `{row for cell in cells if cell["column header"] for row in cell["row_nums"]}` with explicit loop and `set.update()` - This avoids creating intermediate iterables and reduces function call overhead **3. 
Direct iteration instead of NumPy indexing:** - Replaced `zip(not_filled_idx[0], not_filled_idx[1])` with nested `for row in range()` loops - This eliminates array indexing operations and provides cleaner iteration The optimizations are particularly effective for **small to medium tables** (as shown in test results where single cells see 40-56% speedup) because: - NumPy has fixed overhead that's not justified for small boolean grids - Set operations are highly optimized in Python for sparse data patterns - Direct loops avoid intermediate array allocations For **large dense tables** (20x20), the performance is roughly equivalent, showing the optimizations don't hurt scalability while providing significant gains for typical table sizes. ✅ **Correctness verification report:** | Test | Status | | --------------------------- | ----------------- | | ⚙️ Existing Unit Tests | ✅ **31 Passed** | | 🌀 Generated Regression Tests | ✅ **38 Passed** | | ⏪ Replay Tests | ✅ **8 Passed** | | 🔎 Concolic Coverage Tests | 🔘 **None Found** | |📊 Tests Coverage | 100.0% | <details> <summary>⚙️ Existing Unit Tests and Runtime</summary> | Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Speedup | |:--------------------------------------------|:--------------|:---------------|:----------| | `models/test_tables.py::test_cells_to_html` | 479μs | 374μs | 28.0%✅ | </details> <details> <summary>🌀 Generated Regression Tests and Runtime</summary> ```python import xml.etree.ElementTree as ET from typing import List import numpy as np # imports import pytest # used for our unit tests from unstructured_inference.models.tables import cells_to_html # BASIC TEST CASES def test_empty_cells_returns_empty_table(): # Test with empty input, should return an empty table codeflash_output = cells_to_html([]); html = codeflash_output # 26.9μs -> 26.8μs (0.399% faster) def test_single_cell_table(): # Test a table with one cell cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "A", "column header": 
False} ] codeflash_output = cells_to_html(cells); html = codeflash_output # 63.1μs -> 40.4μs (56.1% faster) def test_single_header_cell_table(): # Table with one header cell cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "Header", "column header": True} ] codeflash_output = cells_to_html(cells); html = codeflash_output # 64.7μs -> 43.3μs (49.3% faster) def test_simple_2x2_table(): # Table with 2 rows and 2 columns, no headers cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "A", "column header": False}, {"row_nums": [0], "column_nums": [1], "cell text": "B", "column header": False}, {"row_nums": [1], "column_nums": [0], "cell text": "C", "column header": False}, {"row_nums": [1], "column_nums": [1], "cell text": "D", "column header": False}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 76.3μs -> 54.2μs (40.6% faster) def test_simple_2x2_table_with_header(): # Table with 2x2, first row is header cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "H1", "column header": True}, {"row_nums": [0], "column_nums": [1], "cell text": "H2", "column header": True}, {"row_nums": [1], "column_nums": [0], "cell text": "C1", "column header": False}, {"row_nums": [1], "column_nums": [1], "cell text": "C2", "column header": False}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 77.8μs -> 56.9μs (36.9% faster) def test_colspan_and_rowspan(): # Table with a cell spanning 2 columns and 2 rows cells = [ {"row_nums": [0,1], "column_nums": [0,1], "cell text": "Span", "column header": True} ] codeflash_output = cells_to_html(cells); html = codeflash_output # 71.0μs -> 49.6μs (43.1% faster) def test_mixed_colspan_rowspan_table(): # Table with mixed spans cells = [ {"row_nums": [0], "column_nums": [0,1], "cell text": "Header", "column header": True}, {"row_nums": [1], "column_nums": [0], "cell text": "A", "column header": False}, {"row_nums": [1], "column_nums": [1], "cell text": "B", "column header": False}, ] 
codeflash_output = cells_to_html(cells); html = codeflash_output # 79.1μs -> 58.0μs (36.4% faster) def test_fill_cells_adds_missing_cells(): # Table with missing cell, should fill with empty cell cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "A", "column header": False}, {"row_nums": [1], "column_nums": [1], "cell text": "B", "column header": False}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 81.6μs -> 55.5μs (47.1% faster) # EDGE TEST CASES def test_non_contiguous_rows_and_columns(): # Table with non-contiguous row/col indices (should fill gaps) cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "A", "column header": False}, {"row_nums": [2], "column_nums": [2], "cell text": "B", "column header": False}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 98.3μs -> 70.0μs (40.4% faster) def test_multiple_header_rows(): # Table with two header rows cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "H1", "column header": True}, {"row_nums": [1], "column_nums": [0], "cell text": "H2", "column header": True}, {"row_nums": [2], "column_nums": [0], "cell text": "A", "column header": False}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 77.6μs -> 55.9μs (38.8% faster) def test_cell_with_empty_text_and_header(): # Cell with empty text but marked as header cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "", "column header": True} ] codeflash_output = cells_to_html(cells); html = codeflash_output # 64.3μs -> 42.6μs (50.9% faster) def test_cell_with_empty_text_and_non_header(): # Cell with empty text, not header cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "", "column header": False} ] codeflash_output = cells_to_html(cells); html = codeflash_output # 61.4μs -> 39.6μs (54.8% faster) def test_cell_with_multiple_row_and_col_spans(): # Cell spanning multiple rows and columns cells = [ {"row_nums": [0,1,2], "column_nums": [0,1], "cell text": "Big", 
"column header": True} ] codeflash_output = cells_to_html(cells); html = codeflash_output # 71.6μs -> 50.5μs (41.9% faster) def test_cells_with_overlapping_spans(): # Overlapping spans should be handled by fill_cells cells = [ {"row_nums": [0,1], "column_nums": [0], "cell text": "A", "column header": True}, {"row_nums": [1], "column_nums": [0,1], "cell text": "B", "column header": False} ] codeflash_output = cells_to_html(cells); html = codeflash_output # 86.3μs -> 61.2μs (40.9% faster) def test_cells_with_nonzero_start_indices(): # Table where rows/cols start at nonzero indices cells = [ {"row_nums": [2], "column_nums": [3], "cell text": "X", "column header": False} ] codeflash_output = cells_to_html(cells); html = codeflash_output # 105μs -> 76.7μs (37.1% faster) # LARGE SCALE TEST CASES def test_large_table_20x20(): # Table with 20x20 cells, all filled cells = [ {"row_nums": [i], "column_nums": [j], "cell text": f"R{i}C{j}", "column header": False} for i in range(20) for j in range(20) ] codeflash_output = cells_to_html(cells); html = codeflash_output # 1.19ms -> 1.20ms (0.995% slower) def test_large_header_table_10x10(): # Table with 10x10 cells, first row is header cells = [ {"row_nums": [0], "column_nums": [j], "cell text": f"H{j}", "column header": True} for j in range(10) ] + [ {"row_nums": [i], "column_nums": [j], "cell text": f"R{i}C{j}", "column header": False} for i in range(1,10) for j in range(10) ] codeflash_output = cells_to_html(cells); html = codeflash_output # 347μs -> 336μs (3.16% faster) def test_sparse_large_table(): # Table with 20x20, only diagonal filled cells = [ {"row_nums": [i], "column_nums": [i], "cell text": f"D{i}", "column header": False} for i in range(20) ] codeflash_output = cells_to_html(cells); html = codeflash_output # 1.20ms -> 1.10ms (9.04% faster) for i in range(20): pass def test_large_table_with_spans(): # Table with 10x10, first cell spans 10 columns cells = [ {"row_nums": [0], "column_nums": list(range(10)), "cell 
text": "Header", "column header": True} ] + [ {"row_nums": [i], "column_nums": [j], "cell text": f"R{i}C{j}", "column header": False} for i in range(1,10) for j in range(10) ] codeflash_output = cells_to_html(cells); html = codeflash_output # 329μs -> 316μs (4.06% faster) def test_large_table_with_missing_cells(): # Table with 30x30, only first row filled, rest empty cells = [ {"row_nums": [0], "column_nums": [j], "cell text": f"H{j}", "column header": True} for j in range(30) ] codeflash_output = cells_to_html(cells); html = codeflash_output # 149μs -> 130μs (15.1% faster) # codeflash_output is used to check that the output of the original code is the same as that of the optimized code. #------------------------------------------------ import xml.etree.ElementTree as ET from typing import List # function to test import numpy as np # imports import pytest # used for our unit tests from unstructured_inference.models.tables import cells_to_html ########## # BASIC TEST CASES ########## def test_single_cell_no_header(): # 1x1 table, no header cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "A", "column header": False} ] codeflash_output = cells_to_html(cells); html = codeflash_output # 62.5μs -> 40.8μs (53.0% faster) def test_single_cell_with_header(): # 1x1 table, header cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "Header", "column header": True} ] codeflash_output = cells_to_html(cells); html = codeflash_output # 64.3μs -> 43.0μs (49.4% faster) def test_simple_2x2_table_no_header(): # 2x2 table, no header cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "A", "column header": False}, {"row_nums": [0], "column_nums": [1], "cell text": "B", "column header": False}, {"row_nums": [1], "column_nums": [0], "cell text": "C", "column header": False}, {"row_nums": [1], "column_nums": [1], "cell text": "D", "column header": False}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 76.3μs -> 55.0μs (38.9% faster) def 
test_simple_2x2_table_with_header(): # 2x2 table, first row is header cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "H1", "column header": True}, {"row_nums": [0], "column_nums": [1], "cell text": "H2", "column header": True}, {"row_nums": [1], "column_nums": [0], "cell text": "C1", "column header": False}, {"row_nums": [1], "column_nums": [1], "cell text": "C2", "column header": False}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 77.8μs -> 57.5μs (35.4% faster) def test_cell_with_colspan_and_rowspan(): # Cell spans two columns and two rows cells = [ {"row_nums": [0, 1], "column_nums": [0, 1], "cell text": "Span", "column header": True}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 70.7μs -> 49.4μs (43.0% faster) def test_mixed_rowspan_and_colspan(): # Table with mixed spans cells = [ {"row_nums": [0], "column_nums": [0, 1], "cell text": "H", "column header": True}, {"row_nums": [1, 2], "column_nums": [0], "cell text": "V", "column header": False}, {"row_nums": [1], "column_nums": [1], "cell text": "C", "column header": False}, {"row_nums": [2], "column_nums": [1], "cell text": "D", "column header": False}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 85.6μs -> 65.6μs (30.5% faster) ########## # EDGE TEST CASES ########## def test_empty_cells_list(): # No cells, should return empty table codeflash_output = cells_to_html([]); html = codeflash_output # 26.2μs -> 26.0μs (0.561% faster) def test_missing_cells_filled_with_empty(): # Only some cells provided, fill_cells should add empty ones cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "A", "column header": False}, # missing [0,1], [1,0], [1,1] ] codeflash_output = cells_to_html(cells); html = codeflash_output # 62.3μs -> 40.5μs (54.0% faster) def test_header_row_with_missing_cells(): # Header row incomplete, should fill empty header cell cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "H1", "column 
header": True}, # missing [0,1] header cell {"row_nums": [1], "column_nums": [0], "cell text": "C1", "column header": False}, {"row_nums": [1], "column_nums": [1], "cell text": "C2", "column header": False}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 83.4μs -> 57.8μs (44.1% faster) def test_multiple_header_rows(): # Multiple header rows cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "H1", "column header": True}, {"row_nums": [1], "column_nums": [0], "cell text": "H2", "column header": True}, {"row_nums": [2], "column_nums": [0], "cell text": "C1", "column header": False}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 76.9μs -> 55.5μs (38.6% faster) def test_non_sequential_row_and_col_indices(): # Cells with non-sequential row/col indices cells = [ {"row_nums": [2], "column_nums": [2], "cell text": "X", "column header": False} ] codeflash_output = cells_to_html(cells); html = codeflash_output # 96.7μs -> 69.4μs (39.2% faster) def test_empty_cell_text_and_whitespace(): # Cell with empty string and whitespace cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "", "column header": False}, {"row_nums": [0], "column_nums": [1], "cell text": " ", "column header": False}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 67.6μs -> 45.3μs (49.3% faster) def test_cells_with_duplicate_positions(): # Multiple cells at same position (should be sorted and all present) cells = [ {"row_nums": [0], "column_nums": [0], "cell text": "A", "column header": False}, {"row_nums": [0], "column_nums": [0], "cell text": "B", "column header": False}, ] codeflash_output = cells_to_html(cells); html = codeflash_output # 66.9μs -> 44.9μs (49.1% faster) ########## # LARGE SCALE TEST CASES ########## def test_large_table_20x20(): # 20x20 table, no header cells = [] for r in range(20): for c in range(20): cells.append({"row_nums": [r], "column_nums": [c], "cell text": f"{r},{c}", "column header": False}) 
codeflash_output = cells_to_html(cells); html = codeflash_output # 1.23ms -> 1.24ms (0.982% slower) def test_large_table_with_header_10x10(): # 10x10 table, first row is header cells = [] for c in range(10): cells.append({"row_nums": [0], "column_nums": [c], "cell text": f"H{c}", "column header": True}) for r in range(1, 10): for c in range(10): cells.append({"row_nums": [r], "column_nums": [c], "cell text": f"{r},{c}", "column header": False}) codeflash_output = cells_to_html(cells); html = codeflash_output # 347μs -> 335μs (3.54% faster) def test_large_table_with_spans(): # 10x10 table, every cell in first row spans 2 columns cells = [] for c in range(0, 10, 2): cells.append({"row_nums": [0], "column_nums": [c, c+1], "cell text": f"H{c}", "column header": True}) for r in range(1, 10): for c in range(10): cells.append({"row_nums": [r], "column_nums": [c], "cell text": f"{r},{c}", "column header": False}) codeflash_output = cells_to_html(cells); html = codeflash_output # 344μs -> 331μs (3.73% faster) def test_large_table_with_missing_cells(): # 30x30 table, only diagonal cells provided cells = [] for i in range(30): cells.append({"row_nums": [i], "column_nums": [i], "cell text": f"{i}", "column header": False}) codeflash_output = cells_to_html(cells); html = codeflash_output # 2.61ms -> 2.44ms (6.72% faster) for i in range(30): pass def test_performance_large_table(monkeypatch): # Performance: 50x20 table, time should be reasonable import time cells = [] for r in range(50): for c in range(20): cells.append({"row_nums": [r], "column_nums": [c], "cell text": f"{r},{c}", "column header": False}) start = time.time() codeflash_output = cells_to_html(cells); html = codeflash_output # 2.88ms -> 2.97ms (3.02% slower) elapsed = time.time() - start # codeflash_output is used to check that the output of the original code is the same as that of the optimized code. 
``` </details> <details> <summary>⏪ Replay Tests and Runtime</summary> | Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Speedup | |:---------------------------------------------------------------------------------------------------------------------|:--------------|:---------------|:----------| | `test_pytest_test_unstructured_inference__replay_test_0.py::test_unstructured_inference_models_tables_cells_to_html` | 829μs | 738μs | 12.3%✅ | </details> To edit these changes `git checkout codeflash/optimize-cells_to_html-metc0l2u` and push. [![Codeflash](https://img.shields.io/badge/Optimized%20with-Codeflash-yellow?style=flat&color=%23ffc428&logo=)](https://codeflash.ai) <!-- CURSOR_SUMMARY --> --- > [!NOTE] > Optimize table HTML generation by replacing NumPy grid logic with native sets/loops and minor sorting/header handling tweaks; update version and changelog. > > - **Tables HTML generation (`unstructured_inference/models/tables.py`)** > - **`fill_cells`**: Replace NumPy grid/where with native `set` tracking, explicit header row accumulation, and nested loops to append missing cells. > - **`cells_to_html`**: Precompute `cells_filled` and `cells_sorted`; adjust header detection/`thead` creation; iterate over sorted cells for row building. > - **Versioning** > - Bump `__version__` to `1.0.8-dev1` in `unstructured_inference/__version__.py`. > - Update `CHANGELOG.md` with enhancement note for optimized `cells_to_html`. > > <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 640b75c. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup> <!-- /CURSOR_SUMMARY --> --------- Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com> Co-authored-by: aseembits93 <[email protected]>
1 parent 0c9a78b commit 9383bac

File tree

3 files changed

+37
-26
lines changed

3 files changed

+37
-26
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
1-
## 1.0.8-dev0
1+
## 1.0.8-dev1
22

3+
* Enhancement: Optimized `cells_to_html` for an 8% speedup in some cases (codeflash)
34
* Enhancement: Optimized `outputs_to_objects` for an 88% speedup in some cases (codeflash)
45

56
## 1.0.7
unstructured_inference/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.8-dev0" # pragma: no cover
1+
__version__ = "1.0.8-dev1" # pragma: no cover

unstructured_inference/models/tables.py

Lines changed: 34 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -690,25 +690,32 @@ def fill_cells(cells: List[dict]) -> List[dict]:
690690
if not cells:
691691
return []
692692

693-
table_rows_no = max({row for cell in cells for row in cell["row_nums"]})
694-
table_cols_no = max({col for cell in cells for col in cell["column_nums"]})
695-
filled = np.zeros((table_rows_no + 1, table_cols_no + 1), dtype=bool)
693+
# Find max row and col indices
694+
max_row = max(row for cell in cells for row in cell["row_nums"])
695+
max_col = max(col for cell in cells for col in cell["column_nums"])
696+
filled = set()
696697
for cell in cells:
697698
for row in cell["row_nums"]:
698699
for col in cell["column_nums"]:
699-
filled[row, col] = True
700-
# add cells for which filled is false
701-
header_rows = {row for cell in cells if cell["column header"] for row in cell["row_nums"]}
700+
filled.add((row, col))
701+
header_rows = set()
702+
for cell in cells:
703+
if cell["column header"]:
704+
header_rows.update(cell["row_nums"])
705+
706+
# Compose output list directly for speed
702707
new_cells = cells.copy()
703-
not_filled_idx = np.where(filled == False) # noqa: E712
704-
for row, col in zip(not_filled_idx[0], not_filled_idx[1]):
705-
new_cell = {
706-
"row_nums": [row],
707-
"column_nums": [col],
708-
"cell text": "",
709-
"column header": row in header_rows,
710-
}
711-
new_cells.append(new_cell)
708+
for row in range(max_row + 1):
709+
for col in range(max_col + 1):
710+
if (row, col) not in filled:
711+
new_cells.append(
712+
{
713+
"row_nums": [row],
714+
"column_nums": [col],
715+
"cell text": "",
716+
"column header": row in header_rows,
717+
}
718+
)
712719
return new_cells
713720

714721

@@ -727,18 +734,20 @@ def cells_to_html(cells: List[dict]) -> str:
727734
Returns:
728735
str: HTML table string
729736
"""
730-
cells = sorted(fill_cells(cells), key=lambda k: (min(k["row_nums"]), min(k["column_nums"])))
737+
# Pre-sort with tuple key, as per original
738+
cells_filled = fill_cells(cells)
739+
cells_sorted = sorted(cells_filled, key=lambda k: (min(k["row_nums"]), min(k["column_nums"])))
731740

732741
table = ET.Element("table")
733742
current_row = -1
734743

735-
table_header = None
736-
table_has_header = any(cell["column header"] for cell in cells)
737-
if table_has_header:
738-
table_header = ET.SubElement(table, "thead")
739-
744+
# Check if any column header exists
745+
table_has_header = any(cell["column header"] for cell in cells_sorted)
746+
table_header = ET.SubElement(table, "thead") if table_has_header else None
740747
table_body = ET.SubElement(table, "tbody")
741-
for cell in cells:
748+
749+
row = None
750+
for cell in cells_sorted:
742751
this_row = min(cell["row_nums"])
743752
attrib = {}
744753
colspan = len(cell["column_nums"])
@@ -756,8 +765,9 @@ def cells_to_html(cells: List[dict]) -> str:
756765
table_subelement = table_body
757766
cell_tag = "td"
758767
row = ET.SubElement(table_subelement, "tr") # type: ignore
759-
tcell = ET.SubElement(row, cell_tag, attrib=attrib)
760-
tcell.text = cell["cell text"]
768+
if row is not None:
769+
tcell = ET.SubElement(row, cell_tag, attrib=attrib)
770+
tcell.text = cell["cell text"]
761771

762772
return str(ET.tostring(table, encoding="unicode", short_empty_elements=False))
763773

0 commit comments

Comments
 (0)