Skip to content

Commit 3b99879

Browse files
feat: add improved table serializer and visualizer (#328)
* feat: add improved table serializer and visualizer Signed-off-by: Peter Staar <[email protected]> * visualizer for rows and columns Signed-off-by: Peter Staar <[email protected]> --------- Signed-off-by: Peter Staar <[email protected]>
1 parent 375fdfb commit 3b99879

File tree

9 files changed

+250
-16
lines changed

9 files changed

+250
-16
lines changed

.flake8

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ max-line-length = 120
44
exclude = test/*
55
max-complexity = 25
66
docstring-convention = google
7-
ignore = W503,E203
7+
ignore = W503,E203,E741
88
classmethod-decorators = classmethod,validator

docling_core/transforms/serializer/html.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,7 @@ def serialize(
340340

341341
content = html.escape(cell.text.strip())
342342
celltag = "td"
343-
if cell.column_header:
343+
if cell.column_header or cell.row_header or cell.row_section:
344344
celltag = "th"
345345

346346
opening_tag = f"{celltag}"

docling_core/transforms/visualizer/table_visualizer.py

Lines changed: 109 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,23 @@ class Params(BaseModel):
2323

2424
# show_Label: bool = False
2525
show_cells: bool = True
26-
# show_rows: bool = False
27-
# show_cols: bool = False
26+
show_rows: bool = False
27+
show_cols: bool = False
28+
29+
cell_color: tuple[int, int, int, int] = (256, 0, 0, 32)
30+
cell_outline: tuple[int, int, int, int] = (256, 0, 0, 128)
31+
32+
row_color: tuple[int, int, int, int] = (256, 0, 0, 32)
33+
row_outline: tuple[int, int, int, int] = (256, 0, 0, 128)
34+
35+
row_header_color: tuple[int, int, int, int] = (0, 256, 0, 32)
36+
row_header_outline: tuple[int, int, int, int] = (0, 256, 0, 128)
37+
38+
col_color: tuple[int, int, int, int] = (0, 256, 0, 32)
39+
col_outline: tuple[int, int, int, int] = (0, 256, 0, 128)
40+
41+
col_header_color: tuple[int, int, int, int] = (0, 0, 256, 32)
42+
col_header_outline: tuple[int, int, int, int] = (0, 0, 256, 128)
2843

2944
base_visualizer: Optional[BaseVisualizer] = None
3045
params: Params = Params()
@@ -45,7 +60,21 @@ def _draw_table_cells(
4560

4661
tl_bbox = cell.bbox.to_top_left_origin(page_height=page_height)
4762

48-
cell_color = (256, 0, 0, 32) # Transparent black for cells
63+
cell_color = self.params.cell_color # Transparent black for cells
64+
cell_outline = self.params.cell_outline
65+
if cell.column_header:
66+
cell_color = (
67+
self.params.col_header_color
68+
) # Transparent black for cells
69+
cell_outline = self.params.col_header_outline
70+
if cell.row_header:
71+
cell_color = (
72+
self.params.row_header_color
73+
) # Transparent black for cells
74+
cell_outline = self.params.row_header_outline
75+
if cell.row_section:
76+
cell_color = self.params.row_header_color
77+
cell_outline = self.params.row_header_outline
4978

5079
cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
5180
cx0 *= scale_x
@@ -55,10 +84,68 @@ def _draw_table_cells(
5584

5685
draw.rectangle(
5786
[(cx0, cy0), (cx1, cy1)],
58-
outline=(256, 0, 0, 128),
87+
outline=cell_outline,
5988
fill=cell_color,
6089
)
6190

91+
def _draw_table_rows(
    self,
    table: TableItem,
    page_image: Image,
    page_height: float,
    scale_x: float,
    scale_y: float,
):
    """Draw one rectangle per table row onto the page image.

    Args:
        table: Table whose ``data.get_row_bounding_boxes()`` supplies the row boxes.
        page_image: Image that is drawn on in place (via an RGBA ``ImageDraw``).
        page_height: Page height handed to ``to_top_left_origin`` for each box.
        scale_x: Factor applied to the horizontal coordinates of each box.
        scale_y: Factor applied to the vertical coordinates of each box.
    """
    draw = ImageDraw.Draw(page_image, "RGBA")

    # Only the boxes are needed for drawing; the row indices (dict keys) are not.
    for bbox in table.data.get_row_bounding_boxes().values():
        tl_bbox = bbox.to_top_left_origin(page_height=page_height)

        x0, y0, x1, y1 = tl_bbox.as_tuple()
        draw.rectangle(
            [(x0 * scale_x, y0 * scale_y), (x1 * scale_x, y1 * scale_y)],
            outline=self.params.row_outline,
            fill=self.params.row_color,
        )
119+
120+
def _draw_table_cols(
    self,
    table: TableItem,
    page_image: Image,
    page_height: float,
    scale_x: float,
    scale_y: float,
):
    """Draw one rectangle per table column onto the page image.

    Args:
        table: Table whose ``data.get_column_bounding_boxes()`` supplies the
            column boxes.
        page_image: Image that is drawn on in place (via an RGBA ``ImageDraw``).
        page_height: Page height handed to ``to_top_left_origin`` for each box.
        scale_x: Factor applied to the horizontal coordinates of each box.
        scale_y: Factor applied to the vertical coordinates of each box.
    """
    draw = ImageDraw.Draw(page_image, "RGBA")

    # Only the boxes are needed for drawing; the column indices (dict keys) are not.
    for bbox in table.data.get_column_bounding_boxes().values():
        tl_bbox = bbox.to_top_left_origin(page_height=page_height)

        x0, y0, x1, y1 = tl_bbox.as_tuple()
        draw.rectangle(
            [(x0 * scale_x, y0 * scale_y), (x1 * scale_x, y1 * scale_y)],
            outline=self.params.col_outline,
            fill=self.params.col_color,
        )
148+
62149
def _draw_doc_tables(
63150
self,
64151
doc: DoclingDocument,
@@ -108,6 +195,24 @@ def _draw_doc_tables(
108195
scale_y=image.height / doc.pages[page_nr].size.height,
109196
)
110197

198+
if self.params.show_rows:
199+
self._draw_table_rows(
200+
table=elem,
201+
page_height=doc.pages[page_nr].size.height,
202+
page_image=image,
203+
scale_x=image.width / doc.pages[page_nr].size.width,
204+
scale_y=image.height / doc.pages[page_nr].size.height,
205+
)
206+
207+
if self.params.show_cols:
208+
self._draw_table_cols(
209+
table=elem,
210+
page_height=doc.pages[page_nr].size.height,
211+
page_image=image,
212+
scale_x=image.width / doc.pages[page_nr].size.width,
213+
scale_y=image.height / doc.pages[page_nr].size.height,
214+
)
215+
111216
else:
112217
raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
113218

docling_core/types/doc/document.py

Lines changed: 114 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
from docling_core.search.package import VERSION_PATTERN
3939
from docling_core.types.base import _JSON_POINTER_REGEX
4040
from docling_core.types.doc import BoundingBox, Size
41-
from docling_core.types.doc.base import ImageRefMode
41+
from docling_core.types.doc.base import CoordOrigin, ImageRefMode
4242
from docling_core.types.doc.labels import (
4343
CodeLanguageLabel,
4444
DocItemLabel,
@@ -372,6 +372,119 @@ def grid(
372372

373373
return table_data
374374

375+
def get_row_bounding_boxes(self) -> dict[int, BoundingBox]:
    """Get a bounding box for each row in the table.

    For every row index, the box starts as the enclosing box of the cells with
    the smallest row span that cover the row. Its left/right edges are then
    widened to include every cell (of any span) that covers the row, so cells
    spanning several rows never inflate the vertical extent.

    Returns:
        dict[int, BoundingBox]: Mapping from row index to the row's bounding
        box. Rows in which no cell has a bounding box are omitted.

    Raises:
        ValueError: If the cell bounding boxes do not all share the same
            coordinate origin.
    """
    origins = {
        cell.bbox.coord_origin for cell in self.table_cells if cell.bbox is not None
    }
    if len(origins) > 1:
        # Adjacent literals instead of a backslash continuation, so source
        # indentation can never leak into the message.
        raise ValueError(
            "All bounding boxes must have the same "
            "CoordOrigin to compute their union."
        )

    row_bboxes: dict[int, BoundingBox] = {}

    for row_idx in range(self.num_rows):
        # Group the bounding boxes of all cells covering this row by row span.
        bboxes_by_span: dict[int, list[BoundingBox]] = {}
        for cell in self.table_cells:
            if (
                cell.bbox is not None
                and cell.start_row_offset_idx <= row_idx < cell.end_row_offset_idx
            ):
                row_span = cell.end_row_offset_idx - cell.start_row_offset_idx
                bboxes_by_span.setdefault(row_span, []).append(cell.bbox)

        if not bboxes_by_span:
            # No cell with a bounding box covers this row: leave it out.
            continue

        # The cells with the smallest span define the top/bottom of the row.
        # NOTE(review): the result is mutated below, which assumes
        # enclosing_bbox returns a fresh box rather than one of its inputs.
        row_bbox: BoundingBox = BoundingBox.enclosing_bbox(
            bboxes_by_span[min(bboxes_by_span)]
        )

        # Widen horizontally to every cell touching the row, whatever its span.
        for bboxes in bboxes_by_span.values():
            for bbox in bboxes:
                row_bbox.l = min(row_bbox.l, bbox.l)
                row_bbox.r = max(row_bbox.r, bbox.r)

        row_bboxes[row_idx] = row_bbox

    return row_bboxes
428+
429+
def get_column_bounding_boxes(self) -> dict[int, BoundingBox]:
    """Get a bounding box for each column in the table.

    For every column index, the box starts as the enclosing box of the cells
    with the smallest column span that cover the column. Its top/bottom edges
    are then extended to include every cell (of any span) that covers the
    column, so cells spanning several columns never inflate the horizontal
    extent.

    Returns:
        dict[int, BoundingBox]: Mapping from column index to the column's
        bounding box. Columns in which no cell has a bounding box are omitted.

    Raises:
        ValueError: If the cell bounding boxes do not all share the same
            coordinate origin.
    """
    origins = {
        cell.bbox.coord_origin for cell in self.table_cells if cell.bbox is not None
    }
    if len(origins) > 1:
        # Adjacent literals instead of a backslash continuation, so source
        # indentation can never leak into the message.
        raise ValueError(
            "All bounding boxes must have the same "
            "CoordOrigin to compute their union."
        )

    col_bboxes: dict[int, BoundingBox] = {}

    for col_idx in range(self.num_cols):
        # Group the bounding boxes of all cells covering this column by column span.
        bboxes_by_span: dict[int, list[BoundingBox]] = {}
        for cell in self.table_cells:
            if (
                cell.bbox is not None
                and cell.start_col_offset_idx <= col_idx < cell.end_col_offset_idx
            ):
                col_span = cell.end_col_offset_idx - cell.start_col_offset_idx
                bboxes_by_span.setdefault(col_span, []).append(cell.bbox)

        if not bboxes_by_span:
            # No cell with a bounding box covers this column: leave it out.
            continue

        # The cells with the smallest span define the left/right of the column.
        # NOTE(review): the result is mutated below, which assumes
        # enclosing_bbox returns a fresh box rather than one of its inputs.
        col_bbox: BoundingBox = BoundingBox.enclosing_bbox(
            bboxes_by_span[min(bboxes_by_span)]
        )

        # Extend vertically to every cell touching the column. Which of
        # min/max grows the box depends on the direction of the y axis.
        for bboxes in bboxes_by_span.values():
            for bbox in bboxes:
                if bbox.coord_origin == CoordOrigin.TOPLEFT:
                    col_bbox.b = max(col_bbox.b, bbox.b)
                    col_bbox.t = min(col_bbox.t, bbox.t)

                elif bbox.coord_origin == CoordOrigin.BOTTOMLEFT:
                    col_bbox.b = min(col_bbox.b, bbox.b)
                    col_bbox.t = max(col_bbox.t, bbox.t)

        col_bboxes[col_idx] = col_bbox

    return col_bboxes
487+
375488

376489
class PictureTabularChartData(PictureChartData):
377490
"""Base class for picture chart data.

0 commit comments

Comments
 (0)