Skip to content

Commit 2cc2429

Browse files
authored
feat: add table exporters (#20)
Signed-off-by: Michele Dolfi <[email protected]>
1 parent 1ed846c commit 2cc2429

File tree

1 file changed

+98
-1
lines changed

1 file changed

+98
-1
lines changed

docling_core/types/doc/base.py

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
#
55

66
"""Define common models across CCS objects."""
7-
from typing import Annotated, Literal, Optional, Union
7+
from typing import Annotated, List, Literal, Optional, Union
88

9+
import pandas as pd
910
from pydantic import BaseModel, Field, PositiveInt, StrictStr
1011

1112
from docling_core.search.mapping import es_field
@@ -152,6 +153,102 @@ class Table(BaseCell):
152153
data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
153154
model: Optional[str] = None
154155

156+
def _get_tablecell_span(self, cell: TableCell, ix: int):
    """Summarise the extent of a cell along one component of its spans.

    Args:
        cell: Cell whose ``spans`` attribute is inspected.
        ix: Position read from every entry of ``cell.spans``; the HTML
            exporter in this class passes 0 for rows and 1 for columns.

    Returns:
        A ``(size, first, last)`` tuple: the number of distinct values
        found at position ``ix``, followed by the smallest and largest of
        them. When ``cell.spans`` is ``None`` or empty the result is
        ``(1, None, None)``.
    """
    indices = set() if cell.spans is None else {entry[ix] for entry in cell.spans}
    if not indices:
        # No span information: the cell occupies a single slot.
        return 1, None, None
    return len(indices), min(indices), max(indices)
164+
165+
def export_to_dataframe(self) -> pd.DataFrame:
    """Export the table as a Pandas DataFrame.

    Leading rows that contain at least one ``"col_header"`` cell are folded
    into the column names (texts of stacked header rows are joined with
    a dot); all remaining rows become the frame's data.

    Returns:
        The table as a DataFrame of cell texts. An empty DataFrame is
        returned when there is no data or the table has zero rows/columns.

    Raises:
        RuntimeError: If an empty row is met while scanning for headers.
    """
    if self.data is None or self.num_rows == 0 or self.num_cols == 0:
        return pd.DataFrame()

    # Count the leading header rows; scanning stops at the first row
    # that has no column-header cell.
    num_headers = 0
    for row in self.data:
        if not row:
            raise RuntimeError(f"Invalid table. {len(row)=} but {self.num_cols=}.")
        if not any(cell.obj_type == "col_header" for cell in row):
            break
        num_headers += 1

    # Build one name per column by chaining the header texts with ".".
    column_names: Optional[List[str]] = None
    if num_headers > 0:
        column_names = [""] * self.num_cols
        for header_row in self.data[:num_headers]:
            for j, cell in enumerate(header_row):
                column_names[j] += f".{cell.text}" if column_names[j] else cell.text

    # Everything below the header rows is plain data.
    body_rows = [[cell.text for cell in row] for row in self.data[num_headers:]]

    return pd.DataFrame(body_rows, columns=column_names)
205+
206+
def export_to_html(self) -> str:
    """Export the table as an HTML ``<table>`` string.

    Cells labelled ``"col_header"`` or ``"col_multi_header"`` are rendered
    as ``<th>``; every other cell is rendered as ``<td>``. ``rowspan`` and
    ``colspan`` attributes are emitted when a cell covers more than one
    row or column. Cell text is stripped and HTML-escaped so that
    characters such as ``<``, ``>`` and ``&`` cannot break the markup.

    Returns:
        The HTML markup, or an empty string when the table has no data.
    """
    import html  # function-scope import: the module does not import it

    if self.data is None:
        return ""

    header_labels = ("col_header", "col_multi_header")
    rendered_rows = []
    for i in range(self.num_rows):
        rendered_cells = []
        for j in range(self.num_cols):
            cell: TableCell = self.data[i][j]

            rowspan, rowstart, _ = self._get_tablecell_span(cell, 0)
            colspan, colstart, _ = self._get_tablecell_span(cell, 1)

            # Emit a cell only at the first row/column of its span.
            # NOTE(review): presumably a spanning cell is stored at every
            # grid position it covers - confirm against the data model.
            if rowstart is not None and rowstart != i:
                continue
            if colstart is not None and colstart != j:
                continue

            content = html.escape(cell.text.strip())
            celltag = "th" if cell.obj_type in header_labels else "td"

            attributes = ""
            if rowspan > 1:
                attributes += f' rowspan="{rowspan}"'
            if colspan > 1:
                attributes += f' colspan="{colspan}"'

            rendered_cells.append(f"<{celltag}{attributes}>{content}</{celltag}>")

        rendered_rows.append("<tr>" + "".join(rendered_cells) + "</tr>")

    return "<table>" + "".join(rendered_rows) + "</table>"
251+
155252

156253
# FIXME: let's add some figure specific data-types later
157254
class Figure(BaseCell):

0 commit comments

Comments
 (0)