|
4 | 4 | # |
5 | 5 |
|
6 | 6 | """Define common models across CCS objects.""" |
7 | | -from typing import Annotated, Literal, Optional, Union |
| 7 | +from typing import Annotated, List, Literal, Optional, Union |
8 | 8 |
|
| 9 | +import pandas as pd |
9 | 10 | from pydantic import BaseModel, Field, PositiveInt, StrictStr |
10 | 11 |
|
11 | 12 | from docling_core.search.mapping import es_field |
@@ -152,6 +153,102 @@ class Table(BaseCell): |
152 | 153 | data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None |
153 | 154 | model: Optional[str] = None |
154 | 155 |
|
def _get_tablecell_span(self, cell: TableCell, ix: int):
    """Summarise the extent of ``cell`` along one position of its spans.

    Args:
        cell: Cell whose ``spans`` entries are read.
        ix: Position inside each span entry to collect
            (``export_to_html`` passes 0 for rows and 1 for columns).

    Returns:
        A ``(count, first, last)`` tuple holding the number of distinct
        values found at ``ix``, the smallest and the largest of them.
        When the cell has no spans (``None`` or empty) the result is
        ``(1, None, None)``.
    """
    # Distinct values only: duplicates across span entries count once.
    indices = set() if cell.spans is None else {entry[ix] for entry in cell.spans}
    if not indices:
        return 1, None, None
    return len(indices), min(indices), max(indices)
| 164 | + |
def export_to_dataframe(self) -> pd.DataFrame:
    """Build a Pandas DataFrame from the table cells.

    Leading rows that contain at least one ``col_header`` cell are treated
    as header rows; their texts are joined per column with ``.`` to form
    the column names. All remaining rows become the DataFrame body.

    Returns:
        The table as a DataFrame. An empty DataFrame is returned when the
        table has no data, no rows or no columns.

    Raises:
        RuntimeError: If an empty row is met while scanning for headers.
    """
    has_cells = (
        self.data is not None and self.num_rows != 0 and self.num_cols != 0
    )
    if not has_cells:
        return pd.DataFrame()

    # Scan from the top and stop at the first row without a column header.
    header_rows = 0
    for row in self.data:
        if not row:
            raise RuntimeError(f"Invalid table. {len(row)=} but {self.num_cols=}.")
        if not any(cell.obj_type == "col_header" for cell in row):
            break
        header_rows += 1

    # Stack the header texts of each column, separated by dots.
    columns: Optional[List[str]] = None
    if header_rows > 0:
        columns = [""] * self.num_cols
        for header_row in self.data[:header_rows]:
            for col_ix, cell in enumerate(header_row):
                piece = f".{cell.text}" if columns[col_ix] != "" else cell.text
                columns[col_ix] += piece

    # Everything below the headers is plain cell text.
    body_rows = []
    for row in self.data[header_rows:]:
        body_rows.append([cell.text for cell in row])

    return pd.DataFrame(body_rows, columns=columns)
| 205 | + |
def export_to_html(self) -> str:
    """Export the table as an HTML ``<table>`` string.

    Cells labelled ``col_header`` or ``col_multi_header`` are rendered as
    ``<th>``; every other cell is rendered as ``<td>``. ``rowspan`` and
    ``colspan`` attributes are added when a cell covers more than one row
    or column. Cell text is stripped and HTML-escaped, so characters such
    as ``<``, ``>`` and ``&`` cannot break or inject markup.

    Returns:
        The HTML markup, or an empty string when the table has no data.
    """
    # Function-scope import keeps this method self-contained.
    import html

    if self.data is None:
        return ""

    header_labels = ("col_header", "col_multi_header")
    rows_html = []
    for i in range(self.num_rows):
        cells_html = []
        for j in range(self.num_cols):
            cell = self.data[i][j]

            rowspan, rowstart, _ = self._get_tablecell_span(cell, 0)
            colspan, colstart, _ = self._get_tablecell_span(cell, 1)

            # Emit a spanning cell only at the first row/column of its
            # span; the other grid slots it covers are skipped.
            if rowstart is not None and rowstart != i:
                continue
            if colstart is not None and colstart != j:
                continue

            celltag = "th" if cell.obj_type in header_labels else "td"

            attrs = ""
            if rowspan > 1:
                attrs += f' rowspan="{rowspan}"'
            if colspan > 1:
                attrs += f' colspan="{colspan}"'

            # Escape after stripping so document text is never read as markup.
            content = html.escape(cell.text.strip())
            cells_html.append(f"<{celltag}{attrs}>{content}</{celltag}>")

        rows_html.append("<tr>" + "".join(cells_html) + "</tr>")

    return "<table>" + "".join(rows_html) + "</table>"
| 251 | + |
155 | 252 |
|
156 | 253 | # FIXME: let's add some figure specific data-types later |
157 | 254 | class Figure(BaseCell): |
|
0 commit comments