|
| 1 | +# |
| 2 | +# Copyright IBM Corp. 2024 - 2024 |
| 3 | +# SPDX-License-Identifier: MIT |
| 4 | +# |
| 5 | + |
| 6 | +"""Tokens used in the docling document model.""" |
| 7 | + |
| 8 | +from enum import Enum |
| 9 | +from typing import Annotated, Tuple |
| 10 | + |
| 11 | +from pydantic import Field |
| 12 | + |
| 13 | + |
class TableToken(Enum):
    """Class to represent an LLM friendly representation of a Table.

    The members fall into two groups: cell-label tokens (``CELL_LABEL_*``)
    and OTSL structure tokens (``OTSL_*``). Each member's value is the
    literal token string.
    """

    CELL_LABEL_COLUMN_HEADER = "<column_header>"
    CELL_LABEL_ROW_HEADER = "<row_header>"
    # NOTE: the trailing "E" is a historical typo. The name is kept as the
    # canonical member so existing callers (and ``.name`` lookups) keep
    # working; use the correctly spelled alias below in new code.
    CELL_LABEL_SECTION_HEADERE = "<section_header>"
    # Alias (same value) of CELL_LABEL_SECTION_HEADERE. Enum aliases are not
    # yielded when iterating the class, so the special-token list is unchanged.
    CELL_LABEL_SECTION_HEADER = "<section_header>"
    CELL_LABEL_DATA = "<data>"

    OTSL_ECEL = "<ecel>"  # empty cell
    OTSL_FCEL = "<fcel>"  # cell with content
    OTSL_LCEL = "<lcel>"  # left looking cell
    OTSL_UCEL = "<ucel>"  # up looking cell
    OTSL_XCEL = "<xcel>"  # 2d extension cell (cross cell)
    OTSL_NL = "<nl>"  # new line
    OTSL_CHED = "<ched>"  # column header cell
    OTSL_RHED = "<rhed>"  # row header cell
    OTSL_SROW = "<srow>"  # section row cell

    @classmethod
    def get_special_tokens(cls):
        """Return the token strings of all table tokens.

        Returns:
            A new list with the value of every (non-alias) member, in
            definition order.
        """
        return [token.value for token in cls]

    @staticmethod
    def is_known_token(label):
        """Check whether ``label`` is one of the table token strings.

        Args:
            label: The candidate token string, e.g. ``"<fcel>"``.

        Returns:
            True if ``label`` equals the value of a member, False otherwise.
        """
        return label in TableToken.get_special_tokens()
| 42 | + |
| 43 | + |
class DocumentToken(Enum):
    """Class to represent an LLM friendly representation of a Document.

    The enum members are the fixed begin/end tag pairs. Numbered tokens
    (rows, columns, section-header levels, pages, locations) are not
    members; they are generated on demand by the helper methods below.
    """

    BEG_DOCUMENT = "<document>"
    END_DOCUMENT = "</document>"

    BEG_TITLE = "<title>"
    END_TITLE = "</title>"

    BEG_ABSTRACT = "<abstract>"
    END_ABSTRACT = "</abstract>"

    BEG_DOI = "<doi>"
    END_DOI = "</doi>"
    BEG_DATE = "<date>"
    END_DATE = "</date>"

    BEG_AUTHORS = "<authors>"
    END_AUTHORS = "</authors>"
    BEG_AUTHOR = "<author>"
    END_AUTHOR = "</author>"

    BEG_AFFILIATIONS = "<affiliations>"
    END_AFFILIATIONS = "</affiliations>"
    BEG_AFFILIATION = "<affiliation>"
    END_AFFILIATION = "</affiliation>"

    BEG_HEADER = "<section-header>"
    END_HEADER = "</section-header>"
    BEG_TEXT = "<text>"
    END_TEXT = "</text>"
    BEG_PARAGRAPH = "<paragraph>"
    END_PARAGRAPH = "</paragraph>"
    BEG_TABLE = "<table>"
    END_TABLE = "</table>"
    BEG_FIGURE = "<figure>"
    END_FIGURE = "</figure>"
    BEG_CAPTION = "<caption>"
    END_CAPTION = "</caption>"
    BEG_EQUATION = "<equation>"
    END_EQUATION = "</equation>"
    BEG_LIST = "<list>"
    END_LIST = "</list>"
    BEG_LISTITEM = "<list-item>"
    END_LISTITEM = "</list-item>"

    BEG_LOCATION = "<location>"
    END_LOCATION = "</location>"
    BEG_GROUP = "<group>"
    END_GROUP = "</group>"

    @classmethod
    def get_special_tokens(
        cls,
        max_rows: int = 100,
        max_cols: int = 100,
        max_pages: int = 1000,
        page_dimension: Tuple[int, int] = (100, 100),
    ):
        """Return all special document tokens, fixed and generated.

        The order of the returned list is: enum member values, row tokens,
        column tokens, section-header levels, subtitle levels, page tokens,
        location tokens.

        Args:
            max_rows: Highest row index (inclusive) to generate tokens for.
            max_cols: Highest column index (inclusive) to generate tokens for.
            max_pages: Highest page index (inclusive) to generate tokens for.
            page_dimension: Pair of sizes; location tokens are generated for
                ``0 .. max(page_dimension)`` inclusive.

        Returns:
            A new list of token strings.
        """
        special_tokens = [token.value for token in cls]

        # Adding dynamically generated row and col tokens
        for i in range(max_rows + 1):
            special_tokens += [f"<row_{i}>", f"</row_{i}>"]

        for i in range(max_cols + 1):
            special_tokens += [f"<col_{i}>", f"</col_{i}>"]

        for i in range(6):
            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]

        # FIXME: this is synonym of section header
        for i in range(6):
            special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]

        # Adding dynamically generated page-tokens
        for i in range(max_pages + 1):
            special_tokens += [f"<page_{i}>", f"</page_{i}>"]

        # Adding dynamically generated location-tokens; only opening tags
        # exist for locations.
        special_tokens += [f"<loc_{i}>" for i in range(max(page_dimension) + 1)]

        return special_tokens

    @staticmethod
    def is_known_token(label):
        """Check whether ``label`` is a known document token.

        The check is made against ``get_special_tokens()`` called with its
        default limits, so generated tokens beyond those defaults (e.g.
        ``"<row_101>"``) are reported as unknown. The token list is rebuilt
        on every call.

        Args:
            label: The candidate token string.

        Returns:
            True if ``label`` is in the default special-token list.
        """
        return label in DocumentToken.get_special_tokens()

    @staticmethod
    def get_row_token(row: int, beg: bool = True) -> str:
        """Return the begin or end token for a table row.

        Args:
            row: Row index to embed in the token.
            beg: If true (the default), return the opening token
                ``<row_N>``; otherwise the closing token ``</row_N>``.
        """
        return f"<row_{row}>" if beg else f"</row_{row}>"

    @staticmethod
    def get_col_token(col: int, beg: bool = True) -> str:
        """Return the begin or end token for a table column.

        Args:
            col: Column index to embed in the token.
            beg: If true (the default), return the opening token
                ``<col_N>``; otherwise the closing token ``</col_N>``.
        """
        return f"<col_{col}>" if beg else f"</col_{col}>"

    @staticmethod
    def get_page_token(page: int):
        """Return the opening page token ``<page_N>`` for ``page``."""
        return f"<page_{page}>"

    @staticmethod
    def get_location_token(val: float, rnorm: int = 100):
        """Return the location token for a normalized coordinate.

        ``val`` is scaled by ``rnorm`` and rounded with the builtin
        ``round``; the result is clamped to ``[0, rnorm]``.

        Args:
            val: Coordinate as a fraction of the page extent.
            rnorm: Number of bins; also the largest index that is returned.

        Returns:
            A token of the form ``<loc_N>``.
        """
        val_ = round(rnorm * val)

        # Clamp out-of-page coordinates to the nearest valid bin.
        if val_ < 0:
            return "<loc_0>"

        if val_ > rnorm:
            return f"<loc_{rnorm}>"

        return f"<loc_{val_}>"

    @staticmethod
    def get_location(
        # bbox: Tuple[float, float, float, float],
        bbox: Annotated[list[float], Field(min_length=4, max_length=4)],
        page_w: float,
        page_h: float,
        xsize: int = 100,
        ysize: int = 100,
        page_i: int = -1,
    ):
        """Get the location string given a bbox and the page dimensions.

        Args:
            bbox: Four coordinates, read as ``[x0, y0, x1, y1]`` with
                ``x0 <= x1`` and ``y0 <= y1``.
            page_w: Page width used to normalize the x coordinates.
            page_h: Page height used to normalize the y coordinates.
            xsize: Number of location bins along x.
            ysize: Number of location bins along y.
            page_i: Page index; when not ``-1`` a page token is emitted
                before the location tokens.

        Returns:
            ``<location>[<page_N>]<loc_x0><loc_y0><loc_x1><loc_y1></location>``.

        Raises:
            AssertionError: If the bbox coordinates are not ordered
                (note: asserts are skipped when Python runs with ``-O``).
            ZeroDivisionError: If ``page_w`` or ``page_h`` is zero.
        """
        assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
        assert bbox[1] <= bbox[3], f"bbox[1]<=bbox[3] => {bbox[1]}<={bbox[3]}"

        # Normalize the corners to fractions of the page extent.
        x0 = bbox[0] / page_w
        y0 = bbox[1] / page_h
        x1 = bbox[2] / page_w
        y1 = bbox[3] / page_h

        page_tok = ""
        if page_i != -1:
            page_tok = DocumentToken.get_page_token(page=page_i)

        # min/max keep the emitted order (low corner first) even if the
        # normalization flips the ordering, e.g. for a negative page size.
        x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
        y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
        x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
        y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)

        return "".join(
            (
                DocumentToken.BEG_LOCATION.value,
                page_tok,
                x0_tok,
                y0_tok,
                x1_tok,
                y1_tok,
                DocumentToken.END_LOCATION.value,
            )
        )
0 commit comments