Skip to content

Commit 2371c11

Browse files
authored
feat: add kv_item support for doctag to docling_document (#188)
* add kv_item support for doctag to docling_document
  Signed-off-by: Saidgurbuz <[email protected]>
* use resize_by_scale to save locations
  Signed-off-by: Saidgurbuz <[email protected]>
* add kv region to tag_to_doclabel
  Signed-off-by: Saidgurbuz <[email protected]>
* add test for doctags_load_for_kv_region
  Signed-off-by: Saidgurbuz <[email protected]>
* update the naming to .dt for consistency
  Signed-off-by: Saidgurbuz <[email protected]>
---------
Signed-off-by: Saidgurbuz <[email protected]>
1 parent 89fce70 commit 2371c11

13 files changed

+118
-6
lines changed

docling_core/types/doc/document.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3037,6 +3037,7 @@ def load_from_doctags( # noqa: C901
30373037
"list_item": DocItemLabel.LIST_ITEM,
30383038
"footnote": DocItemLabel.FOOTNOTE,
30393039
"code": DocItemLabel.CODE,
3040+
"key_value_region": DocItemLabel.KEY_VALUE_REGION,
30403041
}
30413042

30423043
def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
@@ -3228,6 +3229,95 @@ def parse_table_content(otsl_content: str) -> TableData:
32283229
table_cells=table_cells,
32293230
)
32303231

3232+
def parse_key_value_item(
    tokens: str, image: Optional[PILImage.Image] = None, page_no: int = 1
) -> Tuple[GraphData, Optional[ProvenanceItem]]:
    """Parse one ``<key_value_region>`` doctag chunk into graph data.

    Args:
        tokens: The doctag text of the region, including the
            ``<key_N>...</key_N>`` / ``<value_N>...</value_N>`` cells.
        image: Page image. Its width and height are passed to
            ``resize_by_scale`` for every bounding box. When it is
            ``None`` no bounding box is extracted, so the returned
            provenance and every cell's ``prov`` are ``None``.
        page_no: Page number stored in every created ``ProvenanceItem``.
            Defaults to 1, which is the value that used to be hard-coded.

    Returns:
        A tuple ``(graph, region_prov)`` where ``graph`` holds the parsed
        cells and the validated key/value links, and ``region_prov`` is
        the provenance of the whole region (or ``None`` if unavailable).
    """
    if image is not None:
        pg_width = image.width
        pg_height = image.height
    else:
        pg_width = 1
        pg_height = 1
    has_image = image is not None

    # The region's own <loc_..> tokens sit between the opening tag and the
    # first cell. Stop at the first key *or* value cell (and allow
    # newlines) so that loc tokens belonging to cells are never captured.
    overall_prov: Optional[ProvenanceItem] = None
    start_locs_match = re.search(
        r"<key_value_region>(.*?)<(?:key|value)_\d+>", tokens, re.DOTALL
    )
    if start_locs_match and has_image:
        overall_bbox = extract_bounding_box(start_locs_match.group(1))
        if overall_bbox is not None:
            overall_prov = ProvenanceItem(
                bbox=overall_bbox.resize_by_scale(pg_width, pg_height),
                charspan=(0, 0),
                page_no=page_no,
            )

    # here we assumed the labels as only key or value, later on we can update
    # it to have unspecified, checkbox etc.
    cell_pattern = re.compile(
        r"<(?P<label>key|value)_(?P<id>\d+)>"
        r"(?P<content>.*?)"
        r"</(?P=label)_(?P=id)>",
        re.DOTALL,
    )

    cells: List["GraphCell"] = []
    links: List["GraphLink"] = []
    # (source_cell_id, target_cell_id) pairs exactly as predicted; they are
    # validated only after every cell has been seen, because a link may
    # point to a cell that appears later in the token stream.
    raw_link_predictions: List[Tuple[int, int]] = []

    for cell_match in cell_pattern.finditer(tokens):
        cell_label_str = cell_match.group("label")  # "key" or "value"
        cell_id = int(cell_match.group("id"))
        raw_content = cell_match.group("content")

        # link tokens
        link_matches = re.findall(r"<link_(\d+)>", raw_content)

        cell_bbox = extract_bounding_box(raw_content) if has_image else None
        cell_prov = None
        if cell_bbox is not None:
            cell_prov = ProvenanceItem(
                bbox=cell_bbox.resize_by_scale(pg_width, pg_height),
                charspan=(0, 0),
                page_no=page_no,
            )

        # Strip the location and link tokens; what remains is the cell text.
        cleaned_text = re.sub(r"<loc_\d+>", "", raw_content)
        cleaned_text = re.sub(r"<link_\d+>", "", cleaned_text).strip()

        cells.append(
            GraphCell(
                label=GraphCellLabel(cell_label_str),
                cell_id=cell_id,
                text=cleaned_text,
                orig=cleaned_text,
                prov=cell_prov,
                item_ref=None,
            )
        )

        raw_link_predictions.extend(
            (cell_id, int(target_str)) for target_str in link_matches
        )

    # Built once, after the loop: the set of ids that actually exist.
    cell_ids = {cell.cell_id for cell in cells}

    for source_id, target_id in raw_link_predictions:
        # basic check to validate the prediction
        if target_id not in cell_ids:
            continue
        links.append(
            GraphLink(
                label=GraphLinkLabel.TO_VALUE,
                source_cell_id=source_id,
                target_cell_id=target_id,
            )
        )

    return (GraphData(cells=cells, links=links), overall_prov)
3320+
32313321
# doc = DoclingDocument(name="Document")
32323322
for pg_idx, doctag_page in enumerate(doctag_document.pages):
32333323
page_doctags = doctag_page.tokens
@@ -3243,6 +3333,12 @@ def parse_table_content(otsl_content: str) -> TableData:
32433333
pg_width = 1
32443334
pg_height = 1
32453335

3336+
self.add_page(
3337+
page_no=page_no,
3338+
size=Size(width=pg_width, height=pg_height),
3339+
image=ImageRef.from_pil(image=image, dpi=72) if image else None,
3340+
)
3341+
32463342
"""
32473343
1. Finds all <tag>...</tag>
32483344
blocks in the entire string (multi-line friendly)
@@ -3263,6 +3359,7 @@ def parse_table_content(otsl_content: str) -> TableData:
32633359
rf"{DocItemLabel.SECTION_HEADER}_level_1|"
32643360
rf"{DocumentToken.ORDERED_LIST.value}|"
32653361
rf"{DocumentToken.UNORDERED_LIST.value}|"
3362+
rf"{DocItemLabel.KEY_VALUE_REGION}|"
32663363
rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
32673364
)
32683365

@@ -3348,6 +3445,11 @@ def parse_table_content(otsl_content: str) -> TableData:
33483445
parent=None,
33493446
)
33503447
pic.captions.append(caption_item.get_ref())
3448+
elif tag_name == DocItemLabel.KEY_VALUE_REGION:
3449+
key_value_data, kv_item_prov = parse_key_value_item(
3450+
full_chunk, image
3451+
)
3452+
self.add_key_values(graph=key_value_data, prov=kv_item_prov)
33513453
elif tag_name in [
33523454
DocumentToken.ORDERED_LIST.value,
33533455
DocumentToken.UNORDERED_LIST.value,

test/data/doc/doc_with_kv.dt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<doctag><key_value_region><loc_30><loc_50><loc_434><loc_444><key_0><loc_31><loc_51><loc_49><loc_60>TO:<link_4></key_0><key_1><loc_31><loc_70><loc_64><loc_80>FROM:<link_5></key_1><key_2><loc_453><loc_400><loc_469><loc_456>8623474</key_2><value_3><loc_408><loc_69><loc_423><loc_78>☑</value_3><value_4><loc_82><loc_51><loc_162><loc_61>Mrs. K. A. Sparrow</value_4><value_5><loc_84><loc_69><loc_130><loc_79>R. G. Ryan</value_5><value_6><loc_339><loc_70><loc_371><loc_78>JUNE7<link_3></value_6><value_7><loc_338><loc_78><loc_373><loc_87>AUG.2</value_7><value_8><loc_339><loc_88><loc_372><loc_96>OCT.7</value_8><key_9><loc_344><loc_50><loc_434><loc_60>SUBMISSION DATE:<link_6><link_8><link_7></key_9><key_10><loc_112><loc_106><loc_361><loc_117>NEWPORT LIGHTS HEAVY UP PROGRESS REPORT</key_10><key_11><loc_31><loc_134><loc_276><loc_144>EFFECTIVENESS OF DISTRIBUTION ALLOWANCE:<link_16><link_14><link_12></key_11><value_12><loc_30><loc_154><loc_190><loc_164>DIRECT ACCOUNT/ WHOLESALERS:<link_13></value_12><value_13><loc_32><loc_164><loc_397><loc_182>Distribution allowance was very effective in accomplishing our objectives. 
All accounts have purchased introductory products.</value_13><value_14><loc_31><loc_218><loc_156><loc_227>DIRECT ACCOUNT CHAINS:<link_15></value_14><value_15><loc_31><loc_228><loc_156><loc_238>Eagle Foods is the only Void.</value_15><value_16><loc_31><loc_276><loc_186><loc_285>NON- DIRECT ACCOUNT CHAINS:<link_17></value_16><value_17><loc_31><loc_286><loc_381><loc_295>Reception from these accounts is most positive with a solid incentitive to purchase.</value_17><key_18><loc_31><loc_331><loc_161><loc_360>EFFECTIVENESS OF THE RETAIL (1 00 OFF CARTON) DISTRIBUTION ALLOWANCE:<link_19></key_18><value_19><loc_185><loc_350><loc_429><loc_370>Has been most helpful in acquiring desireable distribution when needed by Sales Reps.</value_19><key_20><loc_31><loc_398><loc_155><loc_408>PROMOTIONAL ACTIVITY<link_21></key_20><value_21><loc_31><loc_417><loc_120><loc_436>40c OFF PACK- GENERAL MARKET:<link_22></value_21><value_22><loc_135><loc_426><loc_401><loc_444>The 40c off promotions continue to be well received at the retail stores and by consumers, as well.</value_22></key_value_region></doctag>

test/data/doc/doc_with_kv.png

119 KB
Loading
File renamed without changes.

0 commit comments

Comments
 (0)