Skip to content

Commit db119f4

Browse files
Saidgurbuz and vagenas authored
feat: Add document tokens from key value items (#170)
* add export_to_document_tokens method for KeyValueItem

  Signed-off-by: Saidgurbuz <[email protected]>

* fix export_to_document_tokens kv-item

  Signed-off-by: Saidgurbuz <[email protected]>

* update key-link representations in document tokens

  Signed-off-by: Saidgurbuz <[email protected]>

* fix key-value cell location for doctags

  Signed-off-by: Saidgurbuz <[email protected]>

* set default page_no to 1

  Signed-off-by: Saidgurbuz <[email protected]>

* fix get_location call with to_top_left_origin

  Signed-off-by: Saidgurbuz <[email protected]>

* integrate export_to_document_tokens to doctag serializer

  Signed-off-by: Saidgurbuz <[email protected]>

* Add DocTags serializer dispatching, deprecate new_line param (#212)

  * updates for key value region

    Signed-off-by: Panos Vagenas <[email protected]>

  * deprecate "new_line" parameter

    Signed-off-by: Panos Vagenas <[email protected]>

  ---------

  Signed-off-by: Panos Vagenas <[email protected]>
  Signed-off-by: Panos Vagenas <[email protected]>

---------

Signed-off-by: Saidgurbuz <[email protected]>
Signed-off-by: Panos Vagenas <[email protected]>
Signed-off-by: Panos Vagenas <[email protected]>
Co-authored-by: Panos Vagenas <[email protected]>
1 parent bc3f5d5 commit db119f4

File tree

5 files changed

+127
-51
lines changed

5 files changed

+127
-51
lines changed

docling_core/experimental/serializer/doctags.py

Lines changed: 73 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
import html
44
from enum import Enum
55
from pathlib import Path
6-
from typing import Optional, Union
6+
from typing import Dict, List, Optional, Union
77

88
from pydantic import AnyUrl, BaseModel
99
from typing_extensions import override
@@ -23,6 +23,7 @@
2323
from docling_core.experimental.serializer.common import CommonParams, DocSerializer
2424
from docling_core.types.doc.document import (
2525
CodeItem,
26+
DocItem,
2627
DoclingDocument,
2728
Formatting,
2829
FormItem,
@@ -54,7 +55,6 @@ class Mode(str, Enum):
5455
MINIFIED = "minified"
5556
HUMAN_FRIENDLY = "human_friendly"
5657

57-
new_line: str = ""
5858
xsize: int = 500
5959
ysize: int = 500
6060
add_location: bool = True
@@ -67,13 +67,13 @@ class Mode(str, Enum):
6767
mode: Mode = Mode.HUMAN_FRIENDLY
6868

6969

70-
def _get_delim(mode: DocTagsParams.Mode) -> str:
71-
if mode == DocTagsParams.Mode.HUMAN_FRIENDLY:
70+
def _get_delim(params: DocTagsParams) -> str:
71+
if params.mode == DocTagsParams.Mode.HUMAN_FRIENDLY:
7272
delim = "\n"
73-
elif mode == DocTagsParams.Mode.MINIFIED:
73+
elif params.mode == DocTagsParams.Mode.MINIFIED:
7474
delim = ""
7575
else:
76-
raise RuntimeError(f"Unknown DocTags mode: {mode}")
76+
raise RuntimeError(f"Unknown DocTags mode: {params.mode}")
7777
return delim
7878

7979

@@ -102,7 +102,6 @@ def serialize(
102102
if params.add_location:
103103
location = item.get_location_tokens(
104104
doc=doc,
105-
new_line=params.new_line,
106105
xsize=params.xsize,
107106
ysize=params.ysize,
108107
)
@@ -158,7 +157,6 @@ def serialize(
158157
if params.add_location:
159158
body += item.get_location_tokens(
160159
doc=doc,
161-
new_line=params.new_line,
162160
xsize=params.xsize,
163161
ysize=params.ysize,
164162
)
@@ -178,15 +176,14 @@ def serialize(
178176
body += f"<{DocumentToken.CAPTION.value}>"
179177
for caption in item.captions:
180178
if caption.cref not in doc_serializer.get_excluded_refs(**kwargs):
181-
body += caption.resolve(doc).get_location_tokens(
182-
doc=doc,
183-
new_line=params.new_line,
184-
xsize=params.xsize,
185-
ysize=params.ysize,
186-
)
179+
if isinstance(cap := caption.resolve(doc), DocItem):
180+
body += cap.get_location_tokens(
181+
doc=doc,
182+
xsize=params.xsize,
183+
ysize=params.ysize,
184+
)
187185
body += f"{text.strip()}"
188186
body += f"</{DocumentToken.CAPTION.value}>"
189-
body += f"{params.new_line}"
190187

191188
if body:
192189
body = _wrap(text=body, wrap_tag=DocumentToken.OTSL.value)
@@ -208,15 +205,13 @@ def serialize(
208205
) -> SerializationResult:
209206
"""Serializes the passed item."""
210207
params = DocTagsParams(**kwargs)
211-
212208
parts: list[str] = []
213209

214210
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
215211
body = ""
216212
if params.add_location:
217213
body += item.get_location_tokens(
218214
doc=doc,
219-
new_line=params.new_line,
220215
xsize=params.xsize,
221216
ysize=params.ysize,
222217
)
@@ -246,13 +241,13 @@ def serialize(
246241
body = ""
247242
for caption in item.captions:
248243
if caption.cref not in doc_serializer.get_excluded_refs(**kwargs):
249-
body += caption.resolve(doc).get_location_tokens(
250-
doc=doc,
251-
new_line=params.new_line,
252-
xsize=params.xsize,
253-
ysize=params.ysize,
254-
)
255-
body += f"{text.strip()}"
244+
if isinstance(cap := caption.resolve(doc), DocItem):
245+
body += cap.get_location_tokens(
246+
doc=doc,
247+
xsize=params.xsize,
248+
ysize=params.ysize,
249+
)
250+
body += f"{text.strip()}"
256251
if body:
257252
body = _wrap(text=body, wrap_tag=DocumentToken.CAPTION.value)
258253
parts.append(body)
@@ -279,9 +274,56 @@ def serialize(
279274
**kwargs,
280275
) -> SerializationResult:
281276
"""Serializes the passed item."""
282-
# TODO add actual implementation
283-
text_res = ""
284-
return SerializationResult(text=text_res)
277+
params = DocTagsParams(**kwargs)
278+
279+
body = ""
280+
281+
page_no = 1
282+
if len(item.prov) > 0:
283+
page_no = item.prov[0].page_no
284+
285+
if params.add_location:
286+
body += item.get_location_tokens(
287+
doc=doc,
288+
xsize=params.xsize,
289+
ysize=params.ysize,
290+
)
291+
292+
# mapping from source_cell_id to a list of target_cell_ids
293+
source_to_targets: Dict[int, List[int]] = {}
294+
for link in item.graph.links:
295+
source_to_targets.setdefault(link.source_cell_id, []).append(
296+
link.target_cell_id
297+
)
298+
299+
for cell in item.graph.cells:
300+
cell_txt = ""
301+
if cell.prov is not None:
302+
if len(doc.pages.keys()):
303+
page_w, page_h = doc.pages[page_no].size.as_tuple()
304+
cell_txt += DocumentToken.get_location(
305+
bbox=cell.prov.bbox.to_top_left_origin(page_h).as_tuple(),
306+
page_w=page_w,
307+
page_h=page_h,
308+
xsize=params.xsize,
309+
ysize=params.ysize,
310+
)
311+
if params.add_content:
312+
cell_txt += cell.text.strip()
313+
314+
if cell.cell_id in source_to_targets:
315+
targets = source_to_targets[cell.cell_id]
316+
for target in targets:
317+
# TODO centralize token creation
318+
cell_txt += f"<link_{target}>"
319+
320+
# TODO centralize token creation
321+
tok = f"{cell.label.value}_{cell.cell_id}"
322+
cell_txt = _wrap(text=cell_txt, wrap_tag=tok)
323+
body += cell_txt
324+
325+
body = _wrap(body, DocumentToken.KEY_VALUE_REGION.value)
326+
return SerializationResult(text=body)
285327

286328

287329
class DocTagsFormSerializer(BaseFormSerializer):
@@ -329,7 +371,7 @@ def serialize(
329371
visited=my_visited,
330372
**kwargs,
331373
)
332-
delim = _get_delim(mode=params.mode)
374+
delim = _get_delim(params=params)
333375
if parts:
334376
text_res = delim.join(
335377
[
@@ -374,7 +416,7 @@ def serialize(
374416
**kwargs,
375417
)
376418
wrap_tag = DocumentToken.INLINE.value
377-
delim = _get_delim(mode=params.mode)
419+
delim = _get_delim(params=params)
378420
text_res = delim.join([p.text for p in parts if p.text])
379421
if text_res:
380422
text_res = f"{text_res}{delim}"
@@ -437,14 +479,14 @@ def post_process(
437479
@override
438480
def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
439481
"""Serialize a page out of its parts."""
440-
delim = _get_delim(mode=self.params.mode)
482+
delim = _get_delim(params=self.params)
441483
text_res = delim.join([p.text for p in parts])
442484
return SerializationResult(text=text_res)
443485

444486
@override
445487
def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
446488
"""Serialize a document out of its pages."""
447-
delim = _get_delim(mode=self.params.mode)
489+
delim = _get_delim(params=self.params)
448490
if self.params.add_page_break:
449491
page_sep = f"{delim}<{DocumentToken.PAGE_BREAK.value}>{delim}"
450492
content = page_sep.join([p.text for p in pages if p.text])

0 commit comments

Comments
 (0)