Skip to content

Commit 5e4c0fd

Browse files
authored
feat: integrate serialization API into chunkers (#221)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 73b9941 commit 5e4c0fd

File tree

15 files changed

+95572
-18936
lines changed

15 files changed

+95572
-18936
lines changed

docling_core/experimental/serializer/base.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from pydantic import AnyUrl, BaseModel
1212

1313
from docling_core.types.doc.document import (
14+
DocItem,
1415
DoclingDocument,
1516
FloatingItem,
1617
FormItem,
@@ -25,10 +26,19 @@
2526
)
2627

2728

29+
class Span(BaseModel):
    """Fine-grained pointer into a document.

    A span currently identifies one whole `DocItem`; it is the unit collected
    in `SerializationResult.spans`.
    """

    # The document item this span points at.
    item: DocItem
    # prov_idx: Optional[PositiveInt] = None  # None to be interpreted as whole DocItem
34+
35+
2836
class SerializationResult(BaseModel):
    """Outcome of a serialization step.

    Both fields have defaults, so an instance can be created without
    arguments to represent an empty result.
    """

    # The serialized text; empty by default.
    text: str = ""
    # The spans associated with the text; empty by default.
    spans: list[Span] = []
    # group: Optional[GroupItem] = None  # set when result reflects specific group item
3242

3343

3444
class BaseTextSerializer(ABC):
@@ -163,7 +173,9 @@ class BaseDocSerializer(ABC):
163173
"""Base class for document serializers."""
164174

165175
@abstractmethod
166-
def serialize(self, **kwargs) -> SerializationResult:
176+
def serialize(
177+
self, *, item: Optional[NodeItem] = None, **kwargs
178+
) -> SerializationResult:
167179
"""Run the serialization."""
168180
...
169181

docling_core/experimental/serializer/common.py

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
BaseTableSerializer,
2626
BaseTextSerializer,
2727
SerializationResult,
28+
Span,
2829
)
2930
from docling_core.types.doc.document import (
3031
DOCUMENT_TOKENS_EXPORT_LABELS,
@@ -49,6 +50,36 @@
4950
_DEFAULT_LAYERS = {cl for cl in ContentLayer}
5051

5152

53+
def create_ser_result(
    *,
    text: str = "",
    span_source: Union[DocItem, list[SerializationResult], None] = None,
) -> SerializationResult:
    """Create a `SerializationResult` from a text and a source of spans.

    Args:
        text: the text to use. Defaults to "".
        span_source: either a single `DocItem`, which yields exactly one span
            wrapping that item, or a list of results whose spans are merged in
            order of first appearance with duplicates dropped. Defaults to
            None, which is handled like an empty list (no spans).

    Returns:
        The created `SerializationResult`.
    """
    spans: list[Span]
    if isinstance(span_source, DocItem):
        spans = [Span(item=span_source)]
    else:
        # `None` stands in for the former `[]` default so that no mutable
        # object is shared between calls; both mean "no span sources".
        results: list[SerializationResult] = (
            [] if span_source is None else span_source
        )
        spans = []
        for ser_res in results:
            for span in ser_res.spans:
                # List membership compares by equality, which keeps the
                # first-seen order of the merged spans.
                if span not in spans:
                    spans.append(span)
    return SerializationResult(
        text=text,
        spans=spans,
    )
81+
82+
5283
class CommonParams(BaseModel):
5384
"""Common serialization parameters."""
5485

@@ -217,14 +248,17 @@ def serialize(
217248
) -> SerializationResult:
218249
"""Serialize a given node."""
219250
my_visited: set[str] = visited if visited is not None else set()
220-
empty_res = SerializationResult(text="")
251+
my_kwargs = self.params.merge_with_patch(patch=kwargs).model_dump()
252+
empty_res = create_ser_result()
221253
if item is None or item == self.doc.body:
222254
if self.doc.body.self_ref not in my_visited:
223255
my_visited.add(self.doc.body.self_ref)
224256
return self._serialize_body()
225257
else:
226258
return empty_res
227259

260+
my_visited.add(item.self_ref)
261+
228262
########
229263
# groups
230264
########
@@ -236,7 +270,7 @@ def serialize(
236270
list_level=list_level,
237271
is_inline_scope=is_inline_scope,
238272
visited=my_visited,
239-
**kwargs,
273+
**my_kwargs,
240274
)
241275
elif isinstance(item, InlineGroup):
242276
part = self.inline_serializer.serialize(
@@ -245,7 +279,7 @@ def serialize(
245279
doc=self.doc,
246280
list_level=list_level,
247281
visited=my_visited,
248-
**kwargs,
282+
**my_kwargs,
249283
)
250284
###########
251285
# doc items
@@ -261,7 +295,7 @@ def serialize(
261295
doc_serializer=self,
262296
doc=self.doc,
263297
is_inline_scope=is_inline_scope,
264-
**kwargs,
298+
**my_kwargs,
265299
)
266300
if item.self_ref not in self.get_excluded_refs(**kwargs)
267301
else empty_res
@@ -271,36 +305,36 @@ def serialize(
271305
item=item,
272306
doc_serializer=self,
273307
doc=self.doc,
274-
**kwargs,
308+
**my_kwargs,
275309
)
276310
elif isinstance(item, PictureItem):
277311
part = self.picture_serializer.serialize(
278312
item=item,
279313
doc_serializer=self,
280314
doc=self.doc,
281315
visited=my_visited,
282-
**kwargs,
316+
**my_kwargs,
283317
)
284318
elif isinstance(item, KeyValueItem):
285319
part = self.key_value_serializer.serialize(
286320
item=item,
287321
doc_serializer=self,
288322
doc=self.doc,
289-
**kwargs,
323+
**my_kwargs,
290324
)
291325
elif isinstance(item, FormItem):
292326
part = self.form_serializer.serialize(
293327
item=item,
294328
doc_serializer=self,
295329
doc=self.doc,
296-
**kwargs,
330+
**my_kwargs,
297331
)
298332
else:
299333
part = self.fallback_serializer.serialize(
300334
item=item,
301335
doc_serializer=self,
302336
doc=self.doc,
303-
**kwargs,
337+
**my_kwargs,
304338
)
305339
return part
306340

@@ -401,15 +435,16 @@ def serialize_captions(
401435
) -> SerializationResult:
402436
"""Serialize the item's captions."""
403437
params = self.params.merge_with_patch(patch=kwargs)
438+
results: list[SerializationResult] = []
404439
if DocItemLabel.CAPTION in params.labels:
405-
text_parts: list[str] = [
406-
it.text
440+
results = [
441+
create_ser_result(text=it.text, span_source=it)
407442
for cap in item.captions
408443
if isinstance(it := cap.resolve(self.doc), TextItem)
409444
and it.self_ref not in self.get_excluded_refs(**kwargs)
410445
]
411-
text_res = params.caption_delim.join(text_parts)
446+
text_res = params.caption_delim.join([r.text for r in results])
412447
text_res = self.post_process(text=text_res)
413448
else:
414449
text_res = ""
415-
return SerializationResult(text=text_res)
450+
return create_ser_result(text=text_res, span_source=results)

0 commit comments

Comments
 (0)