Skip to content

Commit a7cdc87

Browse files
authored
feat: add serializers, text formatting, update Markdown export (#182)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent a86a4a3 commit a7cdc87

32 files changed

+3200
-359
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#
2+
# Copyright IBM Corp. 2024 - 2025
3+
# SPDX-License-Identifier: MIT
4+
#
5+
6+
"""Experimental features."""
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#
2+
# Copyright IBM Corp. 2024 - 2025
3+
# SPDX-License-Identifier: MIT
4+
#
5+
6+
"""Define the serializer types."""
Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
#
2+
# Copyright IBM Corp. 2024 - 2025
3+
# SPDX-License-Identifier: MIT
4+
#
5+
6+
"""Define base classes for serialization."""
7+
from abc import ABC, abstractmethod
8+
from pathlib import Path
9+
from typing import Optional, Union
10+
11+
from pydantic import AnyUrl, BaseModel
12+
13+
from docling_core.types.doc.document import (
14+
DoclingDocument,
15+
FloatingItem,
16+
FormItem,
17+
InlineGroup,
18+
KeyValueItem,
19+
NodeItem,
20+
OrderedList,
21+
PictureItem,
22+
TableItem,
23+
TextItem,
24+
UnorderedList,
25+
)
26+
27+
28+
class SerializationResult(BaseModel):
    """Outcome of a serialization call.

    Returned by the serializer interfaces declared in this module; it wraps
    the produced text in a pydantic model.
    """

    # The serialized text.
    text: str
33+
34+
class BaseTextSerializer(ABC):
    """Abstract interface for serializers that handle ``TextItem`` objects."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: TextItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the given text item.

        All arguments are keyword-only.

        Args:
            item: The text item to serialize.
            doc_serializer: The document serializer made available to this
                item serializer.
            doc: The document passed alongside the item.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
48+
49+
50+
class BaseTableSerializer(ABC):
    """Abstract interface for serializers that handle ``TableItem`` objects."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: TableItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the given table item.

        All arguments are keyword-only.

        Args:
            item: The table item to serialize.
            doc_serializer: The document serializer made available to this
                item serializer.
            doc: The document passed alongside the item.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
64+
65+
66+
class BasePictureSerializer(ABC):
    """Abstract interface for serializers that handle ``PictureItem`` objects."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the given picture item.

        All arguments are keyword-only.

        Args:
            item: The picture item to serialize.
            doc_serializer: The document serializer made available to this
                item serializer.
            doc: The document passed alongside the item.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
80+
81+
82+
class BaseKeyValueSerializer(ABC):
    """Abstract interface for serializers that handle ``KeyValueItem`` objects."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: KeyValueItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the given key-value item.

        All arguments are keyword-only.

        Args:
            item: The key-value item to serialize.
            doc_serializer: The document serializer made available to this
                item serializer.
            doc: The document passed alongside the item.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
96+
97+
98+
class BaseFormSerializer(ABC):
    """Abstract interface for serializers that handle ``FormItem`` objects."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: FormItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the given form item.

        All arguments are keyword-only.

        Args:
            item: The form item to serialize.
            doc_serializer: The document serializer made available to this
                item serializer.
            doc: The document passed alongside the item.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
112+
113+
114+
class BaseListSerializer(ABC):
    """Abstract interface for serializers that handle list groups.

    Covers both ``UnorderedList`` and ``OrderedList`` items.
    """

    @abstractmethod
    def serialize(
        self,
        *,
        item: Union[UnorderedList, OrderedList],
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the given list.

        All arguments are keyword-only.

        Args:
            item: The unordered or ordered list to serialize.
            doc_serializer: The document serializer made available to this
                item serializer.
            doc: The document passed alongside the item.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
128+
129+
130+
class BaseInlineSerializer(ABC):
    """Abstract interface for serializers that handle ``InlineGroup`` objects."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: InlineGroup,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the given inline group.

        All arguments are keyword-only.

        Args:
            item: The inline group to serialize.
            doc_serializer: The document serializer made available to this
                item serializer.
            doc: The document passed alongside the item.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
144+
145+
146+
class BaseFallbackSerializer(ABC):
    """Abstract interface for a fallback item serializer.

    Its ``serialize`` accepts any ``NodeItem`` rather than one specific item
    type.
    """

    @abstractmethod
    def serialize(
        self,
        *,
        item: NodeItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the given node item.

        All arguments are keyword-only.

        Args:
            item: The node item to serialize.
            doc_serializer: The document serializer made available to this
                item serializer.
            doc: The document passed alongside the item.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
160+
161+
162+
class BaseDocSerializer(ABC):
    """Abstract interface for document-level serializers.

    Besides the main ``serialize`` entry point, it declares formatting hooks
    (bold, italic, underline, strikethrough, hyperlink) and helper methods
    for collecting parts, post-processing text, serializing captions, and
    listing excluded references. Every method is abstract.
    """

    @abstractmethod
    def serialize(self, **kwargs) -> SerializationResult:
        """Run the serialization and return its result."""

    @abstractmethod
    def serialize_bold(self, text: str, **kwargs) -> str:
        """Hook: serialize ``text`` with bold formatting."""

    @abstractmethod
    def serialize_italic(self, text: str, **kwargs) -> str:
        """Hook: serialize ``text`` with italic formatting."""

    @abstractmethod
    def serialize_underline(self, text: str, **kwargs) -> str:
        """Hook: serialize ``text`` with underline formatting."""

    @abstractmethod
    def serialize_strikethrough(self, text: str, **kwargs) -> str:
        """Hook: serialize ``text`` with strikethrough formatting."""

    @abstractmethod
    def serialize_hyperlink(
        self,
        text: str,
        hyperlink: Union[AnyUrl, Path],
        **kwargs,
    ) -> str:
        """Hook: serialize ``text`` as a hyperlink.

        Args:
            text: The text to serialize.
            hyperlink: The link target, either a URL or a filesystem path.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialized hyperlink text.
        """

    @abstractmethod
    def get_parts(
        self, node: Optional[NodeItem] = None, **kwargs
    ) -> list[SerializationResult]:
        """Return the components to be combined when serializing a node.

        Args:
            node: The node whose parts are requested; defaults to ``None``.
                NOTE(review): the meaning of ``None`` is defined by the
                implementation -- confirm against concrete subclasses.
            **kwargs: Additional implementation-specific options.

        Returns:
            The list of serialization results that make up the node.
        """

    @abstractmethod
    def post_process(self, text: str, **kwargs) -> str:
        """Apply text post-processing steps to ``text`` and return the result."""

    @abstractmethod
    def serialize_captions(self, item: FloatingItem, **kwargs) -> SerializationResult:
        """Serialize the captions of the given floating item."""

    @abstractmethod
    def get_excluded_refs(self) -> list[str]:
        """Return the references of the excluded items."""

0 commit comments

Comments
 (0)