Skip to content

Commit 14a4fde

Browse files
authored
feat: remodel lists, add MD & HTML ser. params, enable unset marker (#339)
* feat: remodel lists
  Signed-off-by: Panos Vagenas <[email protected]>
* prepare test document separate markers
  Signed-off-by: Panos Vagenas <[email protected]>
* add MD/HTML serializer options, expand test data
  Signed-off-by: Panos Vagenas <[email protected]>
* create list groups where not in place
  Signed-off-by: Panos Vagenas <[email protected]>
* add auto-increment logic
  Signed-off-by: Panos Vagenas <[email protected]>
* restore UnorderedList as deprecated alias (for backwards compatibility)
  Signed-off-by: Panos Vagenas <[email protected]>
* enable unset marker case
  Signed-off-by: Panos Vagenas <[email protected]>
* make ordered markers able to be "unset" (empty) too
  Signed-off-by: Panos Vagenas <[email protected]>
* rename default marker mode to auto
  Signed-off-by: Panos Vagenas <[email protected]>
---------
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 3eeb259 commit 14a4fde

File tree

60 files changed

+3044
-1690
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

60 files changed

+3044
-1690
lines changed

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -35,11 +35,10 @@
3535
DocumentOrigin,
3636
InlineGroup,
3737
LevelNumber,
38-
OrderedList,
38+
ListGroup,
3939
SectionHeaderItem,
4040
TableItem,
4141
TitleItem,
42-
UnorderedList,
4342
)
4443

4544
_VERSION: Final = "1.0.0"
@@ -240,7 +239,7 @@ def chunk(
240239
heading_by_level.pop(k, None)
241240
continue
242241
elif (
243-
isinstance(item, (OrderedList, UnorderedList, InlineGroup, DocItem))
242+
isinstance(item, (ListGroup, InlineGroup, DocItem))
244243
and item.self_ref not in visited
245244
):
246245
ser_res = my_doc_ser.serialize(item=item, visited=visited)

docling_core/transforms/serializer/base.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,12 +17,11 @@
1717
FormItem,
1818
InlineGroup,
1919
KeyValueItem,
20+
ListGroup,
2021
NodeItem,
21-
OrderedList,
2222
PictureItem,
2323
TableItem,
2424
TextItem,
25-
UnorderedList,
2625
)
2726

2827

@@ -128,7 +127,7 @@ class BaseListSerializer(ABC):
128127
def serialize(
129128
self,
130129
*,
131-
item: Union[UnorderedList, OrderedList],
130+
item: ListGroup,
132131
doc_serializer: "BaseDocSerializer",
133132
doc: DoclingDocument,
134133
**kwargs: Any,

docling_core/transforms/serializer/common.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -39,8 +39,8 @@
3939
FormItem,
4040
InlineGroup,
4141
KeyValueItem,
42+
ListGroup,
4243
NodeItem,
43-
OrderedList,
4444
PictureClassificationData,
4545
PictureDataType,
4646
PictureItem,
@@ -49,7 +49,6 @@
4949
TableAnnotationType,
5050
TableItem,
5151
TextItem,
52-
UnorderedList,
5352
)
5453
from docling_core.types.doc.labels import DocItemLabel
5554

@@ -89,7 +88,7 @@ def _iterate_items(
8988
):
9089
if add_page_breaks:
9190
if (
92-
isinstance(item, (UnorderedList, OrderedList, InlineGroup))
91+
isinstance(item, (ListGroup, InlineGroup))
9392
and item.self_ref not in my_visited
9493
):
9594
# if group starts with new page, yield page break before group node
@@ -316,7 +315,7 @@ def serialize(
316315
########
317316
# groups
318317
########
319-
if isinstance(item, (UnorderedList, OrderedList)):
318+
if isinstance(item, ListGroup):
320319
part = self.list_serializer.serialize(
321320
item=item,
322321
doc_serializer=self,

docling_core/transforms/serializer/doctags.py

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
"""Define classes for Doctags serialization."""
22

33
from enum import Enum
4-
from typing import Any, Dict, List, Optional, Union
4+
from typing import Any, Dict, List, Optional
55

66
from pydantic import BaseModel
77
from typing_extensions import override
@@ -34,17 +34,16 @@
3434
FormItem,
3535
InlineGroup,
3636
KeyValueItem,
37+
ListGroup,
3738
ListItem,
3839
NodeItem,
39-
OrderedList,
4040
PictureClassificationData,
4141
PictureItem,
4242
PictureMoleculeData,
4343
PictureTabularChartData,
4444
ProvenanceItem,
4545
TableItem,
4646
TextItem,
47-
UnorderedList,
4847
)
4948
from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
5049
from docling_core.types.doc.tokens import DocumentToken
@@ -376,7 +375,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
376375
def serialize(
377376
self,
378377
*,
379-
item: Union[UnorderedList, OrderedList],
378+
item: ListGroup,
380379
doc_serializer: "BaseDocSerializer",
381380
doc: DoclingDocument,
382381
list_level: int = 0,
@@ -406,7 +405,7 @@ def serialize(
406405
text_res = f"{text_res}{delim}"
407406
wrap_tag = (
408407
DocumentToken.ORDERED_LIST.value
409-
if isinstance(item, OrderedList)
408+
if item.first_item_is_enumerated(doc)
410409
else DocumentToken.UNORDERED_LIST.value
411410
)
412411
text_res = _wrap(text=text_res, wrap_tag=wrap_tag)

docling_core/transforms/serializer/html.py

Lines changed: 57 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -58,9 +58,9 @@
5858
ImageRef,
5959
InlineGroup,
6060
KeyValueItem,
61+
ListGroup,
6162
ListItem,
6263
NodeItem,
63-
OrderedList,
6464
PictureClassificationData,
6565
PictureItem,
6666
PictureMoleculeData,
@@ -70,7 +70,6 @@
7070
TableItem,
7171
TextItem,
7272
TitleItem,
73-
UnorderedList,
7473
)
7574
from docling_core.types.doc.labels import DocItemLabel
7675
from docling_core.types.doc.utils import (
@@ -117,6 +116,8 @@ class HTMLParams(CommonParams):
117116

118117
include_annotations: bool = True
119118

119+
show_original_list_item_marker: bool = True
120+
120121

121122
class HTMLTextSerializer(BaseModel, BaseTextSerializer):
122123
"""HTML-specific text item serializer."""
@@ -162,7 +163,19 @@ def serialize(
162163
elif isinstance(item, ListItem):
163164
# List items are handled by list serializer
164165
text_inner = self._prepare_content(item.text)
165-
text = get_html_tag_with_text_direction(html_tag="li", text=text_inner)
166+
text = (
167+
get_html_tag_with_text_direction(
168+
html_tag="li",
169+
text=text_inner,
170+
attrs=(
171+
{"style": f"list-style-type: '{item.marker} ';"}
172+
if params.show_original_list_item_marker and item.marker
173+
else {}
174+
),
175+
)
176+
if text_inner
177+
else ""
178+
)
166179

167180
elif is_inline_scope:
168181
text = self._prepare_content(item.text)
@@ -680,7 +693,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
680693
def serialize(
681694
self,
682695
*,
683-
item: Union[UnorderedList, OrderedList],
696+
item: ListGroup,
684697
doc_serializer: "BaseDocSerializer",
685698
doc: DoclingDocument,
686699
list_level: int = 0,
@@ -690,7 +703,7 @@ def serialize(
690703
) -> SerializationResult:
691704
"""Serializes a list to HTML."""
692705
my_visited: set[str] = visited if visited is not None else set()
693-
706+
params = HTMLParams(**kwargs)
694707
# Get all child parts
695708
parts = doc_serializer.get_parts(
696709
item=item,
@@ -706,17 +719,51 @@ def serialize(
706719
(
707720
p.text
708721
if (
709-
(p.text.startswith("<li>") and p.text.endswith("</li>"))
710-
or (p.text.startswith("<ol>") and p.text.endswith("</ol>"))
711-
or (p.text.startswith("<ul>") and p.text.endswith("</ul>"))
722+
(
723+
p.text.startswith(("<li>", "<li "))
724+
and p.text.endswith("</li>")
725+
)
726+
or (
727+
p.text.startswith(("<ol>", "<ol "))
728+
and p.text.endswith("</ol>")
729+
)
730+
or (
731+
p.text.startswith(("<ul>", "<ul "))
732+
and p.text.endswith("</ul>")
733+
)
734+
)
735+
else (
736+
get_html_tag_with_text_direction(
737+
html_tag="li",
738+
text=p.text,
739+
attrs=(
740+
{
741+
"style": f"list-style-type: '{grandparent_item.marker} ';"
742+
}
743+
if params.show_original_list_item_marker
744+
and grandparent_item.marker
745+
else {}
746+
),
747+
)
748+
if p.spans
749+
and p.spans[0].item.parent
750+
and isinstance(
751+
(parent_item := p.spans[0].item.parent.resolve(doc)),
752+
InlineGroup,
753+
)
754+
and parent_item.parent
755+
and isinstance(
756+
(grandparent_item := parent_item.parent.resolve(doc)),
757+
ListItem,
758+
)
759+
else f"<li>{p.text}</li>"
712760
)
713-
else f"<li>{p.text}</li>"
714761
)
715762
for p in parts
716763
]
717764
)
718765
if text_res:
719-
tag = "ol" if isinstance(item, OrderedList) else "ul"
766+
tag = "ol" if item.first_item_is_enumerated(doc) else "ul"
720767
text_res = f"<{tag}>\n{text_res}\n</{tag}>"
721768

722769
return create_ser_result(text=text_res, span_source=parts)

0 commit comments

Comments
 (0)