Skip to content

Commit ff13a93

Browse files
authored
feat: expose MD page break & DocTags minification (#213)
* feat: expose MD page break & DocTags minification

  Also:
  - aligned DocTags method naming
  - fixed mutable default arguments

  Signed-off-by: Panos Vagenas <[email protected]>

* cover newly exposed params in testing

  Signed-off-by: Panos Vagenas <[email protected]>

* drop DocTags-specific post-processing

  Signed-off-by: Panos Vagenas <[email protected]>

---------

Signed-off-by: Panos Vagenas <[email protected]>
1 parent db119f4 commit ff13a93

File tree

4 files changed

+451
-76
lines changed

4 files changed

+451
-76
lines changed

docling_core/experimental/serializer/doctags.py

Lines changed: 1 addition & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,9 @@
11
"""Define classes for Doctags serialization."""
22

3-
import html
43
from enum import Enum
5-
from pathlib import Path
64
from typing import Dict, List, Optional, Union
75

8-
from pydantic import AnyUrl, BaseModel
6+
from pydantic import BaseModel
97
from typing_extensions import override
108

119
from docling_core.experimental.serializer.base import (
@@ -25,7 +23,6 @@
2523
CodeItem,
2624
DocItem,
2725
DoclingDocument,
28-
Formatting,
2926
FormItem,
3027
InlineGroup,
3128
KeyValueItem,
@@ -112,7 +109,6 @@ def serialize(
112109
text_part = item.text
113110
text_part = doc_serializer.post_process(
114111
text=text_part,
115-
escape_html=False, # TODO review
116112
formatting=item.formatting,
117113
hyperlink=item.hyperlink,
118114
)
@@ -456,26 +452,6 @@ class DocTagsDocSerializer(DocSerializer):
456452

457453
params: DocTagsParams = DocTagsParams()
458454

459-
def post_process(
460-
self,
461-
text: str,
462-
*,
463-
escape_html: bool = True,
464-
formatting: Optional[Formatting] = None,
465-
hyperlink: Optional[Union[AnyUrl, Path]] = None,
466-
**kwargs,
467-
) -> str:
468-
"""Apply some text post-processing steps."""
469-
res = text
470-
if escape_html:
471-
res = html.escape(res, quote=False)
472-
res = super().post_process(
473-
text=res,
474-
formatting=formatting,
475-
hyperlink=hyperlink,
476-
)
477-
return res
478-
479455
@override
480456
def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
481457
"""Serialize a page out of its parts."""

0 commit comments

Comments
 (0)