Skip to content

Commit defd49e

Browse files
authored
fix: Support section_header levels in doctags deserialization (#313)
Adding support for section_header levels in doctags deserialization. Signed-off-by: Christoph Auer <[email protected]>
1 parent 71956ed commit defd49e

File tree

1 file changed

+23
-7
lines changed

1 file changed

+23
-7
lines changed

docling_core/types/doc/document.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3237,6 +3237,11 @@ def load_from_doctags( # noqa: C901
32373237
"document_index": DocItemLabel.DOCUMENT_INDEX,
32383238
"otsl": DocItemLabel.TABLE,
32393239
"section_header_level_1": DocItemLabel.SECTION_HEADER,
3240+
"section_header_level_2": DocItemLabel.SECTION_HEADER,
3241+
"section_header_level_3": DocItemLabel.SECTION_HEADER,
3242+
"section_header_level_4": DocItemLabel.SECTION_HEADER,
3243+
"section_header_level_5": DocItemLabel.SECTION_HEADER,
3244+
"section_header_level_6": DocItemLabel.SECTION_HEADER,
32403245
"checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
32413246
"checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
32423247
"text": DocItemLabel.TEXT,
@@ -3622,7 +3627,7 @@ def parse_key_value_item(
36223627
rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
36233628
rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
36243629
rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
3625-
rf"{DocItemLabel.SECTION_HEADER}_level_1|"
3630+
rf"{DocItemLabel.SECTION_HEADER}_level_[1-6]|"
36263631
rf"{DocumentToken.ORDERED_LIST.value}|"
36273632
rf"{DocumentToken.UNORDERED_LIST.value}|"
36283633
rf"{DocItemLabel.KEY_VALUE_REGION}|"
@@ -3830,12 +3835,23 @@ def parse_key_value_item(
38303835
if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
38313836
content_layer = ContentLayer.FURNITURE
38323837

3833-
doc.add_text(
3834-
label=doc_label,
3835-
text=text_content,
3836-
prov=element_prov,
3837-
content_layer=content_layer,
3838-
)
3838+
if doc_label == DocItemLabel.SECTION_HEADER:
3839+
# Extract level from tag_name (e.g. "section_header_level_1" -> 1)
3840+
level = int(tag_name.split("_")[-1])
3841+
doc.add_heading(
3842+
text=text_content,
3843+
level=level,
3844+
prov=element_prov,
3845+
content_layer=content_layer,
3846+
)
3847+
else:
3848+
doc.add_text(
3849+
label=doc_label,
3850+
text=text_content,
3851+
prov=element_prov,
3852+
content_layer=content_layer,
3853+
)
3854+
38393855
return doc
38403856

38413857
@deprecated("Use save_as_doctags instead.")

0 commit comments

Comments
 (0)