Skip to content

Commit 511fb98

Browse files
authored
fix: update deserialization for better recovery (#282)
update deserialization for better recovery

Signed-off-by: Saidgurbuz <[email protected]>
1 parent 2f0f121 commit 511fb98

File tree

1 file changed

+7
-1
lines changed

1 file changed

+7
-1
lines changed

docling_core/types/doc/document.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3606,7 +3606,9 @@ def parse_key_value_item(
36063606
rf"{DocumentToken.UNORDERED_LIST.value}|"
36073607
rf"{DocItemLabel.KEY_VALUE_REGION}|"
36083608
rf"{DocumentToken.CHART.value}|"
3609-
rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
3609+
rf"{DocumentToken.OTSL.value})>"
3610+
rf"(?P<content>.*?)"
3611+
rf"(?:(?P<closed></(?P=tag)>)|(?P<eof>$))"
36103612
)
36113613
pattern = re.compile(tag_pattern, re.DOTALL)
36123614

@@ -3616,6 +3618,10 @@ def parse_key_value_item(
36163618
tag_name = match.group("tag")
36173619

36183620
bbox = extract_bounding_box(full_chunk) # Extracts first bbox
3621+
if not match.group("closed"):
3622+
# no closing tag; only the existence of the item is recovered
3623+
full_chunk = f"<{tag_name}></{tag_name}>"
3624+
36193625
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
36203626

36213627
if tag_name == DocumentToken.OTSL.value:

0 commit comments

Comments
 (0)