Skip to content

Commit 574779f

Browse files
authored
chore: add ref number normalization (#322)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 5c99722 commit 574779f

File tree

4 files changed

+157
-1
lines changed

4 files changed

+157
-1
lines changed

docling_core/types/doc/document.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4459,3 +4459,67 @@ def validate_misplaced_list_items(self):
44594459
hyperlink=li.hyperlink,
44604460
)
44614461
return self
4462+
4463+
def _normalize_references(self) -> None:
    """Normalize ref numbering by ordering node items as per iterate_items().

    The document is walked with ``iterate_items()`` (groups included,
    pictures traversed, every ``ContentLayer`` included). Each yielded
    non-body item is deep-copied and appended to a fresh list for its
    collection; its position in that list becomes the index of its new
    ``self_ref``. Parent and child references are rebuilt from a mapping of
    original refs to new refs.

    Only once the whole traversal has succeeded are ``groups``, ``texts``,
    ``pictures``, ``tables``, ``key_value_items``, ``form_items`` and
    ``body`` replaced on ``self``. The rebuilt lists contain only the items
    that were yielded by the traversal.

    Raises:
        RuntimeError: If a remapped parent ref is neither ``#/body`` nor of
            the form ``#/<collection>/<index>``.
    """
    # Fresh root node: same fields as the current body, children rebuilt below.
    new_body = GroupItem(**self.body.model_dump(exclude={"children"}))

    collection_names = (
        "groups",
        "texts",
        "pictures",
        "tables",
        "key_value_items",
        "form_items",
    )
    item_lists: dict[str, list[NodeItem]] = {name: [] for name in collection_names}
    orig_ref_to_new_ref: dict[str, str] = {}

    # Collect items in traversal order.
    for item, _ in self.iterate_items(
        with_groups=True,
        traverse_pictures=True,
        included_content_layers=set(ContentLayer),
    ):
        # self_ref looks like "#/<collection>/<index>" or "#/body".
        key = item.self_ref.split("/")[1]

        if key == "body":
            # The root keeps its ref; it is only registered so that its
            # children can resolve their parent through the mapping.
            orig_ref_to_new_ref[item.self_ref] = "#/body"
            continue

        target_list = item_lists[key]
        # The next free slot in the collection is the item's new index.
        new_cref = f"#/{key}/{len(target_list)}"
        orig_ref_to_new_ref[item.self_ref] = new_cref

        # Work on a copy so the document stays untouched until the end.
        new_item = copy.deepcopy(item)
        new_item.children = []
        target_list.append(new_item)
        new_item.self_ref = new_cref

        if not item.parent:
            continue

        # The parent's ref is looked up in the mapping, so it must already
        # have been registered by an earlier iteration.
        new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
        new_item.parent = RefItem(cref=new_parent_cref)

        # Resolve the (already copied) parent node and register the child.
        path_components = new_parent_cref.split("/")
        parent_node: NodeItem
        if len(path_components) == 3:
            parent_key = path_components[1]
            parent_index = int(path_components[2])
            parent_node = item_lists[parent_key][parent_index]
        elif len(path_components) == 2 and path_components[1] == "body":
            parent_node = new_body
        else:
            raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
        parent_node.children.append(RefItem(cref=new_cref))

    # Update the document: swap in the rebuilt collections, then the new root.
    for name, items in item_lists.items():
        setattr(self, name, items)
    self.body = new_body

test/data/doc/dummy_doc.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ texts:
9898

9999

100100
tables: # All tables...
101-
- self_ref: "#/table/0"
101+
- self_ref: "#/tables/0"
102102
label: "table"
103103
parent:
104104
$ref: "#/body"
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
body:
2+
children:
3+
- $ref: '#/groups/0'
4+
- $ref: '#/texts/1'
5+
- $ref: '#/groups/1'
6+
content_layer: body
7+
label: unspecified
8+
name: _root_
9+
self_ref: '#/body'
10+
form_items: []
11+
furniture:
12+
children: []
13+
content_layer: furniture
14+
label: unspecified
15+
name: _root_
16+
self_ref: '#/furniture'
17+
groups:
18+
- children:
19+
- $ref: '#/texts/0'
20+
content_layer: body
21+
label: ordered_list
22+
name: group
23+
parent:
24+
$ref: '#/body'
25+
self_ref: '#/groups/0'
26+
- children:
27+
- $ref: '#/texts/2'
28+
- $ref: '#/texts/3'
29+
content_layer: body
30+
label: list
31+
name: group
32+
parent:
33+
$ref: '#/body'
34+
self_ref: '#/groups/1'
35+
key_value_items: []
36+
name: ''
37+
pages: {}
38+
pictures: []
39+
schema_name: DoclingDocument
40+
tables: []
41+
texts:
42+
- children: []
43+
content_layer: body
44+
enumerated: true
45+
label: list_item
46+
marker: '1.'
47+
orig: foo
48+
parent:
49+
$ref: '#/groups/0'
50+
prov: []
51+
self_ref: '#/texts/0'
52+
text: foo
53+
- children: []
54+
content_layer: body
55+
label: text
56+
orig: bar
57+
parent:
58+
$ref: '#/body'
59+
prov: []
60+
self_ref: '#/texts/1'
61+
text: bar
62+
- children: []
63+
content_layer: body
64+
enumerated: false
65+
label: list_item
66+
marker: '-'
67+
orig: here
68+
parent:
69+
$ref: '#/groups/1'
70+
prov: []
71+
self_ref: '#/texts/2'
72+
text: here
73+
- children: []
74+
content_layer: body
75+
enumerated: false
76+
label: list_item
77+
marker: '-'
78+
orig: there
79+
parent:
80+
$ref: '#/groups/1'
81+
prov: []
82+
self_ref: '#/texts/3'
83+
text: there
84+
version: 1.4.0

test/test_docling_doc.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1645,3 +1645,11 @@ def test_misplaced_list_items():
16451645
else:
16461646
exp_doc = DoclingDocument.load_from_yaml(exp_file)
16471647
assert doc == exp_doc
1648+
1649+
doc._normalize_references()
1650+
exp_file = filename.parent / f"{filename.stem}.norm.out.yaml"
1651+
if GEN_TEST_DATA:
1652+
doc.save_as_yaml(exp_file)
1653+
else:
1654+
exp_doc = DoclingDocument.load_from_yaml(exp_file)
1655+
assert doc == exp_doc

0 commit comments

Comments
 (0)