Skip to content

Commit 7eb9fa9

Browse files
authored
fix: include captions regardless of traverse_pictures flag (#278)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 4b967ab commit 7eb9fa9

File tree

4 files changed

+96
-82
lines changed

4 files changed

+96
-82
lines changed

docling_core/types/doc/document.py

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2657,16 +2657,25 @@ def _iterate_items_with_stack(
26572657
if should_yield:
26582658
yield root, my_stack
26592659

2660-
# Handle picture traversal - only traverse children if requested
2661-
if isinstance(root, PictureItem) and not traverse_pictures:
2662-
return
2663-
26642660
my_stack.append(-1)
26652661

2662+
allowed_pic_refs: set[str] = (
2663+
{r.cref for r in root.captions}
2664+
if (root_is_picture := isinstance(root, PictureItem))
2665+
else set()
2666+
)
2667+
26662668
# Traverse children
26672669
for child_ind, child_ref in enumerate(root.children):
2668-
my_stack[-1] = child_ind
26692670
child = child_ref.resolve(self)
2671+
if (
2672+
root_is_picture
2673+
and not traverse_pictures
2674+
and isinstance(child, DocItem)
2675+
and child.self_ref not in allowed_pic_refs
2676+
):
2677+
continue
2678+
my_stack[-1] = child_ind
26702679

26712680
if isinstance(child, NodeItem):
26722681
yield from self._iterate_items_with_stack(

test/data/doc/2206.01062.yaml.et

Lines changed: 82 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -15,106 +15,106 @@
1515
14: text
1616
15: text
1717
16: picture
18-
17: section_header
19-
18: text
20-
19: section_header
21-
20: text
22-
21: section_header
23-
22: text
18+
17: caption
19+
18: section_header
20+
19: text
21+
20: section_header
22+
21: text
23+
22: section_header
2424
23: text
2525
24: text
26-
25: list with name=list
27-
26: list_item
26+
25: text
27+
26: list with name=list
2828
27: list_item
2929
28: list_item
3030
29: list_item
31-
30: footnote
32-
31: text
33-
32: list with name=list
34-
33: list_item
35-
34: text
31+
30: list_item
32+
31: footnote
33+
32: text
34+
33: list with name=list
35+
34: list_item
3636
35: text
37-
36: section_header
38-
37: text
37+
36: text
38+
37: section_header
3939
38: text
40-
39: section_header
41-
40: text
40+
39: text
41+
40: section_header
4242
41: text
43-
42: picture
44-
43: text
45-
44: text
43+
42: text
44+
43: picture
45+
44: caption
4646
45: text
4747
46: text
48-
47: footnote
48+
47: text
4949
48: text
50-
49: text
50+
49: footnote
5151
50: text
52-
51: section_header
52+
51: text
5353
52: text
54-
53: table
55-
54: caption
56-
55: picture
57-
56: text
58-
57: text
59-
58: text
54+
53: section_header
55+
54: text
56+
55: table
57+
56: caption
58+
57: picture
59+
58: caption
6060
59: text
61-
60: footnote
61+
60: text
6262
61: text
6363
62: text
64-
63: text
65-
64: list with name=list
66-
65: list_item
67-
66: list_item
68-
67: list_item
64+
63: footnote
65+
64: text
66+
65: text
67+
66: text
68+
67: list with name=list
6969
68: list_item
7070
69: list_item
7171
70: list_item
72-
71: text
73-
72: text
74-
73: picture
72+
71: list_item
73+
72: list_item
74+
73: list_item
7575
74: text
76-
75: caption
77-
76: text
76+
75: text
77+
76: picture
7878
77: text
79-
78: text
80-
79: table
79+
78: caption
80+
79: text
8181
80: text
82-
81: section_header
83-
82: text
84-
83: picture
85-
84: text
82+
81: text
83+
82: table
84+
83: text
85+
84: section_header
8686
85: text
87-
86: section_header
88-
87: text
87+
86: picture
88+
87: caption
8989
88: text
90-
89: table
90+
89: text
9191
90: section_header
9292
91: text
93-
92: section_header
94-
93: text
95-
94: text
96-
95: table
97-
96: text
98-
97: section_header
93+
92: text
94+
93: table
95+
94: section_header
96+
95: text
97+
96: section_header
98+
97: text
9999
98: text
100-
99: section_header
100+
99: table
101101
100: text
102-
101: text
103-
102: table
104-
103: text
102+
101: section_header
103+
102: text
104+
103: section_header
105105
104: text
106-
105: section_header
107-
106: text
108-
107: section_header
106+
105: text
107+
106: table
108+
107: text
109109
108: text
110-
109: text
110+
109: section_header
111111
110: text
112112
111: section_header
113-
112: list with name=list
114-
113: list_item
115-
114: list_item
116-
115: list_item
117-
116: list_item
113+
112: text
114+
113: text
115+
114: text
116+
115: section_header
117+
116: list with name=list
118118
117: list_item
119119
118: list_item
120120
119: list_item
@@ -124,17 +124,22 @@
124124
123: list_item
125125
124: list_item
126126
125: list_item
127-
126: picture
128-
127: text
129-
128: text
130-
129: list with name=list
131-
130: list_item
132-
131: list_item
133-
132: list_item
134-
133: list_item
135-
134: list_item
127+
126: list_item
128+
127: list_item
129+
128: list_item
130+
129: list_item
131+
130: picture
132+
131: caption
133+
132: text
134+
133: text
135+
134: list with name=list
136136
135: list_item
137137
136: list_item
138138
137: list_item
139139
138: list_item
140140
139: list_item
141+
140: list_item
142+
141: list_item
143+
142: list_item
144+
143: list_item
145+
144: list_item
-172 Bytes
Loading
-1.11 KB
Loading

0 commit comments

Comments
 (0)