Skip to content

Commit 437c498

Browse files
dolfim-ibm and cau-git authored
feat: utilities converting document formats (#91)
Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
1 parent e8f7755 commit 437c498

File tree

8 files changed

+381
-23
lines changed

8 files changed

+381
-23
lines changed

docling_core/types/doc/document.py

Lines changed: 16 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1668,7 +1668,7 @@ def iterate_items(
16681668
self,
16691669
root: Optional[NodeItem] = None,
16701670
with_groups: bool = False,
1671-
traverse_pictures: bool = True,
1671+
traverse_pictures: bool = False,
16721672
page_no: Optional[int] = None,
16731673
_level: int = 0, # fixed parameter, carries through the node nesting level
16741674
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
@@ -1685,30 +1685,31 @@ def iterate_items(
16851685
if not root:
16861686
root = self.body
16871687

1688+
# Yield non-group items or group items when with_groups=True
16881689
if not isinstance(root, GroupItem) or with_groups:
16891690
if isinstance(root, DocItem):
1690-
if page_no is not None:
1691-
for prov in root.prov:
1692-
if prov.page_no == page_no:
1693-
yield root, _level
1694-
else:
1691+
if page_no is None or any(
1692+
prov.page_no == page_no for prov in root.prov
1693+
):
16951694
yield root, _level
16961695
else:
16971696
yield root, _level
16981697

1698+
# Handle picture traversal - only traverse children if requested
1699+
if isinstance(root, PictureItem) and not traverse_pictures:
1700+
return
1701+
16991702
# Traverse children
17001703
for child_ref in root.children:
17011704
child = child_ref.resolve(self)
1702-
17031705
if isinstance(child, NodeItem):
1704-
# If the child is a NodeItem, recursively traverse it
1705-
if not isinstance(child, PictureItem) or traverse_pictures:
1706-
yield from self.iterate_items(
1707-
child,
1708-
_level=_level + 1,
1709-
with_groups=with_groups,
1710-
page_no=page_no,
1711-
)
1706+
yield from self.iterate_items(
1707+
child,
1708+
with_groups=with_groups,
1709+
traverse_pictures=traverse_pictures,
1710+
page_no=page_no,
1711+
_level=_level + 1,
1712+
)
17121713

17131714
def _clear_picture_pil_cache(self):
17141715
"""Clear cache storage of all images."""

docling_core/types/legacy_doc/base.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -140,6 +140,7 @@ class BaseCell(AliasModel):
140140
obj_type: str = Field(
141141
alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
142142
)
143+
payload: Optional[dict] = None
143144

144145
def get_location_tokens(
145146
self,

0 commit comments

Comments
 (0)