Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 29 additions & 11 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -5668,6 +5668,7 @@ def _export_to_indented_text(
):
"""Export the document to indented text to expose hierarchy."""
result = []
item_counter = 0

def get_text(text: str, max_text_len: int):

Expand All @@ -5687,59 +5688,72 @@ def get_text(text: str, max_text_len: int):
if isinstance(item, GroupItem):
result.append(
indent * level
+ f"item-{i} at level {level}: {item.label}: group {item.name}"
+ f"item-{item_counter} at level {level}: {item.label}: group {item.name}"
)
item_counter += 1

elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
text = get_text(text=item.text, max_text_len=max_text_len)

result.append(
indent * level + f"item-{i} at level {level}: {item.label}: {text}"
indent * level
+ f"item-{item_counter} at level {level}: {item.label}: {text}"
)
item_counter += 1

elif isinstance(item, SectionHeaderItem):
text = get_text(text=item.text, max_text_len=max_text_len)

result.append(
indent * level + f"item-{i} at level {level}: {item.label}: {text}"
indent * level
+ f"item-{item_counter} at level {level}: {item.label}: {text}"
)
item_counter += 1

elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
text = get_text(text=item.text, max_text_len=max_text_len)

result.append(
indent * level + f"item-{i} at level {level}: {item.label}: {text}"
indent * level
+ f"item-{item_counter} at level {level}: {item.label}: {text}"
)
item_counter += 1

elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
text = get_text(text=item.text, max_text_len=max_text_len)

result.append(
indent * level + f"item-{i} at level {level}: {item.label}: {text}"
indent * level
+ f"item-{item_counter} at level {level}: {item.label}: {text}"
)
item_counter += 1

elif isinstance(item, TextItem):
text = get_text(text=item.text, max_text_len=max_text_len)

result.append(
indent * level + f"item-{i} at level {level}: {item.label}: {text}"
indent * level
+ f"item-{item_counter} at level {level}: {item.label}: {text}"
)
item_counter += 1

elif isinstance(item, TableItem):

result.append(
indent * level
+ f"item-{i} at level {level}: {item.label} with "
+ f"item-{item_counter} at level {level}: {item.label} with "
+ f"[{item.data.num_rows}x{item.data.num_cols}]"
)
item_counter += 1

for _ in item.captions:
caption = _.resolve(self)
result.append(
indent * (level + 1)
+ f"item-{i} at level {level + 1}: {caption.label}: "
+ f"item-{item_counter} at level {level + 1}: {caption.label}: "
+ f"{caption.text}"
)
item_counter += 1

if explicit_tables:
grid: list[list[str]] = []
Expand All @@ -5757,22 +5771,26 @@ def get_text(text: str, max_text_len: int):
elif isinstance(item, PictureItem):

result.append(
indent * level + f"item-{i} at level {level}: {item.label}"
indent * level
+ f"item-{item_counter} at level {level}: {item.label}"
)
item_counter += 1

for _ in item.captions:
caption = _.resolve(self)
result.append(
indent * (level + 1)
+ f"item-{i} at level {level + 1}: {caption.label}: "
+ f"item-{item_counter} at level {level + 1}: {caption.label}: "
+ f"{caption.text}"
)
item_counter += 1

elif isinstance(item, DocItem):
result.append(
indent * (level + 1)
+ f"item-{i} at level {level}: {item.label}: ignored"
+ f"item-{item_counter} at level {level}: {item.label}: ignored"
)
item_counter += 1

return "\n".join(result)

Expand Down
75 changes: 75 additions & 0 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2349,3 +2349,78 @@ def test_filter_pages():
with open(exp_html_file, "r", encoding="utf-8") as f:
exp_html_data = f.read()
assert html_data == exp_html_data


def test_item_counter_numbering_with_captions():
    """Check that the indented-text export gives every line its own number.

    Builds a document that mixes plain paragraphs with a captioned table and
    a captioned picture, exports it through ``_export_to_indented_text`` and
    verifies that the ``item-<n>`` counters found in the output are strictly
    increasing and never repeated.
    """
    import re

    doc = DoclingDocument(name="Test Caption Numbering")

    doc.add_text(label=DocItemLabel.TEXT, text="First paragraph")

    # A one-row, two-column table that carries its own caption.
    table_caption = doc.add_text(
        label=DocItemLabel.CAPTION, text="Table 1: Sample table"
    )
    table_cells = [
        TableCell(
            start_row_offset_idx=0,
            end_row_offset_idx=1,
            start_col_offset_idx=col,
            end_col_offset_idx=col + 1,
            text=f"Cell {col + 1}",
        )
        for col in range(2)
    ]
    doc.add_table(
        data=TableData(num_rows=1, num_cols=2, table_cells=table_cells),
        caption=table_caption,
    )

    doc.add_text(label=DocItemLabel.TEXT, text="Second paragraph")

    # A picture that carries its own caption.
    pic_caption = doc.add_text(
        label=DocItemLabel.CAPTION, text="Figure 1: Sample image"
    )
    doc.add_picture(caption=pic_caption)

    doc.add_text(label=DocItemLabel.TEXT, text="Third paragraph")

    exported_lines = doc._export_to_indented_text().split("\n")

    # Take the first "item-<n>" counter of each exported line, in output order.
    counter_pattern = re.compile(r"item-(\d+) at level")
    matches = filter(None, map(counter_pattern.search, exported_lines))
    item_numbers = [int(found.group(1)) for found in matches]

    assert len(item_numbers) > 0, "No items found in indented text output"

    # A case-insensitive search covers both "CAPTION" and "caption" spellings.
    caption_indices = [
        idx for idx, line in enumerate(exported_lines) if "caption" in line.lower()
    ]
    assert len(caption_indices) >= 2, "Expected at least 2 captions in output"

    # Compare every counter with the one that follows it.
    for current_num, next_num in zip(item_numbers, item_numbers[1:]):
        assert next_num > current_num, (
            f"Item numbers should be strictly increasing: "
            f"item-{current_num} followed by item-{next_num}"
        )

    unique_item_numbers = set(item_numbers)
    assert len(unique_item_numbers) == len(item_numbers), (
        f"All item numbers should be unique. "
        f"Found {len(item_numbers)} items but only {len(unique_item_numbers)} unique numbers"
    )