Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from collections import defaultdict
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Type, Union

import numpy as np
Expand All @@ -20,6 +21,7 @@
# DO NOT REMOVE; explicitly exposed from this location
from PIL.Image import Image
from pydantic import (
AnyUrl,
BaseModel,
ConfigDict,
Field,
Expand Down Expand Up @@ -251,6 +253,7 @@ class TableStructurePrediction(BaseModel):

class TextElement(BasePageElement):
text: str
hyperlink: Optional[Union[AnyUrl, Path]] = None


class FigureElement(BasePageElement):
Expand Down
55 changes: 53 additions & 2 deletions docling/models/stages/page_assemble/page_assemble_model.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import logging
import re
from collections.abc import Iterable
from typing import Dict, List
from pathlib import Path
from typing import Dict, List, Optional, Union

import numpy as np
from pydantic import BaseModel
from docling_core.types.doc import BoundingBox
from pydantic import AnyUrl, BaseModel, ValidationError

from docling.datamodel.base_models import (
AssembledUnit,
Expand Down Expand Up @@ -43,9 +45,56 @@ class PageAssembleOptions(BaseModel):


class PageAssembleModel(BasePageModel):
# Minimum fraction of a cluster's area that a hyperlink rect must cover
# to be considered a match (avoids false positives from adjacent links).
_HYPERLINK_COVERAGE_THRESHOLD = 0.5

def __init__(self, options: PageAssembleOptions):
self.options = options

@staticmethod
def _match_hyperlink(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hussainarslan For the hyperlinks that are not matched, it might be nice to simply propagate them in a different content-layer.

With the current approach, you might lose hyperlinks that were not matched to any cluster.

Copy link
Author

@hussainarslan hussainarslan Mar 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@PeterStaar-IBM Great idea! The FURNITURE or NOTES content-layer could work well for missed hyperlinks. If that sounds reasonable, I can add it, but I'm happy to adjust if you have another suggestion.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hussainarslan For the hyperlinks that are not matched, it might be nice to simply propagate them in a different content-layer.

With the current approach, you might lose hyperlinks that were not matched to any cluster.

@PeterStaar-IBM I have added logic that recovers those unmatched hyperlinks as reference items. Let me know if that works.

cluster_bbox: BoundingBox,
page: Page,
) -> Optional[Union[AnyUrl, Path]]:
"""Pick the hyperlink annotation with the highest spatial overlap on cluster_bbox.

Hyperlink rects are BOTTOMLEFT-origin; cluster bboxes are TOPLEFT-origin.
"""
if page.parsed_page is None or not page.parsed_page.hyperlinks:
return None

if page.size is None:
return None

page_height = page.size.height

# Accumulate coverage per URI — a single hyperlink may span multiple
# annotation rectangles (e.g. a URL that wraps across lines).
coverage_by_uri: Dict[str, float] = {}

for hl in page.parsed_page.hyperlinks:
if hl.uri is None:
continue

uri_str = str(hl.uri)
hl_bbox = hl.rect.to_bounding_box().to_top_left_origin(page_height)
coverage_by_uri[uri_str] = coverage_by_uri.get(
uri_str, 0.0
) + cluster_bbox.intersection_over_self(hl_bbox)

if not coverage_by_uri:
return None

best_uri = max(coverage_by_uri.items(), key=lambda x: x[1])[0]
if coverage_by_uri[best_uri] < PageAssembleModel._HYPERLINK_COVERAGE_THRESHOLD:
return None

try:
return AnyUrl(best_uri)
except ValidationError:
return Path(best_uri)

def sanitize_text(self, lines):
if len(lines) == 0:
return ""
Expand Down Expand Up @@ -111,10 +160,12 @@ def __call__(
if len(cell.text.strip()) > 0
]
text = self.sanitize_text(textlines)
hyperlink = self._match_hyperlink(cluster.bbox, page)
text_el = TextElement(
label=cluster.label,
id=cluster.id,
text=text,
hyperlink=hyperlink,
page_no=page.page_no,
cluster=cluster,
)
Expand Down
20 changes: 17 additions & 3 deletions docling/models/stages/reading_order/readingorder_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,11 @@ def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
)
new_item = out_doc.add_text(
label=elem.label, text=text, prov=prov, parent=parent
label=elem.label,
text=text,
prov=prov,
parent=parent,
hyperlink=elem.hyperlink,
)
return new_item

Expand All @@ -366,14 +370,20 @@ def _handle_text_element(self, element, out_doc, current_list, page_height):

# TODO: Infer if this is a numbered or a bullet list item
new_item = out_doc.add_list_item(
text=cap_text, enumerated=False, prov=prov, parent=current_list
text=cap_text,
enumerated=False,
prov=prov,
parent=current_list,
hyperlink=element.hyperlink,
)
self.list_item_processor.process_list_item(new_item)

elif label == DocItemLabel.SECTION_HEADER:
current_list = None

new_item = out_doc.add_heading(text=cap_text, prov=prov)
new_item = out_doc.add_heading(
text=cap_text, prov=prov, hyperlink=element.hyperlink
)
elif label == DocItemLabel.FORMULA:
current_list = None

Expand All @@ -392,6 +402,7 @@ def _handle_text_element(self, element, out_doc, current_list, page_height):
text=cap_text,
prov=prov,
content_layer=content_layer,
hyperlink=element.hyperlink,
)
return new_item, current_list

Expand All @@ -414,6 +425,9 @@ def _merge_elements(self, element, merged_elem, new_item, page_height):
new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete, we don't have the `orig` field of the merged element.
new_item.prov.append(prov)

if new_item.hyperlink != merged_elem.hyperlink:
new_item.hyperlink = None

def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
with TimeRecorder(conv_res, "reading_order", scope=ProfilingScope.DOCUMENT):
page_elements = self._assembled_to_readingorder_elements(conv_res)
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/2203.01017v2.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "2203.01017v2",
"origin": {
"mimetype": "application/pdf",
Expand Down
8 changes: 5 additions & 3 deletions tests/data/groundtruth/docling_v2/2206.01062.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "2206.01062",
"origin": {
"mimetype": "application/pdf",
Expand Down Expand Up @@ -9609,7 +9609,8 @@
}
],
"orig": "1 https://developer.ibm.com/exchanges/data/all/doclaynet",
"text": "1 https://developer.ibm.com/exchanges/data/all/doclaynet"
"text": "1 https://developer.ibm.com/exchanges/data/all/doclaynet",
"hyperlink": "https://developer.ibm.com/exchanges/data/all/doclaynet"
},
{
"self_ref": "#/texts/335",
Expand Down Expand Up @@ -10789,7 +10790,8 @@
}
],
"orig": "3 https://arxiv.org/",
"text": "3 https://arxiv.org/"
"text": "3 https://arxiv.org/",
"hyperlink": "https://arxiv.org/"
},
{
"self_ref": "#/texts/378",
Expand Down
4 changes: 2 additions & 2 deletions tests/data/groundtruth/docling_v2/2206.01062.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ In this paper, we present the DocLayNet dataset. It provides pageby-page layout
- (3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.
- (4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.

1 https://developer.ibm.com/exchanges/data/all/doclaynet
[1 https://developer.ibm.com/exchanges/data/all/doclaynet](https://developer.ibm.com/exchanges/data/all/doclaynet)

This enables experimentation with annotation uncertainty and quality control analysis.

Expand Down Expand Up @@ -133,7 +133,7 @@ Preparation work included uploading and parsing the sourced PDF documents in the

Phase 2: Label selection and guideline. We reviewed the collected documents and identified the most common structural features they exhibit. This was achieved by identifying recurrent layout elements and lead us to the definition of 11 distinct class labels. These 11 class labels are Caption , Footnote , Formula , List-item , Pagefooter , Page-header , Picture , Section-header , Table , Text , and Title . Critical factors that were considered for the choice of these class labels were (1) the overall occurrence of the label, (2) the specificity of the label, (3) recognisability on a single page (i.e. no need for context from previous or next page) and (4) overall coverage of the page. Specificity ensures that the choice of label is not ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable by discriminating on

3 https://arxiv.org/
[3 https://arxiv.org/](https://arxiv.org/)

the textual content of an element, which goes beyond visual layout recognition, in particular outside the Scientific Articles category.

Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "2305.03393v1-pg9",
"origin": {
"mimetype": "application/pdf",
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/2305.03393v1.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "2305.03393v1",
"origin": {
"mimetype": "application/pdf",
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/amt_handbook_sample.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "amt_handbook_sample",
"origin": {
"mimetype": "application/pdf",
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/code_and_formula.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "code_and_formula",
"origin": {
"mimetype": "application/pdf",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "docx_external_image",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "docx_grouped_images",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "docx_rich_cells",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Expand Down
4 changes: 2 additions & 2 deletions tests/data/groundtruth/docling_v2/drawingml.docx.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/equations.docx.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "equations",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "list_after_num_headers",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "lorem_ipsum",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/multi_page.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "multi_page",
"origin": {
"mimetype": "application/pdf",
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/normal_4pages.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "normal_4pages",
"origin": {
"mimetype": "application/pdf",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "omml_frac_superscript",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/omml_func_log.docx.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "omml_func_log",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.9.0",
"version": "1.11.0",
"name": "omml_multi_equation_paragraph",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.9.0",
"version": "1.11.0",
"name": "omml_text_escapes_in_math",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "picture_classification",
"origin": {
"mimetype": "application/pdf",
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/redp5110_sampled.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "redp5110_sampled",
"origin": {
"mimetype": "application/pdf",
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/right_to_left_01.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "right_to_left_01",
"origin": {
"mimetype": "application/pdf",
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/right_to_left_02.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "right_to_left_02",
"origin": {
"mimetype": "application/pdf",
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/right_to_left_03.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "right_to_left_03",
"origin": {
"mimetype": "application/pdf",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "table_with_equations",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Expand Down
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/tablecell.docx.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.10.0",
"version": "1.11.0",
"name": "tablecell",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Expand Down
Loading
Loading