docling-project · hussainarslan · Mar 14, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 20, 2026
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 from enum import Enum
+from pathlib import Path
 from typing import TYPE_CHECKING, Optional, Type, Union
 
 import numpy as np
@@ -20,6 +21,7 @@
 # DO NOT REMOVE; explicitly exposed from this location
 from PIL.Image import Image
 from pydantic import (
+    AnyUrl,
     BaseModel,
     ConfigDict,
     Field,
@@ -251,6 +253,7 @@ class TableStructurePrediction(BaseModel):
 
 class TextElement(BasePageElement):
     text: str
+    hyperlink: Optional[Union[AnyUrl, Path]] = None  # link target, if any
 
 
 class FigureElement(BasePageElement):

diff --git a/docling/models/stages/page_assemble/page_assemble_model.py b/docling/models/stages/page_assemble/page_assemble_model.py
@@ -1,10 +1,12 @@
 import logging
 import re
 from collections.abc import Iterable
-from typing import Dict, List
+from pathlib import Path
+from typing import Dict, List, Optional, Union
 
 import numpy as np
-from pydantic import BaseModel
+from docling_core.types.doc import BoundingBox
+from pydantic import AnyUrl, BaseModel, ValidationError
 
 from docling.datamodel.base_models import (
     AssembledUnit,
@@ -43,9 +45,56 @@ class PageAssembleOptions(BaseModel):
 
 
 class PageAssembleModel(BasePageModel):
+    # Minimum fraction of a cluster's area that a hyperlink rect must cover
+    # to be considered a match (avoids false positives from adjacent links).
+    _HYPERLINK_COVERAGE_THRESHOLD = 0.5
+
     def __init__(self, options: PageAssembleOptions):
         self.options = options
 
+    @staticmethod
+    def _match_hyperlink(
+        cluster_bbox: BoundingBox,
+        page: Page,
+    ) -> Optional[Union[AnyUrl, Path]]:
+        """Return the link target that best covers ``cluster_bbox``, if any.
+
+        Coverage is summed per URI over every annotation rect on the page, so
+        a link split across several rects (e.g. wrapped lines) counts once.
+        Hyperlink rects are BOTTOMLEFT-origin; they are converted to TOPLEFT.
+
+        Returns:
+            The URI with the largest summed coverage, as ``AnyUrl`` when it
+            validates and as ``Path`` otherwise; ``None`` when the page has no
+            parsed hyperlinks, no size, or the best coverage is below
+            ``_HYPERLINK_COVERAGE_THRESHOLD``.
+        """
+        parsed = page.parsed_page
+        if parsed is None or not parsed.hyperlinks or page.size is None:
+            return None
+        height = page.size.height
+        totals: Dict[str, float] = {}
+        for link in parsed.hyperlinks:
+            if link.uri is None:
+                continue
+            key = str(link.uri)
+            rect = link.rect.to_bounding_box().to_top_left_origin(height)
+            overlap = cluster_bbox.intersection_over_self(rect)
+            totals[key] = totals.get(key, 0.0) + overlap
+
+        if not totals:
+            return None
+
+        # On ties max() returns the first key, i.e. the URI seen first.
+        winner = max(totals, key=totals.__getitem__)
+        if totals[winner] < PageAssembleModel._HYPERLINK_COVERAGE_THRESHOLD:
+            return None
+
+        try:
+            return AnyUrl(winner)
+        except ValidationError:
+            return Path(winner)
+
     def sanitize_text(self, lines):
         if len(lines) == 0:
             return ""
@@ -111,10 +160,12 @@ def __call__(
                                 if len(cell.text.strip()) > 0
                             ]
                             text = self.sanitize_text(textlines)
+                            hyperlink = self._match_hyperlink(cluster.bbox, page)
                             text_el = TextElement(
                                 label=cluster.label,
                                 id=cluster.id,
                                 text=text,
+                                hyperlink=hyperlink,
                                 page_no=page.page_no,
                                 cluster=cluster,
                             )

diff --git a/docling/models/stages/reading_order/readingorder_model.py b/docling/models/stages/reading_order/readingorder_model.py
@@ -347,7 +347,11 @@ def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
             bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
         )
         new_item = out_doc.add_text(
-            label=elem.label, text=text, prov=prov, parent=parent
+            label=elem.label,
+            text=text,
+            prov=prov,
+            parent=parent,
+            hyperlink=elem.hyperlink,
         )
         return new_item
 
@@ -366,14 +370,20 @@ def _handle_text_element(self, element, out_doc, current_list, page_height):
 
             # TODO: Infer if this is a numbered or a bullet list item
             new_item = out_doc.add_list_item(
-                text=cap_text, enumerated=False, prov=prov, parent=current_list
+                text=cap_text,
+                enumerated=False,
+                prov=prov,
+                parent=current_list,
+                hyperlink=element.hyperlink,
             )
             self.list_item_processor.process_list_item(new_item)
 
         elif label == DocItemLabel.SECTION_HEADER:
             current_list = None
 
-            new_item = out_doc.add_heading(text=cap_text, prov=prov)
+            new_item = out_doc.add_heading(
+                text=cap_text, prov=prov, hyperlink=element.hyperlink
+            )
         elif label == DocItemLabel.FORMULA:
             current_list = None
 
@@ -392,6 +402,7 @@ def _handle_text_element(self, element, out_doc, current_list, page_height):
                 text=cap_text,
                 prov=prov,
                 content_layer=content_layer,
+                hyperlink=element.hyperlink,
             )
         return new_item, current_list
 
@@ -414,6 +425,9 @@ def _merge_elements(self, element, merged_elem, new_item, page_height):
         new_item.orig += f" {merged_elem.text}"  # TODO: This is incomplete, we don't have the `orig` field of the merged element.
         new_item.prov.append(prov)
 
+        if new_item.hyperlink != merged_elem.hyperlink:
+            new_item.hyperlink = None
+
     def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
         with TimeRecorder(conv_res, "reading_order", scope=ProfilingScope.DOCUMENT):
             page_elements = self._assembled_to_readingorder_elements(conv_res)

diff --git a/tests/data/groundtruth/docling_v2/2203.01017v2.json b/tests/data/groundtruth/docling_v2/2203.01017v2.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "2203.01017v2",
   "origin": {
     "mimetype": "application/pdf",

diff --git a/tests/data/groundtruth/docling_v2/2206.01062.json b/tests/data/groundtruth/docling_v2/2206.01062.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "2206.01062",
   "origin": {
     "mimetype": "application/pdf",
@@ -9609,7 +9609,8 @@
         }
       ],
       "orig": "1 https://developer.ibm.com/exchanges/data/all/doclaynet",
-      "text": "1 https://developer.ibm.com/exchanges/data/all/doclaynet"
+      "text": "1 https://developer.ibm.com/exchanges/data/all/doclaynet",
+      "hyperlink": "https://developer.ibm.com/exchanges/data/all/doclaynet"
     },
     {
       "self_ref": "#/texts/335",
@@ -10789,7 +10790,8 @@
         }
       ],
       "orig": "3 https://arxiv.org/",
-      "text": "3 https://arxiv.org/"
+      "text": "3 https://arxiv.org/",
+      "hyperlink": "https://arxiv.org/"
     },
     {
       "self_ref": "#/texts/378",

diff --git a/tests/data/groundtruth/docling_v2/2206.01062.md b/tests/data/groundtruth/docling_v2/2206.01062.md
@@ -57,7 +57,7 @@ In this paper, we present the DocLayNet dataset. It provides pageby-page layout
 - (3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.
 - (4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.
 
-1 https://developer.ibm.com/exchanges/data/all/doclaynet
+[1 https://developer.ibm.com/exchanges/data/all/doclaynet](https://developer.ibm.com/exchanges/data/all/doclaynet)
 
 This enables experimentation with annotation uncertainty and quality control analysis.
 
@@ -133,7 +133,7 @@ Preparation work included uploading and parsing the sourced PDF documents in the
 
 Phase 2: Label selection and guideline. We reviewed the collected documents and identified the most common structural features they exhibit. This was achieved by identifying recurrent layout elements and lead us to the definition of 11 distinct class labels. These 11 class labels are Caption , Footnote , Formula , List-item , Pagefooter , Page-header , Picture , Section-header , Table , Text , and Title . Critical factors that were considered for the choice of these class labels were (1) the overall occurrence of the label, (2) the specificity of the label, (3) recognisability on a single page (i.e. no need for context from previous or next page) and (4) overall coverage of the page. Specificity ensures that the choice of label is not ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable by discriminating on
 
-3 https://arxiv.org/
+[3 https://arxiv.org/](https://arxiv.org/)
 
 the textual content of an element, which goes beyond visual layout recognition, in particular outside the Scientific Articles category.
 

diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "2305.03393v1-pg9",
   "origin": {
     "mimetype": "application/pdf",

diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1.json b/tests/data/groundtruth/docling_v2/2305.03393v1.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "2305.03393v1",
   "origin": {
     "mimetype": "application/pdf",

diff --git a/tests/data/groundtruth/docling_v2/amt_handbook_sample.json b/tests/data/groundtruth/docling_v2/amt_handbook_sample.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "amt_handbook_sample",
   "origin": {
     "mimetype": "application/pdf",

diff --git a/tests/data/groundtruth/docling_v2/code_and_formula.json b/tests/data/groundtruth/docling_v2/code_and_formula.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "code_and_formula",
   "origin": {
     "mimetype": "application/pdf",

diff --git a/tests/data/groundtruth/docling_v2/docx_external_image.docx.json b/tests/data/groundtruth/docling_v2/docx_external_image.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "docx_external_image",
   "origin": {
     "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

diff --git a/tests/data/groundtruth/docling_v2/docx_grouped_images.docx.json b/tests/data/groundtruth/docling_v2/docx_grouped_images.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "docx_grouped_images",
   "origin": {
     "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

diff --git a/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.json b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "docx_rich_cells",
   "origin": {
     "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

diff --git a/tests/data/groundtruth/docling_v2/drawingml.docx.json b/tests/data/groundtruth/docling_v2/drawingml.docx.json
diff --git a/tests/data/groundtruth/docling_v2/equations.docx.json b/tests/data/groundtruth/docling_v2/equations.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "equations",
   "origin": {
     "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

diff --git a/tests/data/groundtruth/docling_v2/list_after_num_headers.docx.json b/tests/data/groundtruth/docling_v2/list_after_num_headers.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "list_after_num_headers",
   "origin": {
     "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

diff --git a/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "lorem_ipsum",
   "origin": {
     "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

diff --git a/tests/data/groundtruth/docling_v2/multi_page.json b/tests/data/groundtruth/docling_v2/multi_page.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "multi_page",
   "origin": {
     "mimetype": "application/pdf",

diff --git a/tests/data/groundtruth/docling_v2/normal_4pages.json b/tests/data/groundtruth/docling_v2/normal_4pages.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "normal_4pages",
   "origin": {
     "mimetype": "application/pdf",

diff --git a/tests/data/groundtruth/docling_v2/omml_frac_superscript.docx.json b/tests/data/groundtruth/docling_v2/omml_frac_superscript.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "omml_frac_superscript",
   "origin": {
     "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

diff --git a/tests/data/groundtruth/docling_v2/omml_func_log.docx.json b/tests/data/groundtruth/docling_v2/omml_func_log.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "omml_func_log",
   "origin": {
     "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

diff --git a/tests/data/groundtruth/docling_v2/omml_multi_equation_paragraph.docx.json b/tests/data/groundtruth/docling_v2/omml_multi_equation_paragraph.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.9.0",
+  "version": "1.11.0",
   "name": "omml_multi_equation_paragraph",
   "origin": {
     "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

diff --git a/tests/data/groundtruth/docling_v2/omml_text_escapes_in_math.docx.json b/tests/data/groundtruth/docling_v2/omml_text_escapes_in_math.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.9.0",
+  "version": "1.11.0",
   "name": "omml_text_escapes_in_math",
   "origin": {
     "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

diff --git a/tests/data/groundtruth/docling_v2/picture_classification.json b/tests/data/groundtruth/docling_v2/picture_classification.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "picture_classification",
   "origin": {
     "mimetype": "application/pdf",

diff --git a/tests/data/groundtruth/docling_v2/redp5110_sampled.json b/tests/data/groundtruth/docling_v2/redp5110_sampled.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "redp5110_sampled",
   "origin": {
     "mimetype": "application/pdf",

diff --git a/tests/data/groundtruth/docling_v2/right_to_left_01.json b/tests/data/groundtruth/docling_v2/right_to_left_01.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "right_to_left_01",
   "origin": {
     "mimetype": "application/pdf",

diff --git a/tests/data/groundtruth/docling_v2/right_to_left_02.json b/tests/data/groundtruth/docling_v2/right_to_left_02.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "right_to_left_02",
   "origin": {
     "mimetype": "application/pdf",

diff --git a/tests/data/groundtruth/docling_v2/right_to_left_03.json b/tests/data/groundtruth/docling_v2/right_to_left_03.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "right_to_left_03",
   "origin": {
     "mimetype": "application/pdf",

diff --git a/tests/data/groundtruth/docling_v2/table_with_equations.docx.json b/tests/data/groundtruth/docling_v2/table_with_equations.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "table_with_equations",
   "origin": {
     "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

diff --git a/tests/data/groundtruth/docling_v2/tablecell.docx.json b/tests/data/groundtruth/docling_v2/tablecell.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "name": "tablecell",
   "origin": {
     "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",