docling-project
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 11 additions & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 5 additions & 1 deletion
diff --git a/docling/cli/main.py b/docling/cli/main.py
Lines changed: 29 additions & 0 deletions
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
Lines changed: 14 additions & 9 deletions
diff --git a/docling/models/base_model.py b/docling/models/base_model.py
Lines changed: 27 additions & 2 deletions
diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py
Lines changed: 19 additions & 9 deletions
diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py
Lines changed: 1 addition & 1 deletion
diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py
Lines changed: 1 addition & 1 deletion
diff --git a/docling/models/vlm_models_inline/nuextract_transformers_model.py b/docling/models/vlm_models_inline/nuextract_transformers_model.py
Lines changed: 1 addition & 1 deletion
diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py
Lines changed: 1 addition & 13 deletions
@@ -1,3 +1,14 @@
+## [v2.51.0](https://github.com/docling-project/docling/releases/tag/v2.51.0) - 2025-09-05
+
+### Feature
+
+* Updating default parameters to get better performance with docling-parse ([#2208](https://github.com/docling-project/docling/issues/2208)) ([`b49d1ad`](https://github.com/docling-project/docling/commit/b49d1ad4f1af6eeadc3f8d0e35123dc52c6e228e))
+* Updated the backend for new docling-parse ([#2187](https://github.com/docling-project/docling/issues/2187)) ([`b3d7542`](https://github.com/docling-project/docling/commit/b3d754206172d08d6d01f29f132dcb66383f955b))
+
+### Documentation
+
+* Add information extraction example ([#2199](https://github.com/docling-project/docling/issues/2199)) ([`a9f41b0`](https://github.com/docling-project/docling/commit/a9f41b088eae6f1ffe34d567057f80180f445a05))
+
 ## [v2.50.0](https://github.com/docling-project/docling/releases/tag/v2.50.0) - 2025-09-03
 
 ### Feature
 
@@ -38,16 +38,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 🔍 Extensive OCR support for scanned PDFs and images
 * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
 * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
+* 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
 * 💻 Simple and convenient CLI
 
 ### What's new
 * 📤 Structured [information extraction][extraction] \[🧪 beta\]
+* 📑 New layout model (**Heron**) by default, for faster PDF parsing
+* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
 
 ### Coming soon
 
 * 📝 Metadata extraction, including title, authors, references & language
 * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
 * 📝 Complex chemistry understanding (Molecular structures)
+* 📝 Parsing of Web Video Text Tracks (WebVTT) files
 
 ## Installation
 
@@ -73,7 +77,7 @@ result = converter.convert(source)
 print(result.document.export_to_markdown())  # output: "## Docling Technical Report[...]"
 ```
 
-More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
+More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
 the docs.
 
 ## CLI
 
@@ -48,6 +48,7 @@
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     AsrPipelineOptions,
+    ConvertPipelineOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
@@ -73,8 +74,13 @@
 from docling.document_converter import (
     AudioFormatOption,
     DocumentConverter,
+    ExcelFormatOption,
     FormatOption,
+    HTMLFormatOption,
+    MarkdownFormatOption,
     PdfFormatOption,
+    PowerpointFormatOption,
+    WordFormatOption,
 )
 from docling.models.factories import get_ocr_factory
 from docling.pipeline.asr_pipeline import AsrPipeline
@@ -628,10 +634,33 @@ def convert(  # noqa: C901
                 backend=MetsGbsDocumentBackend,
             )
 
+            # SimplePipeline options
+            simple_format_option = ConvertPipelineOptions(
+                do_picture_description=enrich_picture_description,
+                do_picture_classification=enrich_picture_classes,
+            )
+            if artifacts_path is not None:
+                simple_format_option.artifacts_path = artifacts_path
+
             format_options = {
                 InputFormat.PDF: pdf_format_option,
                 InputFormat.IMAGE: pdf_format_option,
                 InputFormat.METS_GBS: mets_gbs_format_option,
+                InputFormat.DOCX: WordFormatOption(
+                    pipeline_options=simple_format_option
+                ),
+                InputFormat.PPTX: PowerpointFormatOption(
+                    pipeline_options=simple_format_option
+                ),
+                InputFormat.XLSX: ExcelFormatOption(
+                    pipeline_options=simple_format_option
+                ),
+                InputFormat.HTML: HTMLFormatOption(
+                    pipeline_options=simple_format_option
+                ),
+                InputFormat.MD: MarkdownFormatOption(
+                    pipeline_options=simple_format_option
+                ),
             }
 
         elif pipeline == ProcessingPipeline.VLM:
 
@@ -135,6 +135,8 @@ class EasyOcrOptions(OcrOptions):
     recog_network: Optional[str] = "standard"
     download_enabled: bool = True
 
+    suppress_mps_warnings: bool = True
+
     model_config = ConfigDict(
         extra="forbid",
         protected_namespaces=(),
@@ -257,11 +259,21 @@ class PipelineOptions(BaseOptions):
     accelerator_options: AcceleratorOptions = AcceleratorOptions()
     enable_remote_services: bool = False
     allow_external_plugins: bool = False
+    artifacts_path: Optional[Union[Path, str]] = None
 
 
-class PaginatedPipelineOptions(PipelineOptions):
-    artifacts_path: Optional[Union[Path, str]] = None
+class ConvertPipelineOptions(PipelineOptions):
+    """Base convert pipeline options."""
+
+    do_picture_classification: bool = False  # True: classify pictures in documents
+
+    do_picture_description: bool = False  # True: describe pictures in documents
+    picture_description_options: PictureDescriptionBaseOptions = (
+        smolvlm_picture_description
+    )
+
 
+class PaginatedPipelineOptions(ConvertPipelineOptions):
     images_scale: float = 1.0
     generate_page_images: bool = False
     generate_picture_images: bool = False
@@ -293,13 +305,11 @@ class LayoutOptions(BaseModel):
 
 class AsrPipelineOptions(PipelineOptions):
     asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
-    artifacts_path: Optional[Union[Path, str]] = None
 
 
 class VlmExtractionPipelineOptions(PipelineOptions):
     """Options for extraction pipeline."""
 
-    artifacts_path: Optional[Union[Path, str]] = None
     vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
 
 
@@ -310,18 +320,13 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
     do_code_enrichment: bool = False  # True: perform code OCR
     do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
-    do_picture_classification: bool = False  # True: classify pictures in documents
-    do_picture_description: bool = False  # True: run describe pictures in documents
     force_backend_text: bool = (
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: OcrOptions = EasyOcrOptions()
-    picture_description_options: PictureDescriptionBaseOptions = (
-        smolvlm_picture_description
-    )
     layout_options: LayoutOptions = LayoutOptions()
 
     images_scale: float = 1.0
 
@@ -4,7 +4,13 @@
 from typing import Any, Generic, Optional, Protocol, Type, Union
 
 import numpy as np
-from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
+from docling_core.types.doc import (
+    BoundingBox,
+    DocItem,
+    DoclingDocument,
+    NodeItem,
+    PictureItem,
+)
 from PIL.Image import Image
 from typing_extensions import TypeVar
 
@@ -164,8 +170,17 @@ def prepare_element(
             return None
 
         assert isinstance(element, DocItem)
-        element_prov = element.prov[0]
 
+        # Handle documents without page images but with embedded images (e.g. Word and HTML docs)
+        if len(element.prov) == 0 and isinstance(element, PictureItem):
+            embedded_im = element.get_image(conv_res.document)
+            if embedded_im is not None:
+                return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
+            else:
+                return None
+
+        # Crop the image from the page
+        element_prov = element.prov[0]
         bbox = element_prov.bbox
         width = bbox.r - bbox.l
         height = bbox.t - bbox.b
@@ -183,4 +198,14 @@ def prepare_element(
         cropped_image = conv_res.pages[page_ix].get_image(
             scale=self.images_scale, cropbox=expanded_bbox
         )
+
+        # Allow for images being embedded without the page backend or page images
+        if cropped_image is None and isinstance(element, PictureItem):
+            embedded_im = element.get_image(conv_res.document)
+            if embedded_im is not None:
+                return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
+            else:
+                return None
+
+        # Return the proper cropped image
         return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
@@ -78,14 +78,17 @@ def __init__(
                 download_enabled = False
                 model_storage_directory = str(artifacts_path / self._model_repo_folder)
 
-            self.reader = easyocr.Reader(
-                lang_list=self.options.lang,
-                gpu=use_gpu,
-                model_storage_directory=model_storage_directory,
-                recog_network=self.options.recog_network,
-                download_enabled=download_enabled,
-                verbose=False,
-            )
+            with warnings.catch_warnings():
+                if self.options.suppress_mps_warnings:
+                    warnings.filterwarnings("ignore", message=".*pin_memory.*MPS.*")
+                self.reader = easyocr.Reader(
+                    lang_list=self.options.lang,
+                    gpu=use_gpu,
+                    model_storage_directory=model_storage_directory,
+                    recog_network=self.options.recog_network,
+                    download_enabled=download_enabled,
+                    verbose=False,
+                )
 
     @staticmethod
     def download_models(
@@ -147,7 +150,14 @@ def __call__(
                             scale=self.scale, cropbox=ocr_rect
                         )
                         im = numpy.array(high_res_image)
-                        result = self.reader.readtext(im)
+
+                        with warnings.catch_warnings():
+                            if self.options.suppress_mps_warnings:
+                                warnings.filterwarnings(
+                                    "ignore", message=".*pin_memory.*MPS.*"
+                                )
+
+                            result = self.reader.readtext(im)
 
                         del high_res_image
                         del im
 
@@ -67,7 +67,7 @@ def __init__(
                 self.model = AutoModelForImageTextToText.from_pretrained(
                     artifacts_path,
                     device_map=self.device,
-                    torch_dtype=torch.bfloat16,
+                    dtype=torch.bfloat16,
                     _attn_implementation=(
                         "flash_attention_2"
                         if self.device.startswith("cuda")
 
@@ -112,7 +112,7 @@ def __init__(
             self.vlm_model = model_cls.from_pretrained(
                 artifacts_path,
                 device_map=self.device,
-                torch_dtype=self.vlm_options.torch_dtype,
+                dtype=self.vlm_options.torch_dtype,
                 _attn_implementation=(
                     "flash_attention_2"
                     if self.device.startswith("cuda")
 
@@ -144,7 +144,7 @@ def __init__(
             self.vlm_model = AutoModelForImageTextToText.from_pretrained(
                 artifacts_path,
                 device_map=self.device,
-                torch_dtype=self.vlm_options.torch_dtype,
+                dtype=self.vlm_options.torch_dtype,
                 _attn_implementation=(
                     "flash_attention_2"
                     if self.device.startswith("cuda")
 
@@ -208,25 +208,13 @@ def __init__(self, pipeline_options: AsrPipelineOptions):
 
         self.pipeline_options: AsrPipelineOptions = pipeline_options
 
-        artifacts_path: Optional[Path] = None
-        if pipeline_options.artifacts_path is not None:
-            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
-        elif settings.artifacts_path is not None:
-            artifacts_path = Path(settings.artifacts_path).expanduser()
-
-        if artifacts_path is not None and not artifacts_path.is_dir():
-            raise RuntimeError(
-                f"The value of {artifacts_path=} is not valid. "
-                "When defined, it must point to a folder containing all models required by the pipeline."
-            )
-
         if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
             asr_options: InlineAsrNativeWhisperOptions = (
                 self.pipeline_options.asr_options
             )
             self._model = _NativeWhisperModel(
                 enabled=True,  # must be always enabled for this pipeline to make sense.
-                artifacts_path=artifacts_path,
+                artifacts_path=self.artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
                 asr_options=asr_options,
             )