Skip to content

Commit 57c40b1

Browse files
committed
Merge branch 'main' of github.com:DS4SD/docling into cau/layout_vlm_pipeline
2 parents ed68b47 + 0700af2 commit 57c40b1

25 files changed

+1778
-446
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
## [v2.51.0](https://github.com/docling-project/docling/releases/tag/v2.51.0) - 2025-09-05
2+
3+
### Feature
4+
5+
* Updating default parameters to get better performance with docling-parse ([#2208](https://github.com/docling-project/docling/issues/2208)) ([`b49d1ad`](https://github.com/docling-project/docling/commit/b49d1ad4f1af6eeadc3f8d0e35123dc52c6e228e))
6+
* Updated the backend for new docling-parse ([#2187](https://github.com/docling-project/docling/issues/2187)) ([`b3d7542`](https://github.com/docling-project/docling/commit/b3d754206172d08d6d01f29f132dcb66383f955b))
7+
8+
### Documentation
9+
10+
* Add information extraction example ([#2199](https://github.com/docling-project/docling/issues/2199)) ([`a9f41b0`](https://github.com/docling-project/docling/commit/a9f41b088eae6f1ffe34d567057f80180f445a05))
11+
112
## [v2.50.0](https://github.com/docling-project/docling/releases/tag/v2.50.0) - 2025-09-03
213

314
### Feature

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
3838
* 🔍 Extensive OCR support for scanned PDFs and images
3939
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
4040
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
41+
* 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
4142
* 💻 Simple and convenient CLI
4243

4344
### What's new
4445
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
46+
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
47+
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
4548

4649
### Coming soon
4750

4851
* 📝 Metadata extraction, including title, authors, references & language
4952
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc.)
5053
* 📝 Complex chemistry understanding (Molecular structures)
54+
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
5155

5256
## Installation
5357

@@ -73,7 +77,7 @@ result = converter.convert(source)
7377
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
7478
```
7579

76-
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
80+
More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
7781
the docs.
7882

7983
## CLI

docling/cli/main.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
from docling.datamodel.document import ConversionResult
4949
from docling.datamodel.pipeline_options import (
5050
AsrPipelineOptions,
51+
ConvertPipelineOptions,
5152
EasyOcrOptions,
5253
OcrOptions,
5354
PaginatedPipelineOptions,
@@ -73,8 +74,13 @@
7374
from docling.document_converter import (
7475
AudioFormatOption,
7576
DocumentConverter,
77+
ExcelFormatOption,
7678
FormatOption,
79+
HTMLFormatOption,
80+
MarkdownFormatOption,
7781
PdfFormatOption,
82+
PowerpointFormatOption,
83+
WordFormatOption,
7884
)
7985
from docling.models.factories import get_ocr_factory
8086
from docling.pipeline.asr_pipeline import AsrPipeline
@@ -628,10 +634,33 @@ def convert( # noqa: C901
628634
backend=MetsGbsDocumentBackend,
629635
)
630636

637+
# SimplePipeline options
638+
simple_format_option = ConvertPipelineOptions(
639+
do_picture_description=enrich_picture_description,
640+
do_picture_classification=enrich_picture_classes,
641+
)
642+
if artifacts_path is not None:
643+
simple_format_option.artifacts_path = artifacts_path
644+
631645
format_options = {
632646
InputFormat.PDF: pdf_format_option,
633647
InputFormat.IMAGE: pdf_format_option,
634648
InputFormat.METS_GBS: mets_gbs_format_option,
649+
InputFormat.DOCX: WordFormatOption(
650+
pipeline_options=simple_format_option
651+
),
652+
InputFormat.PPTX: PowerpointFormatOption(
653+
pipeline_options=simple_format_option
654+
),
655+
InputFormat.XLSX: ExcelFormatOption(
656+
pipeline_options=simple_format_option
657+
),
658+
InputFormat.HTML: HTMLFormatOption(
659+
pipeline_options=simple_format_option
660+
),
661+
InputFormat.MD: MarkdownFormatOption(
662+
pipeline_options=simple_format_option
663+
),
635664
}
636665

637666
elif pipeline == ProcessingPipeline.VLM:

docling/datamodel/pipeline_options.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ class EasyOcrOptions(OcrOptions):
135135
recog_network: Optional[str] = "standard"
136136
download_enabled: bool = True
137137

138+
suppress_mps_warnings: bool = True
139+
138140
model_config = ConfigDict(
139141
extra="forbid",
140142
protected_namespaces=(),
@@ -257,11 +259,21 @@ class PipelineOptions(BaseOptions):
257259
accelerator_options: AcceleratorOptions = AcceleratorOptions()
258260
enable_remote_services: bool = False
259261
allow_external_plugins: bool = False
262+
artifacts_path: Optional[Union[Path, str]] = None
260263

261264

262-
class PaginatedPipelineOptions(PipelineOptions):
263-
artifacts_path: Optional[Union[Path, str]] = None
265+
class ConvertPipelineOptions(PipelineOptions):
266+
"""Base convert pipeline options."""
267+
268+
do_picture_classification: bool = False # True: classify pictures in documents
269+
270+
do_picture_description: bool = False # True: describe pictures in documents
271+
picture_description_options: PictureDescriptionBaseOptions = (
272+
smolvlm_picture_description
273+
)
274+
264275

276+
class PaginatedPipelineOptions(ConvertPipelineOptions):
265277
images_scale: float = 1.0
266278
generate_page_images: bool = False
267279
generate_picture_images: bool = False
@@ -293,13 +305,11 @@ class LayoutOptions(BaseModel):
293305

294306
class AsrPipelineOptions(PipelineOptions):
295307
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
296-
artifacts_path: Optional[Union[Path, str]] = None
297308

298309

299310
class VlmExtractionPipelineOptions(PipelineOptions):
300311
"""Options for extraction pipeline."""
301312

302-
artifacts_path: Optional[Union[Path, str]] = None
303313
vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
304314

305315

@@ -310,18 +320,13 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
310320
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
311321
do_code_enrichment: bool = False # True: perform code OCR
312322
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
313-
do_picture_classification: bool = False # True: classify pictures in documents
314-
do_picture_description: bool = False # True: run describe pictures in documents
315323
force_backend_text: bool = (
316324
False # (To be used with vlms, or other generative models)
317325
)
318326
# If True, text from backend will be used instead of generated text
319327

320328
table_structure_options: TableStructureOptions = TableStructureOptions()
321329
ocr_options: OcrOptions = EasyOcrOptions()
322-
picture_description_options: PictureDescriptionBaseOptions = (
323-
smolvlm_picture_description
324-
)
325330
layout_options: LayoutOptions = LayoutOptions()
326331

327332
images_scale: float = 1.0

docling/models/base_model.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,13 @@
44
from typing import Any, Generic, Optional, Protocol, Type, Union
55

66
import numpy as np
7-
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
7+
from docling_core.types.doc import (
8+
BoundingBox,
9+
DocItem,
10+
DoclingDocument,
11+
NodeItem,
12+
PictureItem,
13+
)
814
from PIL.Image import Image
915
from typing_extensions import TypeVar
1016

@@ -164,8 +170,17 @@ def prepare_element(
164170
return None
165171

166172
assert isinstance(element, DocItem)
167-
element_prov = element.prov[0]
168173

174+
# Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
175+
if len(element.prov) == 0 and isinstance(element, PictureItem):
176+
embedded_im = element.get_image(conv_res.document)
177+
if embedded_im is not None:
178+
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
179+
else:
180+
return None
181+
182+
# Crop the image from the page
183+
element_prov = element.prov[0]
169184
bbox = element_prov.bbox
170185
width = bbox.r - bbox.l
171186
height = bbox.t - bbox.b
@@ -183,4 +198,14 @@ def prepare_element(
183198
cropped_image = conv_res.pages[page_ix].get_image(
184199
scale=self.images_scale, cropbox=expanded_bbox
185200
)
201+
202+
# Allow for images being embedded without the page backend or page images
203+
if cropped_image is None and isinstance(element, PictureItem):
204+
embedded_im = element.get_image(conv_res.document)
205+
if embedded_im is not None:
206+
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
207+
else:
208+
return None
209+
210+
# Return the proper cropped image
186211
return ItemAndImageEnrichmentElement(item=element, image=cropped_image)

docling/models/easyocr_model.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -78,14 +78,17 @@ def __init__(
7878
download_enabled = False
7979
model_storage_directory = str(artifacts_path / self._model_repo_folder)
8080

81-
self.reader = easyocr.Reader(
82-
lang_list=self.options.lang,
83-
gpu=use_gpu,
84-
model_storage_directory=model_storage_directory,
85-
recog_network=self.options.recog_network,
86-
download_enabled=download_enabled,
87-
verbose=False,
88-
)
81+
with warnings.catch_warnings():
82+
if self.options.suppress_mps_warnings:
83+
warnings.filterwarnings("ignore", message=".*pin_memory.*MPS.*")
84+
self.reader = easyocr.Reader(
85+
lang_list=self.options.lang,
86+
gpu=use_gpu,
87+
model_storage_directory=model_storage_directory,
88+
recog_network=self.options.recog_network,
89+
download_enabled=download_enabled,
90+
verbose=False,
91+
)
8992

9093
@staticmethod
9194
def download_models(
@@ -147,7 +150,14 @@ def __call__(
147150
scale=self.scale, cropbox=ocr_rect
148151
)
149152
im = numpy.array(high_res_image)
150-
result = self.reader.readtext(im)
153+
154+
with warnings.catch_warnings():
155+
if self.options.suppress_mps_warnings:
156+
warnings.filterwarnings(
157+
"ignore", message=".*pin_memory.*MPS.*"
158+
)
159+
160+
result = self.reader.readtext(im)
151161

152162
del high_res_image
153163
del im

docling/models/picture_description_vlm_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def __init__(
6767
self.model = AutoModelForImageTextToText.from_pretrained(
6868
artifacts_path,
6969
device_map=self.device,
70-
torch_dtype=torch.bfloat16,
70+
dtype=torch.bfloat16,
7171
_attn_implementation=(
7272
"flash_attention_2"
7373
if self.device.startswith("cuda")

docling/models/vlm_models_inline/hf_transformers_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def __init__(
112112
self.vlm_model = model_cls.from_pretrained(
113113
artifacts_path,
114114
device_map=self.device,
115-
torch_dtype=self.vlm_options.torch_dtype,
115+
dtype=self.vlm_options.torch_dtype,
116116
_attn_implementation=(
117117
"flash_attention_2"
118118
if self.device.startswith("cuda")

docling/models/vlm_models_inline/nuextract_transformers_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def __init__(
144144
self.vlm_model = AutoModelForImageTextToText.from_pretrained(
145145
artifacts_path,
146146
device_map=self.device,
147-
torch_dtype=self.vlm_options.torch_dtype,
147+
dtype=self.vlm_options.torch_dtype,
148148
_attn_implementation=(
149149
"flash_attention_2"
150150
if self.device.startswith("cuda")

docling/pipeline/asr_pipeline.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -208,25 +208,13 @@ def __init__(self, pipeline_options: AsrPipelineOptions):
208208

209209
self.pipeline_options: AsrPipelineOptions = pipeline_options
210210

211-
artifacts_path: Optional[Path] = None
212-
if pipeline_options.artifacts_path is not None:
213-
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
214-
elif settings.artifacts_path is not None:
215-
artifacts_path = Path(settings.artifacts_path).expanduser()
216-
217-
if artifacts_path is not None and not artifacts_path.is_dir():
218-
raise RuntimeError(
219-
f"The value of {artifacts_path=} is not valid. "
220-
"When defined, it must point to a folder containing all models required by the pipeline."
221-
)
222-
223211
if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
224212
asr_options: InlineAsrNativeWhisperOptions = (
225213
self.pipeline_options.asr_options
226214
)
227215
self._model = _NativeWhisperModel(
228216
enabled=True, # must be always enabled for this pipeline to make sense.
229-
artifacts_path=artifacts_path,
217+
artifacts_path=self.artifacts_path,
230218
accelerator_options=pipeline_options.accelerator_options,
231219
asr_options=asr_options,
232220
)

0 commit comments

Comments
 (0)