
Commit ff351fd

docs: Describe examples (#2262)
* Update .py examples with clearer guidance, update out of date imports and calls
* Fix minimal.py string error, fix ruff format error
* Fix more CI issues

Signed-off-by: Mingxuan Zhao <[email protected]>
1 parent 0e95171 commit ff351fd

21 files changed (+608, -85 lines)

docs/examples/batch_convert.py

Lines changed: 45 additions & 1 deletion
@@ -1,3 +1,33 @@
+"""
+Batch convert multiple PDF files and export results in several formats.
+
+What this example does
+- Loads a small set of sample PDFs.
+- Runs the Docling PDF pipeline once per file.
+- Writes outputs to `scratch/` in multiple formats (JSON, HTML, Markdown, text, doctags, YAML).
+
+Prerequisites
+- Install Docling and dependencies as described in the repository README.
+- Ensure you can import `docling` from your Python environment.
+- YAML export requires `PyYAML` (`pip install pyyaml`).
+
+Input documents
+- By default, this example uses a few PDFs from `tests/data/pdf/` in the repo.
+- If you cloned without test data, or want to use your own files, edit
+  `input_doc_paths` below to point to PDFs on your machine.
+
+Output formats (controlled by flags)
+- `USE_V2 = True` enables the current Docling document exports (recommended).
+- `USE_LEGACY = False` keeps legacy Deep Search exports disabled.
+  You can set it to `True` if you need legacy formats for compatibility tests.
+
+Notes
+- Set `pipeline_options.generate_page_images = True` to include page images in HTML.
+- The script logs conversion progress and raises if any documents fail.
+- This example shows both helper methods like `save_as_*` and lower-level
+  `export_to_*` + manual file writes; outputs may overlap intentionally.
+"""
 import json
 import logging
 import time
@@ -15,6 +45,9 @@
 
 _log = logging.getLogger(__name__)
 
+# Export toggles:
+# - USE_V2 controls modern Docling document exports.
+# - USE_LEGACY enables legacy Deep Search exports for comparison or migration.
 USE_V2 = True
 USE_LEGACY = False
 
@@ -35,6 +68,9 @@ def export_documents(
         doc_filename = conv_res.input.file.stem
 
         if USE_V2:
+            # Recommended modern Docling exports. These helpers mirror the
+            # lower-level "export_to_*" methods used below, but handle
+            # common details like image handling.
             conv_res.document.save_as_json(
                 output_dir / f"{doc_filename}.json",
                 image_mode=ImageRefMode.PLACEHOLDER,
@@ -121,6 +157,9 @@ def export_documents(
 def main():
     logging.basicConfig(level=logging.INFO)
 
+    # Location of sample PDFs used by this example. If your checkout does not
+    # include test data, change `data_folder` or point `input_doc_paths` to
+    # your own files.
     data_folder = Path(__file__).parent / "../../tests/data"
     input_doc_paths = [
         data_folder / "pdf/2206.01062.pdf",
@@ -139,6 +178,8 @@ def main():
     # settings.debug.visualize_tables = True
     # settings.debug.visualize_cells = True
 
+    # Configure the PDF pipeline. Enabling page image generation improves HTML
+    # previews (embedded images) but adds processing time.
     pipeline_options = PdfPipelineOptions()
     pipeline_options.generate_page_images = True
 
@@ -152,11 +193,14 @@ def main():
 
     start_time = time.time()
 
+    # Convert all inputs. Set `raises_on_error=False` to keep processing other
+    # files even if one fails; errors are summarized after the run.
     conv_results = doc_converter.convert_all(
         input_doc_paths,
         raises_on_error=False,  # to let conversion run through all and examine results at the end
     )
-    success_count, partial_success_count, failure_count = export_documents(
+    # Write outputs to ./scratch and log a summary.
+    _success_count, _partial_success_count, failure_count = export_documents(
         conv_results, output_dir=Path("scratch")
     )
 
docs/examples/compare_vlm_models.py

Lines changed: 28 additions & 4 deletions
@@ -1,8 +1,28 @@
-# Compare VLM models
-# ==================
+# %% [markdown]
+# Compare different VLM models by running the VLM pipeline and timing outputs.
 #
-# This example runs the VLM pipeline with different vision-language models.
-# Their runtime as well output quality is compared.
+# What this example does
+# - Iterates through a list of VLM model configurations and converts the same file.
+# - Prints per-page generation times and saves JSON/MD/HTML to `scratch/`.
+# - Summarizes total inference time and pages processed in a table.
+#
+# Requirements
+# - Install `tabulate` for pretty printing (`pip install tabulate`).
+#
+# Prerequisites
+# - Install Docling with VLM extras. Ensure models can be downloaded or are available.
+#
+# How to run
+# - From the repo root: `python docs/examples/compare_vlm_models.py`.
+# - Results are saved to `scratch/` with filenames including the model and framework.
+#
+# Notes
+# - MLX models are skipped automatically on non-macOS platforms.
+# - On CUDA systems, you can enable flash_attention_2 (see commented lines).
+# - Running multiple VLMs can be GPU/CPU intensive and time-consuming; ensure
+#   enough VRAM/system RAM and close other memory-heavy apps.
+
+# %%
 
 import json
 import sys
@@ -31,6 +51,8 @@
 
 
 def convert(sources: list[Path], converter: DocumentConverter):
+    # Note: this helper assumes a single-item `sources` list. It returns after
+    # processing the first source to keep runtime/output focused.
     model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
     framework = pipeline_options.vlm_options.inference_framework
     for source in sources:
@@ -61,6 +83,8 @@ def convert(sources: list[Path], converter: DocumentConverter):
 
     print("===== Final output of the converted document =======")
 
+    # Manual export for illustration. Below, `save_as_json()` writes the same
+    # JSON again; kept intentionally to show both approaches.
     with (out_path / f"{fname}.json").open("w") as fp:
         fp.write(json.dumps(res.document.export_to_dict()))
 
docs/examples/custom_convert.py

Lines changed: 50 additions & 13 deletions
@@ -1,3 +1,39 @@
+# %% [markdown]
+# Customize PDF conversion by toggling OCR/backends and pipeline options.
+#
+# What this example does
+# - Shows several alternative configurations for the Docling PDF pipeline.
+# - Lets you try OCR engines (EasyOCR, Tesseract, system OCR) or no OCR.
+# - Converts a single sample PDF and exports results to `scratch/`.
+#
+# Prerequisites
+# - Install Docling and its optional OCR backends per the docs.
+# - Ensure you can import `docling` from your Python environment.
+#
+# How to run
+# - From the repository root, run: `python docs/examples/custom_convert.py`.
+# - Outputs are written under `scratch/` next to where you run the script.
+#
+# Choosing a configuration
+# - Only one configuration block should be active at a time.
+# - Uncomment exactly one of the sections below to experiment.
+# - The file ships with "Docling Parse with EasyOCR" enabled as a sensible default.
+# - If you uncomment a backend or OCR option that is not imported above, also
+#   import its class, e.g.:
+#   - `from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend`
+#   - `from docling.datamodel.pipeline_options import TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions`
+#
+# Input document
+# - Defaults to a single PDF from `tests/data/pdf/` in the repo.
+# - If you don't have the test data, update `input_doc_path` to a local PDF.
+#
+# Notes
+# - EasyOCR language: adjust `pipeline_options.ocr_options.lang` (e.g., ["en"], ["es"], ["en", "de"]).
+# - Accelerators: tune `AcceleratorOptions` to select CPU/GPU or threads.
+# - Exports: JSON, plain text, Markdown, and doctags are saved in `scratch/`.
+
+# %%
 
 import json
 import logging
 import time
@@ -21,9 +57,8 @@ def main():
 
     ###########################################################################
 
-    # The following sections contain a combination of PipelineOptions
-    # and PDF Backends for various configurations.
-    # Uncomment one section at the time to see the differences in the output.
+    # The sections below demo combinations of PdfPipelineOptions and backends.
+    # Tip: Uncomment exactly one section at a time to compare outputs.
 
     # PyPdfium without EasyOCR
    # --------------------
@@ -68,8 +103,10 @@ def main():
    # }
    # )
 
-    # Docling Parse with EasyOCR
-    # ----------------------
+    # Docling Parse with EasyOCR (default)
+    # -------------------------------
+    # Enables OCR and table structure with EasyOCR, using automatic device
+    # selection via AcceleratorOptions. Adjust languages as needed.
     pipeline_options = PdfPipelineOptions()
     pipeline_options.do_ocr = True
     pipeline_options.do_table_structure = True
@@ -86,7 +123,7 @@ def main():
     )
 
     # Docling Parse with EasyOCR (CPU only)
-    # ----------------------
+    # -------------------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.ocr_options.use_gpu = False  # <-- set this.
@@ -100,7 +137,7 @@ def main():
    # )
 
     # Docling Parse with Tesseract
-    # ----------------------
+    # ----------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
@@ -114,7 +151,7 @@ def main():
    # )
 
     # Docling Parse with Tesseract CLI
-    # ----------------------
+    # --------------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
@@ -127,8 +164,8 @@ def main():
    # }
    # )
 
-    # Docling Parse with ocrmac(Mac only)
-    # ----------------------
+    # Docling Parse with ocrmac (macOS only)
+    # --------------------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
@@ -154,13 +191,13 @@ def main():
     output_dir.mkdir(parents=True, exist_ok=True)
     doc_filename = conv_result.input.file.stem
 
-    # Export Deep Search document JSON format:
+    # Export Docling document JSON format:
     with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
         fp.write(json.dumps(conv_result.document.export_to_dict()))
 
-    # Export Text format:
+    # Export Text format (plain text via Markdown export):
     with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
-        fp.write(conv_result.document.export_to_text())
+        fp.write(conv_result.document.export_to_markdown(strict_text=True))
 
     # Export Markdown format:
     with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
docs/examples/develop_formula_understanding.py

Lines changed: 20 additions & 3 deletions
@@ -1,6 +1,21 @@
-# WARNING
-# This example demonstrates only how to develop a new enrichment model.
-# It does not run the actual formula understanding model.
+# %% [markdown]
+# Developing an enrichment model example (formula understanding: scaffold only).
+#
+# What this example does
+# - Shows how to define pipeline options, an enrichment model, and extend a pipeline.
+# - Displays cropped images of formula items and yields them back unchanged.
+#
+# Important
+# - This is a development scaffold; it does not run a real formula understanding model.
+#
+# How to run
+# - From the repo root: `python docs/examples/develop_formula_understanding.py`.
+#
+# Notes
+# - Set `do_formula_understanding=True` to enable the example enrichment stage.
+# - Extends `StandardPdfPipeline` and keeps the backend when enrichment is enabled.
+
+# %%
 
 import logging
 from collections.abc import Iterable
@@ -42,6 +57,8 @@ def __call__(
             return
 
         for enrich_element in element_batch:
+            # Opens a window for each cropped formula image; comment this out when
+            # running headless or processing many items to avoid blocking spam.
            enrich_element.image.show()
 
            yield enrich_element.item

docs/examples/develop_picture_enrichment.py

Lines changed: 19 additions & 4 deletions
@@ -1,6 +1,21 @@
-# WARNING
-# This example demonstrates only how to develop a new enrichment model.
-# It does not run the actual picture classifier model.
+# %% [markdown]
+# Developing a picture enrichment model (classifier scaffold only).
+#
+# What this example does
+# - Demonstrates how to implement an enrichment model that annotates pictures.
+# - Adds a dummy PictureClassificationData entry to each PictureItem.
+#
+# Important
+# - This is a scaffold for development; it does not run a real classifier.
+#
+# How to run
+# - From the repo root: `python docs/examples/develop_picture_enrichment.py`.
+#
+# Notes
+# - Enables picture image generation and sets `images_scale` to improve crops.
+# - Extends `StandardPdfPipeline` with a custom enrichment stage.
+
+# %%
 
 import logging
 from collections.abc import Iterable
@@ -43,7 +58,7 @@ def __call__(
            assert isinstance(element, PictureItem)
 
            # uncomment this to interactively visualize the image
-           # element.get_image(doc).show()
+           # element.get_image(doc).show()  # may block; avoid in headless runs
 
            element.annotations.append(
                PictureClassificationData(
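Both develop_* examples share one shape: an enrichment stage that receives a batch of items, annotates (or just inspects) each, and yields it back unchanged otherwise. Stripped of Docling types, the scaffold looks roughly like this (all names here are illustrative, not Docling's API):

```python
from collections.abc import Iterable
from dataclasses import dataclass, field


@dataclass
class Item:
    label: str
    annotations: list = field(default_factory=list)


class DummyEnrichmentModel:
    # Mirrors the examples' contract: consume a batch, yield every item back,
    # optionally attaching annotations along the way.
    def is_processable(self, item: Item) -> bool:
        return item.label == "picture"

    def __call__(self, element_batch: Iterable[Item]) -> Iterable[Item]:
        for item in element_batch:
            if self.is_processable(item):
                item.annotations.append("dummy-classification")
            yield item
```

Because the model is a generator, the pipeline can stream batches through it without materializing the whole document's items at once.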

docs/examples/enrich_doclingdocument.py

Lines changed: 24 additions & 3 deletions
@@ -1,6 +1,26 @@
-## Enrich DoclingDocument
-# This example allows to run Docling enrichment models on documents which have been already converted
-# and stored as serialized DoclingDocument JSON files.
+# %% [markdown]
+# Enrich an existing DoclingDocument JSON with a custom model (post-conversion).
+#
+# What this example does
+# - Loads a previously converted DoclingDocument from JSON (no reconversion).
+# - Uses a backend to crop images for items and runs an enrichment model in batches.
+# - Prints a few example annotations to stdout.
+#
+# Prerequisites
+# - A DoclingDocument JSON produced by another conversion (path configured below).
+# - Install Docling and dependencies for the chosen enrichment model.
+# - Ensure the JSON and the referenced PDF match (same document/version), so
+#   provenance bounding boxes line up for accurate cropping.
+#
+# How to run
+# - From the repo root: `python docs/examples/enrich_doclingdocument.py`.
+# - Adjust `input_doc_path` and `input_pdf_path` if your data is elsewhere.
+#
+# Notes
+# - `BATCH_SIZE` controls how many elements are passed to the model at once.
+# - `prepare_element()` crops context around elements based on the model's expansion.
+
+# %%
 
 ### Load modules
 
@@ -24,6 +44,7 @@
 ### Define batch size used for processing
 
 BATCH_SIZE = 4
+# Trade-off: larger batches improve throughput but increase memory usage.
 
 ### From DocItem to the model inputs
 # The following function is responsible for taking an item and applying the required pre-processing for the model.
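The `BATCH_SIZE` mechanic — feeding the model fixed-size groups of elements — is plain iterator chunking. A sketch using only `itertools`, independent of Docling:

```python
from itertools import islice


def batched(iterable, batch_size):
    # Yield lists of up to `batch_size` items, as the enrichment loop does
    # with BATCH_SIZE = 4; the final batch may be smaller.
    it = iter(iterable)
    while batch := list(islice(it, batch_size)):
        yield batch
```

Each yielded batch would then be handed to the enrichment model in one call, which is where the throughput-versus-memory trade-off noted above comes from.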
