Add model downloader support

cau-git · cau-git · commit fc2ab41b5061 · 2026-03-16T14:53:11.000+01:00
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
diff --git a/docling/cli/models.py b/docling/cli/models.py
@@ -42,6 +42,7 @@ class _AvailableModels(str, Enum):
     GRANITE_CHART_EXTRACTION = "granite_chart_extraction"
     RAPIDOCR = "rapidocr"
     EASYOCR = "easyocr"
+    NEMOTRON_OCR = "nemotron_ocr"
 
 
 _default_models = [
@@ -123,6 +124,7 @@ def download(
         in to_download,
         with_rapidocr=_AvailableModels.RAPIDOCR in to_download,
         with_easyocr=_AvailableModels.EASYOCR in to_download,
+        with_nemotron_ocr=_AvailableModels.NEMOTRON_OCR in to_download,
     )
 
     if quiet:
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
@@ -329,7 +329,8 @@ class NemotronOcrOptions(OcrOptions):
     Notes:
         Nemotron OCR does not expose runtime language selection through its public
         API. The `lang` field is kept only for compatibility with the shared OCR
-        options interface.
+        options interface. Use the pipeline-level `artifacts_path` to point to
+        pre-downloaded checkpoint artifacts.
     """
 
     kind: ClassVar[Literal["nemotron-ocr"]] = "nemotron-ocr"
@@ -342,16 +343,6 @@ class NemotronOcrOptions(OcrOptions):
             )
         ),
     ] = []
-    model_dir: Annotated[
-        Optional[Path],
-        Field(
-            description=(
-                "Optional directory containing the Nemotron OCR checkpoint files "
-                "(`detector.pth`, `recognizer.pth`, `relational.pth`, `charset.txt`). "
-                "If omitted, the upstream package downloads them from Hugging Face."
-            )
-        ),
-    ] = None
     merge_level: Annotated[
         Literal["word", "sentence", "paragraph"],
         Field(
diff --git a/docling/models/stages/ocr/nemotron_ocr_model.py b/docling/models/stages/ocr/nemotron_ocr_model.py
@@ -18,6 +18,7 @@
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
@@ -53,6 +54,8 @@ class NemotronOcrPrediction(TypedDict):
 
 
 class NemotronOcrModel(BaseOcrModel):
+    _repo_id = "nvidia/nemotron-ocr-v1"
+
     def __init__(
         self,
         enabled: bool,
@@ -81,12 +84,10 @@ def __init__(
                     "Python 3.12 and CUDA 13.x."
                 ) from exc
 
-            model_dir = (
-                str(self.options.model_dir)
-                if self.options.model_dir is not None
-                else None
+            model_dir = self._resolve_model_dir(artifacts_path=artifacts_path)
+            self.reader = NemotronOCR(
+                model_dir=None if model_dir is None else str(model_dir)
             )
-            self.reader = NemotronOCR(model_dir=model_dir)
             # Install the storage workaround only at the upstream grid-sampler
             # boundary, keeping the rest of the Nemotron integration unchanged.
             self.reader.grid_sampler = _GridSamplerStorageWorkaround(
@@ -132,6 +133,51 @@ def _validate_runtime(cls, accelerator_options: AcceleratorOptions) -> None:
                 f"reports CUDA {cuda_version!r}."
             )
 
+    @classmethod
+    def _resolve_model_dir(cls, artifacts_path: Optional[Path]) -> Optional[Path]:
+        if artifacts_path is None:
+            return None
+
+        repo_cache_folder = cls._repo_id.replace("/", "--")
+        if (artifacts_path / repo_cache_folder).exists():
+            return artifacts_path / repo_cache_folder / "checkpoints"
+
+        available_dirs = []
+        if artifacts_path.exists():
+            available_dirs = sorted(
+                path.name for path in artifacts_path.iterdir() if path.is_dir()
+            )
+
+        raise FileNotFoundError(
+            "Nemotron OCR artifacts not found in artifacts_path.\n"
+            f"Expected location: {artifacts_path / repo_cache_folder / 'checkpoints'}\n"
+            f"Available directories in {artifacts_path}: {available_dirs}\n"
+            "Use `docling-tools models download nemotron_ocr` to pre-download "
+            "the checkpoints or unset artifacts_path to allow the upstream "
+            "package to download them."
+        )
+
+    @staticmethod
+    def download_models(
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        if local_dir is None:
+            local_dir = (
+                settings.cache_dir
+                / "models"
+                / NemotronOcrModel._repo_id.replace("/", "--")
+            )
+
+        local_dir.mkdir(parents=True, exist_ok=True)
+        return download_hf_model(
+            repo_id=NemotronOcrModel._repo_id,
+            local_dir=local_dir,
+            force=force,
+            progress=progress,
+        )
+
     @staticmethod
     def _prediction_to_cell(
         prediction: NemotronOcrPrediction,
diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py
@@ -20,6 +20,7 @@
 from docling.models.stages.code_formula.code_formula_model import CodeFormulaModel
 from docling.models.stages.layout.layout_model import LayoutModel
 from docling.models.stages.ocr.easyocr_model import EasyOcrModel
+from docling.models.stages.ocr.nemotron_ocr_model import NemotronOcrModel
 from docling.models.stages.ocr.rapid_ocr_model import RapidOcrModel
 from docling.models.stages.picture_classifier.document_picture_classifier import (
     DocumentPictureClassifier,
@@ -55,6 +56,7 @@ def download_models(
     with_granite_chart_extraction: bool = False,
     with_rapidocr: bool = True,
     with_easyocr: bool = False,
+    with_nemotron_ocr: bool = False,
 ):
     if output_dir is None:
         output_dir = settings.cache_dir / "models"
@@ -189,4 +191,12 @@ def download_models(
             progress=progress,
         )
 
+    if with_nemotron_ocr:
+        _log.info("Downloading nemotron OCR model...")
+        NemotronOcrModel.download_models(
+            local_dir=output_dir / NemotronOcrModel._repo_id.replace("/", "--"),
+            force=force,
+            progress=progress,
+        )
+
     return output_dir