feat: Add NVIDIA nemotron-ocr as supported backend

cau-git · cau-git · commit f8cf4b002540 · 2026-03-16T13:16:02.000+01:00
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
@@ -50,7 +50,7 @@ jobs:
             pre-commit|${{ env.PY }}|
 
       - name: Install Python Dependencies
-        run: uv sync --frozen --all-extras
+        run: uv sync --frozen --extra easyocr --extra tesserocr --extra ocrmac --extra rapidocr --extra vlm --extra asr --extra xbrl --extra remote-serving
 
       - name: Check style
         run: |
@@ -92,7 +92,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Install Python Dependencies
-        run: uv sync --frozen --all-extras
+        run: uv sync --frozen --extra easyocr --extra tesserocr --extra ocrmac --extra rapidocr --extra vlm --extra asr --extra xbrl --extra remote-serving
 
       - name: Cache Models
         uses: actions/cache@v5
@@ -159,7 +159,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Install Python Dependencies
-        run: uv sync --frozen --all-extras
+        run: uv sync --frozen --extra easyocr --extra tesserocr --extra ocrmac --extra rapidocr --extra vlm --extra asr --extra xbrl --extra remote-serving
 
       - name: Cache Models
         uses: actions/cache@v5
@@ -231,7 +231,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Install Python Dependencies
-        run: uv sync --frozen --all-extras
+        run: uv sync --frozen --extra easyocr --extra tesserocr --extra ocrmac --extra rapidocr --extra vlm --extra asr --extra xbrl --extra remote-serving
 
       - name: Cache Models
         uses: actions/cache@v5
@@ -410,7 +410,7 @@ jobs:
           enable-cache: true
 
       - name: Install dependencies
-        run: uv sync --all-extras
+        run: uv sync --extra easyocr --extra tesserocr --extra ocrmac --extra rapidocr --extra vlm --extra asr --extra xbrl --extra remote-serving
 
       - name: Build package
         run: uv build
diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
@@ -29,7 +29,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
           enable-cache: true
       - name: Install dependencies
-        run: uv sync --all-extras
+        run: uv sync --extra easyocr --extra tesserocr --extra ocrmac --extra rapidocr --extra vlm --extra asr --extra xbrl --extra remote-serving
       - name: Build package
         run: uv build
       - name: Publish distribution 📦 to PyPI
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
@@ -158,7 +158,8 @@ class OcrOptions(BaseOptions):
     See Also:
         `OcrAutoOptions`: Automatic engine selection based on availability.
         `EasyOcrOptions`, `TesseractCliOcrOptions`, `TesseractOcrOptions`,
-        `RapidOcrOptions`, `OcrMacOptions`: Engine-specific configurations.
+        `RapidOcrOptions`, `OcrMacOptions`, `NemotronOcrOptions`: Engine-specific
+        configurations.
     """
 
     lang: Annotated[
@@ -322,6 +323,49 @@ class RapidOcrOptions(OcrOptions):
     )
 
 
+class NemotronOcrOptions(OcrOptions):
+    """Configuration for NVIDIA Nemotron OCR.
+
+    Notes:
+        Nemotron OCR does not expose runtime language selection through its public
+        API. The `lang` field is kept only for compatibility with the shared OCR
+        options interface.
+    """
+
+    kind: ClassVar[Literal["nemotron-ocr"]] = "nemotron-ocr"
+    lang: Annotated[
+        list[str],
+        Field(
+            description=(
+                "Reserved for interface compatibility. Nemotron OCR does not expose "
+                "runtime language selection through its public API."
+            )
+        ),
+    ] = []
+    model_dir: Annotated[
+        Optional[Path],
+        Field(
+            description=(
+                "Optional directory containing the Nemotron OCR checkpoint files "
+                "(`detector.pth`, `recognizer.pth`, `relational.pth`, `charset.txt`). "
+                "If omitted, the upstream package downloads them from Hugging Face."
+            )
+        ),
+    ] = None
+    merge_level: Annotated[
+        Literal["word", "sentence", "paragraph"],
+        Field(
+            description=(
+                "Granularity requested from Nemotron OCR. `word` is the default "
+                "because it maps most directly to Docling OCR cells."
+            )
+        ),
+    ] = "word"
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+
+
 class EasyOcrOptions(OcrOptions):
     """Configuration for EasyOCR engine."""
 
diff --git a/docling/models/plugins/defaults.py b/docling/models/plugins/defaults.py
@@ -1,6 +1,7 @@
 def ocr_engines():
     from docling.models.stages.ocr.auto_ocr_model import OcrAutoModel
     from docling.models.stages.ocr.easyocr_model import EasyOcrModel
+    from docling.models.stages.ocr.nemotron_ocr_model import NemotronOcrModel
     from docling.models.stages.ocr.ocr_mac_model import OcrMacModel
     from docling.models.stages.ocr.rapid_ocr_model import RapidOcrModel
     from docling.models.stages.ocr.tesseract_ocr_cli_model import TesseractOcrCliModel
@@ -10,6 +11,7 @@ def ocr_engines():
         "ocr_engines": [
             OcrAutoModel,
             EasyOcrModel,
+            NemotronOcrModel,
             OcrMacModel,
             RapidOcrModel,
             TesseractOcrModel,
diff --git a/docling/models/stages/ocr/nemotron_ocr_model.py b/docling/models/stages/ocr/nemotron_ocr_model.py
@@ -0,0 +1,208 @@
+import logging
+import platform
+import sys
+from collections.abc import Iterable, Sequence
+from pathlib import Path
+from typing import Optional, Type, TypedDict, cast
+
+import numpy
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell
+
+from docling.datamodel.accelerator_options import AcceleratorOptions
+from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    NemotronOcrOptions,
+    OcrOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class NemotronOcrPrediction(TypedDict):
+    """Exact prediction schema returned by `nemotron_ocr` 1.0.1."""
+
+    text: str
+    confidence: float
+    left: float
+    upper: float
+    right: float
+    lower: float
+
+
+class NemotronOcrModel(BaseOcrModel):
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        options: NemotronOcrOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        super().__init__(
+            enabled=enabled,
+            artifacts_path=artifacts_path,
+            options=options,
+            accelerator_options=accelerator_options,
+        )
+        self.options: NemotronOcrOptions
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+
+        if self.enabled:
+            self._validate_runtime(accelerator_options=accelerator_options)
+
+            try:
+                from nemotron_ocr.inference.pipeline import NemotronOCR
+            except ImportError as exc:
+                raise ImportError(
+                    "Nemotron OCR is not installed. Install the optional dependency "
+                    'via `pip install "docling[nemotron-ocr]"` on Linux x86_64 with '
+                    "Python 3.12 and CUDA 13.x."
+                ) from exc
+
+            model_dir = (
+                str(self.options.model_dir)
+                if self.options.model_dir is not None
+                else None
+            )
+            self.reader = NemotronOCR(model_dir=model_dir)
+
+    @staticmethod
+    def _fail_runtime(message: str) -> None:
+        _log.warning(message)
+        raise RuntimeError(message)
+
+    @classmethod
+    def _validate_runtime(cls, accelerator_options: AcceleratorOptions) -> None:
+        if sys.platform != "linux":
+            cls._fail_runtime("Nemotron OCR is only supported on Linux.")
+
+        if platform.machine() != "x86_64":
+            cls._fail_runtime("Nemotron OCR is only supported on x86_64 machines.")
+
+        if sys.version_info[:2] != (3, 12):
+            cls._fail_runtime("Nemotron OCR requires Python 3.12.")
+
+        requested_device = decide_device(accelerator_options.device)
+        if not requested_device.startswith("cuda"):
+            cls._fail_runtime(
+                "Nemotron OCR requires a CUDA accelerator. Set "
+                "`pipeline_options.accelerator_options.device` to CUDA or AUTO on a "
+                "CUDA-enabled machine."
+            )
+
+        import torch
+
+        if not torch.cuda.is_available():
+            cls._fail_runtime(
+                "Nemotron OCR requires CUDA at initialization time, but "
+                "`torch.cuda.is_available()` is false."
+            )
+
+        cuda_version = torch.version.cuda
+        if cuda_version is None or not cuda_version.startswith("13."):
+            cls._fail_runtime(
+                "Nemotron OCR requires CUDA 13.x, but the current PyTorch runtime "
+                f"reports CUDA {cuda_version!r}."
+            )
+
+    @staticmethod
+    def _prediction_to_cell(
+        prediction: NemotronOcrPrediction,
+        index: int,
+        ocr_rect: BoundingBox,
+        image_width: int,
+        image_height: int,
+        scale: int,
+    ) -> TextCell:
+        # `nemotron_ocr` 1.0.1 returns normalized `left/right` and an inverted
+        # pair `lower/upper`, where `lower` is the top Y and `upper` is the
+        # bottom Y in image coordinates.
+        left = (prediction["left"] * image_width) / scale + ocr_rect.l
+        top = (prediction["lower"] * image_height) / scale + ocr_rect.t
+        right = (prediction["right"] * image_width) / scale + ocr_rect.l
+        bottom = (prediction["upper"] * image_height) / scale + ocr_rect.t
+        text = prediction["text"]
+
+        return TextCell(
+            index=index,
+            text=text,
+            orig=text,
+            from_ocr=True,
+            confidence=float(prediction["confidence"]),
+            rect=BoundingRectangle.from_bounding_box(
+                BoundingBox(
+                    l=left,
+                    t=top,
+                    r=right,
+                    b=bottom,
+                    coord_origin=CoordOrigin.TOPLEFT,
+                )
+            ),
+        )
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "ocr"):
+                    ocr_rects = self.get_ocr_rects(page)
+
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        if ocr_rect.area() == 0:
+                            continue
+
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
+                        )
+                        image_width, image_height = high_res_image.size
+                        image_array = numpy.array(high_res_image)
+
+                        raw_predictions = cast(
+                            Sequence[NemotronOcrPrediction],
+                            self.reader(
+                                image_array,
+                                merge_level=self.options.merge_level,
+                                visualize=False,
+                            ),
+                        )
+
+                        del high_res_image
+                        del image_array
+
+                        cells = [
+                            self._prediction_to_cell(
+                                prediction=prediction,
+                                index=index,
+                                ocr_rect=ocr_rect,
+                                image_width=image_width,
+                                image_height=image_height,
+                                scale=self.scale,
+                            )
+                            for index, prediction in enumerate(raw_predictions)
+                        ]
+                        all_ocr_cells.extend(cells)
+
+                    self.post_process_cells(all_ocr_cells, page)
+
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
+
+                yield page
+
+    @classmethod
+    def get_options_type(cls) -> Type[OcrOptions]:
+        return NemotronOcrOptions
diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py
@@ -30,6 +30,7 @@
 
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
+    NemotronOcrOptions,
     PdfPipelineOptions,
     TableStructureOptions,
     TesseractCliOcrOptions,
@@ -49,8 +50,10 @@ def main():
     )
 
     # Any of the OCR options can be used: EasyOcrOptions, TesseractOcrOptions,
-    # TesseractCliOcrOptions, OcrMacOptions (macOS only), RapidOcrOptions
+    # TesseractCliOcrOptions, OcrMacOptions (macOS only), RapidOcrOptions,
+    # NemotronOcrOptions (Linux x86_64, Python 3.12, CUDA 13.x only)
     # ocr_options = EasyOcrOptions(force_full_page_ocr=True)
+    # ocr_options = NemotronOcrOptions(force_full_page_ocr=True)
     # ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
     # ocr_options = OcrMacOptions(force_full_page_ocr=True)
     # ocr_options = RapidOcrOptions(force_full_page_ocr=True)
diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md
@@ -53,6 +53,7 @@ The following table summarizes the extras available in the `docling` package. Th
 | `asr` | Installs dependencies for running the ASR pipeline. |
 | `vlm` | Installs dependencies for running the VLM pipeline. |
 | `easyocr` | Installs the [EasyOCR](https://github.com/JaidedAI/EasyOCR) OCR engine. |
+| `nemotron-ocr` | Installs NVIDIA Nemotron OCR. Supported only on Linux x86_64 with Python 3.12 and CUDA 13.x. |
 | `tesserocr` | Installs the Tesseract binding for using it as OCR engine. |
 | `ocrmac` | Installs the OcrMac OCR engine. |
 | `rapidocr` | Installs the [RapidOCR](https://github.com/RapidAI/RapidOCR) OCR engine with [onnxruntime](https://github.com/microsoft/onnxruntime/) backend. |
@@ -67,6 +68,7 @@ the following engines.
 | Engine | Installation | Usage |
 | ------ | ------------ | ----- |
 | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | `easyocr` extra or via `pip install easyocr`. | `EasyOcrOptions` |
+| [Nemotron OCR](https://huggingface.co/nvidia/nemotron-ocr-v1) | `nemotron-ocr` extra. Supported only on Linux x86_64 with Python 3.12 and CUDA 13.x. | `NemotronOcrOptions` |
 | Tesseract | System dependency. See description for Tesseract and Tesserocr below.  | `TesseractOcrOptions` |
 | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
 | OcrMac | System dependency. See description below. | `OcrMacOptions` |
@@ -141,5 +143,16 @@ doc_converter = DocumentConverter(
 To develop Docling features, bugfixes etc., install as follows from your local clone's root dir:
 
 ```bash
-uv sync --all-extras
+uv sync \
+  --extra asr \
+  --extra easyocr \
+  --extra ocrmac \
+  --extra rapidocr \
+  --extra remote-serving \
+  --extra tesserocr \
+  --extra vlm \
+  --extra xbrl
 ```
+
+The `nemotron-ocr` extra is intentionally excluded from the default development
+setup because it is only usable on Linux x86_64 with Python 3.12 and CUDA 13.x.
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/tests/test_options.py b/tests/test_options.py
diff --git a/uv.lock b/uv.lock