|
18 | 18 | ) |
19 | 19 | from docling.datamodel.settings import settings |
20 | 20 | from docling.models.base_ocr_model import BaseOcrModel |
| 21 | +from docling.models.utils.hf_model_download import download_hf_model |
21 | 22 | from docling.utils.accelerator_utils import decide_device |
22 | 23 | from docling.utils.profiling import TimeRecorder |
23 | 24 |
|
@@ -53,6 +54,8 @@ class NemotronOcrPrediction(TypedDict): |
53 | 54 |
|
54 | 55 |
|
55 | 56 | class NemotronOcrModel(BaseOcrModel): |
| 57 | + _repo_id = "nvidia/nemotron-ocr-v1" |
| 58 | + |
56 | 59 | def __init__( |
57 | 60 | self, |
58 | 61 | enabled: bool, |
@@ -81,12 +84,10 @@ def __init__( |
81 | 84 | "Python 3.12 and CUDA 13.x." |
82 | 85 | ) from exc |
83 | 86 |
|
84 | | - model_dir = ( |
85 | | - str(self.options.model_dir) |
86 | | - if self.options.model_dir is not None |
87 | | - else None |
| 87 | + model_dir = self._resolve_model_dir(artifacts_path=artifacts_path) |
| 88 | + self.reader = NemotronOCR( |
| 89 | + model_dir=None if model_dir is None else str(model_dir) |
88 | 90 | ) |
89 | | - self.reader = NemotronOCR(model_dir=model_dir) |
90 | 91 | # Install the storage workaround only at the upstream grid-sampler |
91 | 92 | # boundary, keeping the rest of the Nemotron integration unchanged. |
92 | 93 | self.reader.grid_sampler = _GridSamplerStorageWorkaround( |
@@ -132,6 +133,51 @@ def _validate_runtime(cls, accelerator_options: AcceleratorOptions) -> None: |
132 | 133 | f"reports CUDA {cuda_version!r}." |
133 | 134 | ) |
134 | 135 |
|
@classmethod
def _resolve_model_dir(cls, artifacts_path: Optional[Path]) -> Optional[Path]:
    """Locate the Nemotron OCR checkpoints folder below ``artifacts_path``.

    Args:
        artifacts_path: Root folder holding pre-downloaded model artifacts,
            or ``None`` when no local artifacts folder is configured.

    Returns:
        ``None`` when ``artifacts_path`` is ``None``; otherwise the
        ``checkpoints`` sub-folder of the repo folder (``_repo_id`` with
        ``/`` replaced by ``--``) inside ``artifacts_path``.

    Raises:
        FileNotFoundError: If the repo folder does not exist below
            ``artifacts_path``. The message lists the sub-directories that
            are present to help diagnose a misconfigured path.
    """
    if artifacts_path is None:
        return None

    repo_cache_folder = cls._repo_id.replace("/", "--")
    checkpoints_dir = artifacts_path / repo_cache_folder / "checkpoints"

    # Only the repo folder itself is probed here; the ``checkpoints``
    # sub-folder is returned without a separate existence check.
    if (artifacts_path / repo_cache_folder).exists():
        return checkpoints_dir

    # Guard with is_dir() rather than exists(): iterdir() on a regular file
    # raises NotADirectoryError, which would hide the explanatory
    # FileNotFoundError raised below.
    available_dirs = []
    if artifacts_path.is_dir():
        available_dirs = sorted(
            entry.name for entry in artifacts_path.iterdir() if entry.is_dir()
        )

    raise FileNotFoundError(
        "Nemotron OCR artifacts not found in artifacts_path.\n"
        f"Expected location: {checkpoints_dir}\n"
        f"Available directories in {artifacts_path}: {available_dirs}\n"
        "Use `docling-tools models download nemotron_ocr` to pre-download "
        "the checkpoints or unset artifacts_path to allow the upstream "
        "package to download them."
    )
| 159 | + |
@staticmethod
def download_models(
    local_dir: Optional[Path] = None,
    force: bool = False,
    progress: bool = False,
) -> Path:
    """Fetch the Nemotron OCR repo into a local folder via ``download_hf_model``.

    Args:
        local_dir: Destination folder. When ``None``, defaults to
            ``settings.cache_dir / "models" / <repo id with "/" -> "--">``.
        force: Forwarded unchanged to ``download_hf_model``.
        progress: Forwarded unchanged to ``download_hf_model``.

    Returns:
        The value returned by ``download_hf_model`` (annotated as ``Path``).
    """
    repo_id = NemotronOcrModel._repo_id

    target_dir = local_dir
    if target_dir is None:
        # Default location mirrors the repo id with "/" flattened to "--".
        target_dir = settings.cache_dir / "models" / repo_id.replace("/", "--")

    # Make sure the destination exists before handing it to the downloader.
    target_dir.mkdir(parents=True, exist_ok=True)

    return download_hf_model(
        repo_id=repo_id,
        local_dir=target_dir,
        force=force,
        progress=progress,
    )
| 180 | + |
135 | 181 | @staticmethod |
136 | 182 | def _prediction_to_cell( |
137 | 183 | prediction: NemotronOcrPrediction, |
|
0 commit comments