
Commit 035d78f

Merge pull request #1777 from roboflow/feature/add-doctr-prefedined-models
Add tests and fixes for DocTR model
2 parents a3bea9c + 6c45bc6 commit 035d78f

17 files changed: +477 −88 lines changed

inference_experimental/inference_exp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
     MultiLabelClassificationPrediction,
 )
 from inference_exp.models.base.depth_estimation import DepthEstimationModel
-from inference_exp.models.base.documents_parsing import DocumentParsingModel
+from inference_exp.models.base.documents_parsing import StructuredOCRModel
 from inference_exp.models.base.embeddings import TextImageEmbeddingModel
 from inference_exp.models.base.instance_segmentation import (
     InstanceDetections,

inference_experimental/inference_exp/models/auto_loaders/core.py

Lines changed: 2 additions & 2 deletions
@@ -57,7 +57,7 @@
     MultiLabelClassificationModel,
 )
 from inference_exp.models.base.depth_estimation import DepthEstimationModel
-from inference_exp.models.base.documents_parsing import DocumentParsingModel
+from inference_exp.models.base.documents_parsing import StructuredOCRModel
 from inference_exp.models.base.embeddings import TextImageEmbeddingModel
 from inference_exp.models.base.instance_segmentation import InstanceSegmentationModel
 from inference_exp.models.base.keypoints_detection import KeyPointsDetectionModel
@@ -79,7 +79,7 @@
     ClassificationModel,
     MultiLabelClassificationModel,
     DepthEstimationModel,
-    DocumentParsingModel,
+    StructuredOCRModel,
     TextImageEmbeddingModel,
     InstanceSegmentationModel,
     KeyPointsDetectionModel,

inference_experimental/inference_exp/models/auto_loaders/models_registry.py

Lines changed: 6 additions & 2 deletions
@@ -15,6 +15,7 @@
 CLASSIFICATION_TASK = "classification"
 MULTI_LABEL_CLASSIFICATION_TASK = "multi-label-classification"
 DEPTH_ESTIMATION_TASK = "depth-estimation"
+STRUCTURED_OCR_TASK = "structured-ocr"


 @dataclass(frozen=True)
@@ -356,8 +357,11 @@ class RegistryEntry:
     ),
     ("depth-anything-v2", DEPTH_ESTIMATION_TASK, BackendType.HF): LazyClass(
         module_name="inference_exp.models.depth_anything_v2.depth_anything_v2_hf",
-        class_name="DepthAnythingV2HF"
-    )
+        class_name="DepthAnythingV2HF",
+    ),
+    ("doctr", STRUCTURED_OCR_TASK, BackendType.TORCH): LazyClass(
+        module_name="inference_exp.models.doctr.doctr_torch", class_name="DocTR"
+    ),
 }

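The registry maps (model architecture, task, backend) keys to LazyClass entries, so DocTR's dependencies are only imported when a "doctr"/"structured-ocr" package is actually loaded. A minimal sketch of how such a lazy wrapper can resolve its target (the resolve() helper and importlib usage here are assumptions, not the exact inference_exp implementation):

import importlib
from dataclasses import dataclass


@dataclass(frozen=True)
class LazyClass:
    """Defers importing a model class until the auto-loader selects it."""

    module_name: str
    class_name: str

    def resolve(self) -> type:
        # The heavy import (torch, doctr, ...) happens here, not at registry
        # definition time.
        module = importlib.import_module(self.module_name)
        return getattr(module, self.class_name)


# Mirrors the entry added in this commit for ("doctr", "structured-ocr", TORCH):
doctr_entry = LazyClass(
    module_name="inference_exp.models.doctr.doctr_torch", class_name="DocTR"
)
# DocTR = doctr_entry.resolve()  # imports inference_exp.models.doctr.doctr_torch on demand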
inference_experimental/inference_exp/models/base/documents_parsing.py

Lines changed: 2 additions & 4 deletions
@@ -11,15 +11,13 @@
 )


-class DocumentParsingModel(
+class StructuredOCRModel(
     ABC, Generic[PreprocessedInputs, PreprocessingMetadata, RawPrediction]
 ):

     @classmethod
     @abstractmethod
-    def from_pretrained(
-        cls, model_name_or_path: str, **kwargs
-    ) -> "DocumentParsingModel":
+    def from_pretrained(cls, model_name_or_path: str, **kwargs) -> "StructuredOCRModel":
         pass

     @property

inference_experimental/inference_exp/models/depth_anything_v2/depth_anything_v2_hf.py

Lines changed: 1 addition & 3 deletions
@@ -28,9 +28,7 @@ def from_pretrained(
             local_files_only=local_files_only,
         ).to(device)
         processor = AutoImageProcessor.from_pretrained(
-            model_name_or_path,
-            local_files_only=local_files_only,
-            use_fast=True
+            model_name_or_path, local_files_only=local_files_only, use_fast=True
         )
         return cls(model=model, processor=processor, device=device)

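For reference, the processor/model pair touched here follows the standard transformers depth-estimation flow. A hedged usage sketch (the checkpoint id and image handling are illustrative assumptions, not taken from this diff):

import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

# Example checkpoint; the actual model package resolved by inference_exp may differ.
checkpoint = "depth-anything/Depth-Anything-V2-Small-hf"
processor = AutoImageProcessor.from_pretrained(checkpoint, use_fast=True)
model = AutoModelForDepthEstimation.from_pretrained(checkpoint)

image = Image.open("dog.jpeg")
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
depth_map = outputs.predicted_depth  # (batch, height, width) relative depth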
inference_experimental/inference_exp/models/doctr/doctr_torch.py

Lines changed: 58 additions & 48 deletions
@@ -1,88 +1,98 @@
-import os
 from dataclasses import dataclass
 from typing import Callable, List, Optional, Tuple, Union

 import numpy as np
 import torch
 from doctr.io import Document
-from doctr.models import ocr_predictor
+from doctr.models import detection_predictor, ocr_predictor, recognition_predictor
 from inference_exp import Detections
 from inference_exp.configuration import DEFAULT_DEVICE
 from inference_exp.entities import ColorFormat, ImageDimensions
 from inference_exp.errors import CorruptedModelPackageError, ModelRuntimeError
-from inference_exp.models.base.documents_parsing import DocumentParsingModel
+from inference_exp.models.base.documents_parsing import StructuredOCRModel
 from inference_exp.models.common.model_packages import get_model_package_contents
 from inference_exp.utils.file_system import read_json

-WEIGHTS_NAMES_MAPPING = {
-    "db_resnet50": "db_resnet50-79bd7d70.pt",
-    "db_resnet34": "db_resnet34-cb6aed9e.pt",
-    "db_mobilenet_v3_large": "db_mobilenet_v3_large-21748dd0.pt",
-    "crnn_vgg16_bn": "crnn_vgg16_bn-9762b0b0.pt",
-    "crnn_mobilenet_v3_small": "crnn_mobilenet_v3_small_pt-3b919a02.pt",
-    "crnn_mobilenet_v3_large": "crnn_mobilenet_v3_large_pt-f5259ec2.pt",
+SUPPORTED_DETECTION_MODELS = {
+    "fast_base",
+    "fast_small",
+    "fast_tiny",
+    "db_resnet50",
+    "db_resnet34",
+    "db_mobilenet_v3_large",
+    "linknet_resnet18",
+    "linknet_resnet34",
+    "linknet_resnet50",
+}
+SUPPORTED_RECOGNITION_MODELS = {
+    "crnn_vgg16_bn",
+    "crnn_mobilenet_v3_small",
+    "crnn_mobilenet_v3_large",
+    "master",
+    "sar_resnet31",
+    "vitstr_small",
+    "vitstr_base",
+    "parseq",
 }


-class DocTR(DocumentParsingModel[List[np.ndarray], ImageDimensions, Document]):
+class DocTR(StructuredOCRModel[List[np.ndarray], ImageDimensions, Document]):

     @classmethod
     def from_pretrained(
         cls,
         model_name_or_path: str,
         device: torch.device = DEFAULT_DEVICE,
+        assume_straight_pages: bool = True,
+        preserve_aspect_ratio: bool = True,
+        detection_max_batch_size: int = 2,
+        recognition_max_batch_size: int = 128,
         **kwargs,
-    ) -> "DocumentParsingModel":
-        os.environ["DOCTR_CACHE_DIR"] = model_name_or_path
+    ) -> "StructuredOCRModel":
         model_package_content = get_model_package_contents(
             model_package_dir=model_name_or_path,
-            elements=["doctr_det", "doctr_rec", "config.json"],
+            elements=["detection_weights.pt", "recognition_weights.pt", "config.json"],
         )
         config = parse_model_config(config_path=model_package_content["config.json"])
-        os.makedirs(f"{model_name_or_path}/doctr_det/models/", exist_ok=True)
-        os.makedirs(f"{model_name_or_path}/doctr_rec/models/", exist_ok=True)
-        det_model_source_path = os.path.join(
-            model_name_or_path, "doctr_det", config.det_model, "model.pt"
-        )
-        rec_model_source_path = os.path.join(
-            model_name_or_path, "doctr_rec", config.rec_model, "model.pt"
-        )
-        if not os.path.exists(det_model_source_path):
-            raise CorruptedModelPackageError(
-                message="Could not initialize DocTR model - could not find detection model weights.",
-                help_url="https://todo",
-            )
-        if not os.path.exists(rec_model_source_path):
-            raise CorruptedModelPackageError(
-                message="Could not initialize DocTR model - could not find recognition model weights.",
-                help_url="https://todo",
-            )
-        if config.det_model not in WEIGHTS_NAMES_MAPPING:
+        if config.det_model not in SUPPORTED_DETECTION_MODELS:
             raise CorruptedModelPackageError(
                 message=f"{config.det_model} model denoted in configuration not supported as DocTR detection model.",
                 help_url="https://todo",
             )
-        if config.rec_model not in WEIGHTS_NAMES_MAPPING:
+        if config.rec_model not in SUPPORTED_RECOGNITION_MODELS:
             raise CorruptedModelPackageError(
-                message=f"{config.det_model} model denoted in configuration not supported as DocTR recognition model.",
+                message=f"{config.rec_model} model denoted in configuration not supported as DocTR recognition model.",
                 help_url="https://todo",
             )
-        det_model_target_path = os.path.join(
-            model_name_or_path, "models", WEIGHTS_NAMES_MAPPING[config.det_model]
+        det_model = detection_predictor(
+            arch=config.det_model,
+            pretrained=False,
+            assume_straight_pages=assume_straight_pages,
+            preserve_aspect_ratio=preserve_aspect_ratio,
+            batch_size=detection_max_batch_size,
+        )
+        det_model.model.to(device)
+        detector_weights = torch.load(
+            model_package_content["detection_weights.pt"],
+            weights_only=True,
+            map_location=device,
+        )
+        det_model.model.load_state_dict(detector_weights)
+        rec_model = recognition_predictor(
+            arch=config.rec_model,
+            pretrained=False,
+            batch_size=recognition_max_batch_size,
         )
-        rec_model_target_path = os.path.join(
-            model_name_or_path, "models", WEIGHTS_NAMES_MAPPING[config.rec_model]
+        rec_model.model.to(device)
+        rec_weights = torch.load(
+            model_package_content["recognition_weights.pt"],
+            weights_only=True,
+            map_location=device,
         )
-        if os.path.exists(det_model_target_path):
-            os.remove(det_model_target_path)
-        os.symlink(det_model_source_path, det_model_target_path)
-        if os.path.exists(rec_model_target_path):
-            os.remove(rec_model_target_path)
-        os.symlink(rec_model_source_path, rec_model_target_path)
+        rec_model.model.load_state_dict(rec_weights)
         model = ocr_predictor(
-            det_arch=config.det_model,
-            reco_arch=config.rec_model,
-            pretrained=True,
+            det_arch=det_model.model,
+            reco_arch=rec_model.model,
         ).to(device=device)
         return cls(model=model, device=device)

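With this change the model package ships plain state dicts (detection_weights.pt, recognition_weights.pt) that are loaded into freshly built docTR predictors, instead of symlinking files into docTR's cache directory. A standalone sketch of the same loading pattern (architecture names, file paths, and the dummy image are illustrative assumptions; the docTR calls mirror the diff above):

import numpy as np
import torch
from doctr.models import detection_predictor, ocr_predictor, recognition_predictor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build un-pretrained predictors for the architectures named in config.json.
det = detection_predictor(arch="db_resnet50", pretrained=False, batch_size=2)
rec = recognition_predictor(arch="crnn_vgg16_bn", pretrained=False, batch_size=128)

# Load the fine-tuned weights shipped in the model package.
det.model.load_state_dict(
    torch.load("detection_weights.pt", weights_only=True, map_location=device)
)
rec.model.load_state_dict(
    torch.load("recognition_weights.pt", weights_only=True, map_location=device)
)

# Compose an end-to-end OCR predictor from the two loaded modules.
ocr = ocr_predictor(det_arch=det.model, reco_arch=rec.model).to(device)

# docTR predictors take a list of HWC uint8 numpy images and return a Document.
page = np.zeros((256, 256, 3), dtype=np.uint8)
document = ocr([page])
print(document.render())  # plain-text rendering of the parsed page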
inference_experimental/inference_exp/models/moondream2/moondream2_hf.py

Lines changed: 2 additions & 2 deletions
@@ -37,8 +37,8 @@ def from_pretrained(
         if torch.mps.is_available():
             raise ModelRuntimeError(
                 message=f"This model cannot run on Apple device with MPS unit - original implementation contains bug "
-                    f"preventing proper allocation of tensors which causes runtime error. Run this model on the "
-                    f"machine with Nvidia GPU or x86 CPU.",
+                f"preventing proper allocation of tensors which causes runtime error. Run this model on the "
+                f"machine with Nvidia GPU or x86 CPU.",
                 help_url="https://todo",
             )
         model_package_content = get_model_package_contents(

inference_experimental/inference_exp/models/rfdetr/rfdetr_instance_segmentation_pytorch.py

Lines changed: 3 additions & 1 deletion
@@ -124,7 +124,9 @@ def from_pretrained(
         model_config = CONFIG_FOR_MODEL_TYPE[model_type](device=device)
         checkpoint_num_classes = weights_dict["class_embed.bias"].shape[0]
         model_config.num_classes = checkpoint_num_classes - 1
-        model_config.resolution = inference_config.network_input.training_input_size.height
+        model_config.resolution = (
+            inference_config.network_input.training_input_size.height
+        )
         model = build_model(config=model_config)
         model.load_state_dict(weights_dict)
         model = model.eval().to(device)

inference_experimental/inference_exp/models/rfdetr/rfdetr_object_detection_pytorch.py

Lines changed: 3 additions & 1 deletion
@@ -130,7 +130,9 @@ def from_pretrained(
         model_config = CONFIG_FOR_MODEL_TYPE[model_type](device=device)
         checkpoint_num_classes = weights_dict["class_embed.bias"].shape[0]
         model_config.num_classes = checkpoint_num_classes - 1
-        model_config.resolution = inference_config.network_input.training_input_size.height
+        model_config.resolution = (
+            inference_config.network_input.training_input_size.height
+        )
         model = build_model(config=model_config)
         model.load_state_dict(weights_dict)
         model = model.eval().to(device)

inference_experimental/tests/e2e_platform_tests/conftest.py

Lines changed: 11 additions & 3 deletions
@@ -6,13 +6,13 @@
 import requests
 from filelock import FileLock

-ASSETS_DIR = os.path.abspath(
-    os.path.join(os.path.dirname(__file__), "assets")
-)
+ASSETS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "assets"))
 DOG_IMAGE_PATH = os.path.join(ASSETS_DIR, "images", "dog.jpeg")
 DOG_IMAGE_URL = (
     "https://storage.googleapis.com/roboflow-tests-assets/test-images/dog.jpeg"
 )
+OCR_TEST_IMAGE_PATH = os.path.join(ASSETS_DIR, "ocr_test_image.png")
+OCR_TEST_IMAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/test-images/ocr_test_image.png"


 @pytest.fixture()
@@ -28,6 +28,14 @@ def dog_image_numpy() -> np.ndarray:
     return image


+@pytest.fixture(scope="function")
+def ocr_test_image_numpy() -> np.ndarray:
+    _download_if_not_exists(file_path=OCR_TEST_IMAGE_PATH, url=OCR_TEST_IMAGE_URL)
+    image = cv2.imread(OCR_TEST_IMAGE_PATH)
+    assert image is not None, "Could not load OCR test image"
+    return image
+
+
 def _download_if_not_exists(file_path: str, url: str, lock_timeout: int = 180) -> None:
     os.makedirs(os.path.dirname(file_path), exist_ok=True)
     lock_path = f"{file_path}.lock"

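The new ocr_test_image_numpy fixture relies on _download_if_not_exists, whose body is cut off in this hunk. A plausible sketch of such a helper, assuming the FileLock guard and requests download implied by the imports above (not the verbatim implementation):

import os

import requests
from filelock import FileLock


def _download_if_not_exists(file_path: str, url: str, lock_timeout: int = 180) -> None:
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    lock_path = f"{file_path}.lock"
    # The file lock keeps parallel pytest workers from downloading the same
    # asset twice or reading a partially written file.
    with FileLock(lock_path, timeout=lock_timeout):
        if os.path.exists(file_path):
            return
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(file_path, "wb") as f:
            f.write(response.content)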