Added ONNX model for detectron2 (#103)

benjats07 · qued · web-flow · commit ce402d843247 · 2023-05-10T17:32:07.000-05:00
* Added ONNX model for detectron2

* Update tests

* Add comments

* Make original detectron2 and onnx separate models

* Deletes detectron2 dependency from Makefile

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
Co-authored-by: Alan Bertl &lt;alan@unstructured.io&gt;
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.4.5-dev0
+
+* Added ONNX version of Detectron2
+
 ## 0.4.4
 
 * Fixed patches not being a package.
diff --git a/test_unstructured_inference/models/test_detectron2onnx.py b/test_unstructured_inference/models/test_detectron2onnx.py
@@ -0,0 +1,71 @@
+import os
+from unittest.mock import patch
+
+import pytest
+from PIL import Image
+
+import unstructured_inference.models.detectron2onnx as detectron2
+import unstructured_inference.models.base as models
+
+
+class MockDetectron2ONNXLayoutModel:
+    def __init__(self, *args, **kwargs):
+        self.args = args
+        self.kwargs = kwargs
+
+    def run(self, *args):
+        return ([(1, 2, 3, 4)], [0], [0.818], [(4, 5)])
+
+    def get_inputs(self):
+        class input_thing:
+            name = "Bernard"
+
+        return [input_thing()]
+
+
+def test_load_default_model():
+    with patch.object(
+        detectron2.onnxruntime, "InferenceSession", new=MockDetectron2ONNXLayoutModel
+    ):
+        model = models.get_model("detectron2_onnx")
+
+    assert isinstance(model.model, MockDetectron2ONNXLayoutModel)
+
+
+@pytest.mark.parametrize(("model_path", "label_map"), [("asdf", "diufs"), ("dfaw", "hfhfhfh")])
+def test_load_model(model_path, label_map):
+    with patch.object(detectron2.onnxruntime, "InferenceSession", return_value=True):
+        model = detectron2.UnstructuredDetectronONNXModel()
+        model.initialize(model_path=model_path, label_map=label_map)
+        args, _ = detectron2.onnxruntime.InferenceSession.call_args
+        assert args == (model_path,)
+    assert label_map == model.label_map
+
+
+def test_unstructured_detectron_model():
+    model = detectron2.UnstructuredDetectronONNXModel()
+    model.model = 1
+    with patch.object(detectron2.UnstructuredDetectronONNXModel, "predict", return_value=[]):
+        result = model(None)
+    assert isinstance(result, list)
+    assert len(result) == 0
+
+
+def test_inference():
+    with patch.object(
+        detectron2.onnxruntime, "InferenceSession", return_value=MockDetectron2ONNXLayoutModel()
+    ):
+        model = detectron2.UnstructuredDetectronONNXModel()
+        model.initialize(model_path="test_path", label_map={0: "test_class"})
+        assert isinstance(model.model, MockDetectron2ONNXLayoutModel)
+        with open(os.path.join("sample-docs", "receipt-sample.jpg"), mode="rb") as fp:
+            image = Image.open(fp)
+            image.load()
+        elements = model(image)
+        assert len(elements) == 1
+        element = elements[0]
+        (x1, y1), _, (x2, y2), _ = element.coordinates
+        # NOTE(alan): The bbox coordinates get resized, so check their relative proportions
+        assert x2 / x1 == pytest.approx(3.0)  # x1 == 1, x2 == 3 before scaling
+        assert y2 / y1 == pytest.approx(2.0)  # y1 == 2, y2 == 4 before scaling
+        assert element.type == "test_class"
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.4"  # pragma: no cover
+__version__ = "0.4.5-dev0"  # pragma: no cover
diff --git a/unstructured_inference/models/base.py b/unstructured_inference/models/base.py
@@ -5,6 +5,10 @@
     MODEL_TYPES as DETECTRON2_MODEL_TYPES,
     UnstructuredDetectronModel,
 )
+from unstructured_inference.models.detectron2onnx import (
+    MODEL_TYPES as DETECTRON2_ONNX_MODEL_TYPES,
+    UnstructuredDetectronONNXModel,
+)
 from unstructured_inference.models.yolox import (
     MODEL_TYPES as YOLOX_MODEL_TYPES,
     UnstructuredYoloXModel,
@@ -18,6 +22,9 @@ def get_model(model_name: Optional[str] = None) -> UnstructuredModel:
     if model_name in DETECTRON2_MODEL_TYPES:
         model: UnstructuredModel = UnstructuredDetectronModel()
         model.initialize(**DETECTRON2_MODEL_TYPES[model_name])
+    elif model_name in DETECTRON2_ONNX_MODEL_TYPES:
+        model = UnstructuredDetectronONNXModel()
+        model.initialize(**DETECTRON2_ONNX_MODEL_TYPES[model_name])
     elif model_name in YOLOX_MODEL_TYPES:
         model = UnstructuredYoloXModel()
         model.initialize(**YOLOX_MODEL_TYPES[model_name])
diff --git a/unstructured_inference/models/detectron2onnx.py b/unstructured_inference/models/detectron2onnx.py
@@ -0,0 +1,118 @@
+from typing import Final, Optional, Union, Dict, List
+from pathlib import Path
+
+from PIL import Image
+from huggingface_hub import hf_hub_download
+
+from unstructured_inference.logger import logger
+from unstructured_inference.inference.layoutelement import LayoutElement
+from unstructured_inference.models.unstructuredmodel import UnstructuredModel
+from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
+import onnxruntime
+import numpy as np
+import cv2
+
+
+DEFAULT_LABEL_MAP: Final[Dict[int, str]] = {
+    0: "Text",
+    1: "Title",
+    2: "List",
+    3: "Table",
+    4: "Figure",
+}
+
+
+# NOTE(alan): Entries are implemented as LazyDicts so that models aren't downloaded until they are
+# needed.
+MODEL_TYPES: Dict[Optional[str], LazyDict] = {
+    "detectron2_onnx": LazyDict(
+        model_path=LazyEvaluateInfo(
+            hf_hub_download,
+            "unstructuredio/detectron2_faster_rcnn_R_50_FPN_3x",
+            "model.onnx",
+        ),
+        label_map=DEFAULT_LABEL_MAP,
+        confidence_threshold=0.8,
+    ),
+}
+
+
+class UnstructuredDetectronONNXModel(UnstructuredModel):
+    """Unstructured model wrapper for detectron2 ONNX model."""
+
+    # The model was trained and exported with this shape
+    required_w = 800
+    required_h = 1035
+
+    def predict(self, image: Image.Image) -> List[LayoutElement]:
+        """Makes a prediction using detectron2 model."""
+        super().predict(image)
+
+        prepared_input = self.preprocess(image)
+        bboxes, labels, confidence_scores, _ = self.model.run(None, prepared_input)
+        input_w, input_h = image.size
+        regions = self.postprocess(bboxes, labels, confidence_scores, input_w, input_h)
+
+        return regions
+
+    def initialize(
+        self,
+        model_path: Union[str, Path],
+        label_map: Dict[int, str],
+        confidence_threshold: Optional[float] = None,
+    ):
+        """Loads the detectron2 model using the specified parameters"""
+        logger.info("Loading the Detectron2 layout model ...")
+        self.model = onnxruntime.InferenceSession(model_path, providers=["CPUExecutionProvider"])
+        self.label_map = label_map
+        if confidence_threshold is None:
+            confidence_threshold = 0.5
+        self.confidence_threshold = confidence_threshold
+
+    def preprocess(self, image: Image.Image) -> Dict[str, np.ndarray]:
+        """Process input image into required format for ingestion into the Detectron2 ONNX binary.
+        This involves resizing to a fixed shape and converting to a specific numpy format."""
+        # TODO (benjamin): check other shapes for inference
+        img = np.array(image)
+        # TODO (benjamin): We should use models.get_model() but currently returns Detectron model
+        session = self.model
+        # The ONNX model expects a channels-first input of shape
+        # [3, 1035, 800], i.e. (channels, required_h, required_w)
+        img = cv2.resize(
+            img,
+            (self.required_w, self.required_h),
+            interpolation=cv2.INTER_LINEAR,
+        ).astype(np.float32)
+        img = img.transpose(2, 0, 1)
+        ort_inputs = {session.get_inputs()[0].name: img}
+        return ort_inputs
+
+    def postprocess(
+        self,
+        bboxes: np.ndarray,
+        labels: np.ndarray,
+        confidence_scores: np.ndarray,
+        input_w: float,
+        input_h: float,
+    ) -> List[LayoutElement]:
+        """Process output into Unstructured class. Bounding box coordinates are converted to
+        original image resolution."""
+        regions = []
+        width_conversion = input_w / self.required_w
+        height_conversion = input_h / self.required_h
+        for (x1, y1, x2, y2), label, conf in zip(bboxes, labels, confidence_scores):
+            detected_class = self.label_map[int(label)]
+            if conf >= self.confidence_threshold:
+                region = LayoutElement(
+                    x1 * width_conversion,
+                    y1 * height_conversion,
+                    x2 * width_conversion,
+                    y2 * height_conversion,
+                    text=None,
+                    type=detected_class,
+                )
+
+                regions.append(region)
+
+        regions.sort(key=lambda element: element.y1)
+        return regions

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.4.4" # pragma: no cover`
	`1`	`+__version__ = "0.4.5-dev0" # pragma: no cover`