
Commit 5a8237f

enhancement: make detectron2_onnx default (#108)
Makes the ONNX version of detectron2 the default model. This means users can use it without the pain of installing detectron2. I also cleaned up a few things.
1 parent 1c9d9c7 commit 5a8237f
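In practice, the new default means layout detection runs without a Detectron2 source build. A minimal usage sketch, assuming the sample document shipped in the repo's `sample-docs` directory and an installed `onnxruntime`:

```python
from unstructured_inference.inference.layout import DocumentLayout

# With "detectron2_onnx" as the default model, no `model` argument (and no
# detectron2 source build) is needed; the ONNX weights are fetched on first use.
layout = DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf")

# The layout renders as the extracted text (this is how the tests inspect it).
print(layout)
```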

12 files changed (+78, -20 lines)

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ jobs:
 python${{ env.PYTHON_VERSION }} -m venv .venv
 source .venv/bin/activate
 make install-ci
-
+
 lint:
 runs-on: ubuntu-latest
 needs: setup

CHANGELOG.md

Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
-## 0.4.5-dev1
+## 0.4.5
 
 * Preserve image format in PIL.Image.Image when loading
-* Added ONNX version of Detectron2
+* Added ONNX version of Detectron2 and made it the default model
 
 ## 0.4.4

README.md

Lines changed: 29 additions & 2 deletions

@@ -21,7 +21,7 @@ Run `pip install unstructured-inference`.
 
 ### Detectron2
 
-[Detectron2](https://github.com/facebookresearch/detectron2) is required for most inference tasks
+[Detectron2](https://github.com/facebookresearch/detectron2) is required for using models from the [layoutparser model zoo](#using-models-from-the-layoutparser-model-zoo)
 but is not automatically installed with this package.
 For MacOS and Linux, build from source with:
 ```shell
@@ -66,6 +66,33 @@ Once the model has detected the layout and OCR'd the document, the text extracte
 page of the sample document will be displayed.
 You can convert a given element to a `dict` by running the `.to_dict()` method.
 
+## Models
+
+The inference pipeline operates by finding text elements in a document page using a detection model, then extracting the contents of the elements using direct extraction (if available), OCR, and optionally table inference models.
+
+We offer several detection models, including [Detectron2](https://github.com/facebookresearch/detectron2) and [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX).
+
+### Using a non-default model
+
+When doing inference, an alternate model can be used by passing the model object to the ingestion method via the `model` parameter. The `get_model` function can be used to construct one of our out-of-the-box models from a keyword, e.g.:
+```python
+from unstructured_inference.models.base import get_model
+from unstructured_inference.inference.layout import DocumentLayout
+
+model = get_model("yolox")
+layout = DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf", model=model)
+```
+
+### Using models from the layoutparser model zoo
+
+The `UnstructuredDetectronModel` class in `unstructured_inference.models.detectron2` uses the `faster_rcnn_R_50_FPN_3x` model pretrained on DocLayNet, but by using different construction parameters, any model in the `layoutparser` [model zoo](https://layout-parser.readthedocs.io/en/latest/notes/modelzoo.html) can be used. `UnstructuredDetectronModel` is a light wrapper around the `layoutparser` `Detectron2LayoutModel` object, and accepts the same arguments. See the [layoutparser documentation](https://layout-parser.readthedocs.io/en/latest/api_doc/models.html#layoutparser.models.Detectron2LayoutModel) for details.
+
+### Using your own model
+
+Any detection model can be used in the `unstructured_inference` pipeline by wrapping the model in the `UnstructuredObjectDetectionModel` class. To integrate with the `DocumentLayout` class, a subclass of `UnstructuredObjectDetectionModel` must have a `predict` method that accepts a `PIL.Image.Image` and returns a list of `LayoutElement`s, and an `initialize` method, which loads the model and prepares it for inference.
+
+## API
+
 To build the Docker container, run `make docker-build`. Note that Apple hardware with an M1 chip
 has trouble building `Detectron2` on Docker and for best results you should build it on Linux. To
 run the API locally, use `make start-app-local`. You can stop the API with `make stop-app-local`.
@@ -90,7 +117,7 @@ start the API with hot reloading. The API will run at `http://localhost:8000`.
 
 View the swagger documentation at `http://localhost:5000/docs`.
 
-## YoloX model
+### YoloX model
 
 For using the YoloX model the endpoints are:
 ```
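The "Using your own model" section added to the README above only states the required interface; the following is a minimal sketch of such a wrapper. The placeholder detector and the `LayoutElement` constructor arguments shown are assumptions for illustration, not part of the repo:

```python
from typing import List

from PIL.Image import Image

from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel


class MyDetectionModel(UnstructuredObjectDetectionModel):
    """Illustrative wrapper satisfying the interface described in the README."""

    def initialize(self, weights_path: str = ""):
        # Load and store whatever your detector needs; a placeholder here.
        self.model = None  # e.g. an onnxruntime session or a torch module

    def predict(self, x: Image) -> List[LayoutElement]:
        # Run the detector on the page image and convert each raw detection into
        # a LayoutElement. The corner coordinates and keyword names passed to
        # LayoutElement below are assumptions about its constructor.
        detections: List[dict] = []  # replace with real inference on `x`
        return [
            LayoutElement(d["x1"], d["y1"], d["x2"], d["y2"], text=None, type=d["label"])
            for d in detections
        ]
```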

setup.py

Lines changed: 5 additions & 4 deletions

@@ -18,6 +18,7 @@
 limitations under the License.
 """
 from setuptools import setup, find_packages
+from typing import List
 
 from unstructured_inference.__version__ import __version__
 
@@ -27,11 +28,11 @@ def load_requirements(file_list=None):
         file_list = ["requirements/base.in"]
     if isinstance(file_list, str):
         file_list = [file_list]
-    requirements = []
+    requirements: List[str] = []
     for file in file_list:
-        if not file.startswith("#"):
-            with open(file, encoding="utf-8") as f:
-                requirements.extend(f.readlines())
+        with open(file, encoding="utf-8") as f:
+            requirements.extend(f.readlines())
+    requirements = [req for req in requirements if not req.startswith("#")]
     return requirements
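Assembled from the hunks above, the revised helper reads roughly as follows (a reconstruction for readability; the license header and the surrounding `setup()` call are omitted):

```python
from typing import List


def load_requirements(file_list=None):
    if file_list is None:
        file_list = ["requirements/base.in"]
    if isinstance(file_list, str):
        file_list = [file_list]
    requirements: List[str] = []
    for file in file_list:
        with open(file, encoding="utf-8") as f:
            requirements.extend(f.readlines())
    # Comment lines are now filtered per requirement line; the old code checked
    # the *file name* for a leading "#", so comments were never actually skipped.
    requirements = [req for req in requirements if not req.startswith("#")]
    return requirements
```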

test_unstructured_inference/inference/test_layout.py

Lines changed: 2 additions & 1 deletion

@@ -119,7 +119,8 @@ def test_read_pdf(monkeypatch, mock_page_layout):
     monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
 
     with patch.object(layout, "load_pdf", return_value=(layouts, images)):
-        doc = layout.DocumentLayout.from_file("fake-file.pdf")
+        model = layout.get_model("detectron2_lp")
+        doc = layout.DocumentLayout.from_file("fake-file.pdf", model=model)
 
     assert str(doc).startswith("A Catchy Title")
     assert str(doc).count("A Catchy Title") == 2  # Once for each page

test_unstructured_inference/models/test_detectron2.py

Lines changed: 2 additions & 2 deletions

@@ -18,15 +18,15 @@ def test_load_default_model(monkeypatch):
     monkeypatch.setattr(detectron2, "Detectron2LayoutModel", MockDetectron2LayoutModel)
 
     with patch.object(detectron2, "is_detectron2_available", return_value=True):
-        model = models.get_model()
+        model = models.get_model("detectron2_lp")
 
     assert isinstance(model.model, MockDetectron2LayoutModel)
 
 
 def test_load_default_model_raises_when_not_available():
     with patch.object(detectron2, "is_detectron2_available", return_value=False):
         with pytest.raises(ImportError):
-            models.get_model()
+            models.get_model("detectron2_lp")
 
 
 @pytest.mark.parametrize("config_path, model_path", [("asdf", "diufs"), ("dfaw", "hfhfhfh")])
unstructured_inference/__version__.py

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-__version__ = "0.4.5-dev1"  # pragma: no cover
+__version__ = "0.4.5"  # pragma: no cover

unstructured_inference/models/base.py

Lines changed: 5 additions & 0 deletions

@@ -14,11 +14,16 @@
     UnstructuredYoloXModel,
 )
 
+DEFAULT_MODEL = "detectron2_onnx"
+
 
 def get_model(model_name: Optional[str] = None) -> UnstructuredModel:
     """Gets the model object by model name."""
     # TODO(alan): These cases are similar enough that we can probably do them all together with
     # importlib
+    if model_name is None:
+        model_name = DEFAULT_MODEL
+
     if model_name in DETECTRON2_MODEL_TYPES:
         model: UnstructuredModel = UnstructuredDetectronModel()
         model.initialize(**DETECTRON2_MODEL_TYPES[model_name])
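A small sketch of the resulting lookup behavior, using only the model keys visible in this commit:

```python
from unstructured_inference.models.base import get_model

# No argument: model_name falls back to DEFAULT_MODEL ("detectron2_onnx"), so the
# ONNX Detectron2 model is initialized without a detectron2 installation.
default_model = get_model()

# The layoutparser-based Detectron2 weights remain available under the explicit
# "detectron2_lp" key introduced in this commit (this path still requires detectron2).
lp_model = get_model("detectron2_lp")
```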

unstructured_inference/models/detectron2.py

Lines changed: 3 additions & 3 deletions

@@ -11,7 +11,7 @@
 
 from unstructured_inference.logger import logger
 from unstructured_inference.inference.layoutelement import LayoutElement
-from unstructured_inference.models.unstructuredmodel import UnstructuredModel
+from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
 from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
 
 
@@ -29,7 +29,7 @@
 # NOTE(alan): Entries are implemented as LazyDicts so that models aren't downloaded until they are
 # needed.
 MODEL_TYPES = {
-    None: LazyDict(
+    "detectron2_lp": LazyDict(
         model_path=LazyEvaluateInfo(
             hf_hub_download,
             "layoutparser/detectron2",
@@ -56,7 +56,7 @@
 }
 
 
-class UnstructuredDetectronModel(UnstructuredModel):
+class UnstructuredDetectronModel(UnstructuredObjectDetectionModel):
     """Unstructured model wrapper for Detectron2LayoutModel."""
 
     def predict(self, x: Image):

unstructured_inference/models/detectron2onnx.py

Lines changed: 2 additions & 2 deletions

@@ -6,7 +6,7 @@
 
 from unstructured_inference.logger import logger
 from unstructured_inference.inference.layoutelement import LayoutElement
-from unstructured_inference.models.unstructuredmodel import UnstructuredModel
+from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
 from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
 import onnxruntime
 import numpy as np
@@ -37,7 +37,7 @@
 }
 
 
-class UnstructuredDetectronONNXModel(UnstructuredModel):
+class UnstructuredDetectronONNXModel(UnstructuredObjectDetectionModel):
     """Unstructured model wrapper for detectron2 ONNX model."""
 
     # The model was trained and exported with this shape
