Commit 16fddf2

Adds task object-detection (#79)

* add object-detection
* update shapes

1 parent 1642132 commit 16fddf2

File tree

8 files changed, +415 -0 lines changed

CHANGELOGS.rst

Lines changed: 6 additions & 0 deletions

@@ -1,6 +1,12 @@
 Change Logs
 ===========
 
+0.4.4
++++++
+
+* :pr:`79`: implements task ``object-detection``
+* :pr:`78`: uses *onnx-weekly* instead of *onnx* to avoid conflicts with *onnxscript*
+
 0.4.3
 +++++
 
_doc/api/tasks/index.rst

Lines changed: 1 addition & 0 deletions

@@ -41,6 +41,7 @@ Or:
     image_classification
     image_text_to_text
     mixture_of_expert
+    object_detection
     sentence_similarity
     text_classification
     text_generation
_doc/api/tasks/object_detection.rst

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+
+onnx_diagnostic.tasks.object_detection
+======================================
+
+.. automodule:: onnx_diagnostic.tasks.object_detection
+    :members:
+    :no-undoc-members:
Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
+import unittest
+import torch
+from onnx_diagnostic.ext_test_case import ExtTestCase, hide_stdout, has_transformers
+from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
+from onnx_diagnostic.torch_export_patches import torch_export_patches
+from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str
+
+
+class TestTasks(ExtTestCase):
+    @hide_stdout()
+    def test_object_detection(self):
+        mid = "hustvl/yolos-tiny"
+        data = get_untrained_model_with_inputs(mid, verbose=1, add_second_input=True)
+        self.assertEqual(data["task"], "object-detection")
+        self.assertIn((data["size"], data["n_weights"]), [(8160384, 2040096)])
+        model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
+        model(**inputs)
+        model(**data["inputs2"])
+        if not has_transformers("4.51.999"):
+            raise unittest.SkipTest("Requires transformers>=4.52")
+        with torch_export_patches(patch_transformers=True, verbose=10):
+            torch.export.export(
+                model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
+            )
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
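Outside the test harness, the same flow can be reproduced directly. A minimal sketch, assuming only the calls the test above already makes:

# Minimal sketch mirroring the test above: build an untrained clone of
# hustvl/yolos-tiny together with matching inputs, then export it with
# dynamic shapes.
import torch
from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
from onnx_diagnostic.torch_export_patches import torch_export_patches
from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str

data = get_untrained_model_with_inputs("hustvl/yolos-tiny")
model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]

with torch_export_patches(patch_transformers=True):
    ep = torch.export.export(
        model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
    )
print(ep)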

_unittests/ut_tasks/try_tasks.py

Lines changed: 38 additions & 0 deletions

@@ -502,6 +502,44 @@ def test_falcon_mamba_7b(self):
         for seq in sequences:
             print(f"Result: {seq['generated_text']}")
 
+    @never_test()
+    def test_object_detection(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k object_
+        # https://huggingface.co/hustvl/yolos-tiny
+
+        from transformers import YolosImageProcessor, YolosForObjectDetection
+        from PIL import Image
+        import torch
+        import requests
+
+        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        image = Image.open(requests.get(url, stream=True).raw)
+
+        model = YolosForObjectDetection.from_pretrained("hustvl/yolos-tiny")
+        image_processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
+
+        inputs = image_processor(images=image, return_tensors="pt")
+        print()
+        print("-- inputs", string_type(inputs, with_shape=True, with_min_max=True))
+        outputs = model(**inputs)
+        print("-- outputs", string_type(outputs, with_shape=True, with_min_max=True))
+
+        # model predicts bounding boxes and corresponding COCO classes
+        # logits = outputs.logits
+        # bboxes = outputs.pred_boxes
+
+        # print results
+        target_sizes = torch.tensor([image.size[::-1]])
+        results = image_processor.post_process_object_detection(
+            outputs, threshold=0.9, target_sizes=target_sizes
+        )[0]
+        for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+            box = [round(i, 2) for i in box.tolist()]
+            print(
+                f"Detected {model.config.id2label[label.item()]} with confidence "
+                f"{round(score.item(), 3)} at location {box}"
+            )
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
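post_process_object_detection returns one dictionary per image whose scores, labels, and boxes tensors share the first axis. A hedged sketch of filtering such a result manually; the helper name is illustrative, not part of the commit:

# Illustrative helper, not part of the commit: keep only detections whose
# score exceeds a threshold. scores, labels, and boxes share the first
# axis, so one boolean mask filters all three tensors consistently.
def keep_confident(results: dict, min_score: float = 0.9) -> dict:
    mask = results["scores"] > min_score
    return {key: value[mask] for key, value in results.items()}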

onnx_diagnostic/tasks/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -6,6 +6,7 @@
     image_classification,
     image_text_to_text,
     mixture_of_expert,
+    object_detection,
     sentence_similarity,
     text_classification,
     text_generation,
@@ -20,6 +21,7 @@
     image_classification,
     image_text_to_text,
     mixture_of_expert,
+    object_detection,
     sentence_similarity,
     text_classification,
     text_generation,
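A quick sanity check of the registration, assuming nothing beyond what the diff shows (the module exposes the ``__TASK__`` constant defined below):

# Sanity check based only on this diff: the new module is importable from
# onnx_diagnostic.tasks and names its task "object-detection".
from onnx_diagnostic.tasks import object_detection

assert object_detection.__TASK__ == "object-detection"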
onnx_diagnostic/tasks/object_detection.py

Lines changed: 123 additions & 0 deletions

@@ -0,0 +1,123 @@
+from typing import Any, Callable, Dict, Optional, Tuple
+import torch
+from ..helpers.config_helper import update_config, check_hasattr
+
+__TASK__ = "object-detection"
+
+
+def reduce_model_config(config: Any) -> Dict[str, Any]:
+    """Reduces the model size."""
+    check_hasattr(config, ("num_hidden_layers", "hidden_sizes"))
+    kwargs = dict(
+        num_hidden_layers=(
+            min(config.num_hidden_layers, 2)
+            if hasattr(config, "num_hidden_layers")
+            else len(config.hidden_sizes)
+        )
+    )
+    update_config(config, kwargs)
+    return kwargs
+
+
+def get_inputs(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    input_width: int,
+    input_height: int,
+    input_channels: int,
+    batch_size: int = 2,
+    dynamic_rope: bool = False,
+    add_second_input: bool = False,
+    **kwargs,  # unused
+):
+    """
+    Generates inputs for task ``object-detection``.
+
+    :param model: model used to get the missing information
+    :param config: configuration used to generate the model
+    :param batch_size: batch size
+    :param input_channels: number of input channels
+    :param input_width: input width
+    :param input_height: input height
+    :return: dictionary
+    """
+    assert isinstance(
+        input_width, int
+    ), f"Unexpected type for input_width {type(input_width)}{config}"
+    assert isinstance(
+        input_height, int
+    ), f"Unexpected type for input_height {type(input_height)}{config}"
+
+    shapes = {
+        "pixel_values": {
+            0: torch.export.Dim("batch", min=1, max=1024),
+            2: "width",
+            3: "height",
+        }
+    }
+    inputs = dict(
+        pixel_values=torch.randn(batch_size, input_channels, input_width, input_height).clamp(
+            -1, 1
+        ),
+    )
+    res = dict(inputs=inputs, dynamic_shapes=shapes)
+    if add_second_input:
+        res["inputs2"] = get_inputs(
+            model=model,
+            config=config,
+            input_width=input_width + 1,
+            input_height=input_height + 1,
+            input_channels=input_channels,
+            batch_size=batch_size + 1,
+            dynamic_rope=dynamic_rope,
+            **kwargs,
+        )["inputs"]
+    return res
+
+
+def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
+    """
+    Inputs kwargs.
+
+    If the configuration is None, the function selects typical dimensions.
+    """
+    if config is not None:
+        if (
+            hasattr(config, "model_type")
+            and config.model_type == "timm_wrapper"
+            and not hasattr(config, "num_hidden_layers")
+        ):
+            input_size = config.pretrained_cfg["input_size"]
+            kwargs = dict(
+                batch_size=2,
+                input_width=input_size[-2],
+                input_height=input_size[-1],
+                input_channels=input_size[-3],
+            )
+            return kwargs, get_inputs
+
+        check_hasattr(config, ("image_size", "architectures"), "num_channels")
+    if config is not None:
+        if hasattr(config, "image_size"):
+            image_size = config.image_size
+        else:
+            assert config.architectures, f"empty architecture in {config}"
+            from ..torch_models.hghub.hub_api import get_architecture_default_values
+
+            default_values = get_architecture_default_values(config.architectures[0])
+            image_size = default_values["image_size"]
+    if config is None or isinstance(image_size, int):
+        kwargs = dict(
+            batch_size=2,
+            input_width=224 if config is None else image_size,
+            input_height=224 if config is None else image_size,
+            input_channels=3 if config is None else config.num_channels,
+        )
+    else:
+        kwargs = dict(
+            batch_size=2,
+            input_width=config.image_size[0],
+            input_height=config.image_size[1],
+            input_channels=config.num_channels,
+        )
+    return kwargs, get_inputs
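A hedged usage sketch of the two entry points added above: with config=None, random_input_kwargs falls back to the default 224x224, 3-channel dimensions and returns get_inputs to materialize them:

# Sketch under the config=None fallback: default dimensions are selected,
# then get_inputs builds the pixel_values tensor and the dynamic shapes.
from onnx_diagnostic.tasks.object_detection import random_input_kwargs

kwargs, fct = random_input_kwargs(None)       # batch_size=2, 224x224, 3 channels
res = fct(model=None, config=None, **kwargs)  # model is unused by this task
print(res["inputs"]["pixel_values"].shape)    # torch.Size([2, 3, 224, 224])
print(res["dynamic_shapes"])                  # batch, width, height stay dynamic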
