
Commit d961704

Merge pull request #149 from FocoosAI/fix/onnx-cpu-infer
fix(InferModel): onnx cpu inference by adding an "auto" device parameter
2 parents f69c6ee + ebcd463 commit d961704

File tree

8 files changed: +61 −27 lines

- docs/cli.md
- focoos/infer/infer_model.py
- focoos/infer/runtimes/load_runtime.py
- focoos/infer/runtimes/torchscript.py
- focoos/models/focoos_model.py
- focoos/utils/system.py
- tests/test_runtime.py
- uv.lock

docs/cli.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -251,7 +251,7 @@ The interface will automatically open in your default web browser, typically at
 | `--source` | Input source (predict only) | **Required** | `image.jpg` |
 | `--im-size` | Input image size | 640 | Any positive integer |
 | `--batch-size` | Batch size | 16 | Powers of 2 recommended |
-| `--device` | Compute device | `cuda` | `cuda`, `cpu`, `mps` |
+| `--device` | Compute device | `cuda` | `cuda`, `cpu` |
 | `--workers` | Data loading workers | 4 | 0-16 recommended |
 | `--output-dir` | Output directory | Auto-generated | Any valid path |
```

focoos/infer/infer_model.py

Lines changed: 16 additions & 9 deletions

```diff
@@ -22,7 +22,7 @@
 import os
 from pathlib import Path
 from time import perf_counter
-from typing import Optional, Tuple, Union
+from typing import Literal, Optional, Tuple, Union
 
 import numpy as np
 import supervision as sv
@@ -41,7 +41,7 @@
 )
 from focoos.processor.processor_manager import ProcessorManager
 from focoos.utils.logger import get_logger
-from focoos.utils.system import get_cpu_name, get_device_name
+from focoos.utils.system import get_cpu_name, get_device_name, get_device_type
 from focoos.utils.vision import (
     annotate_frame,
     image_loader,
@@ -55,6 +55,7 @@ def __init__(
         self,
         model_dir: Union[str, Path],
         runtime_type: Optional[RuntimeType] = None,
+        device: Literal["cuda", "cpu", "auto"] = "auto",
     ):
         """
         Initialize a LocalModel instance.
@@ -90,7 +91,12 @@ def __init__(
         # Determine runtime type and model format
         runtime_type = runtime_type or FOCOOS_CONFIG.runtime_type
         extension = ModelExtension.from_runtime_type(runtime_type)
-
+        if device == "auto":
+            self.device = get_device_type()
+        elif runtime_type == RuntimeType.ONNX_CPU:
+            self.device = "cpu"
+        else:
+            self.device = device
         # Set model directory and path
         self.model_dir: Union[str, Path] = model_dir
         self.model_path = os.path.join(model_dir, f"model.{extension.value}")
@@ -111,7 +117,7 @@ def __init__(
             model_config = ConfigManager.from_dict(self.model_info.model_family, self.model_info.config)
             self.processor = ProcessorManager.get_processor(
                 self.model_info.model_family, model_config, self.model_info.im_size
-            )
+            ).eval()
         except Exception as e:
             logger.error(f"Error creating model config: {e}")
             raise e
@@ -123,10 +129,11 @@ def __init__(
 
         # Load runtime for inference
         self.runtime: BaseRuntime = load_runtime(
-            runtime_type,
-            str(self.model_path),
-            self.model_info,
-            FOCOOS_CONFIG.warmup_iter,
+            runtime_type=runtime_type,
+            model_path=str(self.model_path),
+            model_info=self.model_info,
+            warmup_iter=FOCOOS_CONFIG.warmup_iter,
+            device=self.device,
         )
 
     def _read_model_info(self) -> ModelInfo:
@@ -175,7 +182,7 @@ def infer(
         t0 = perf_counter()
         im = image_loader(image)
         t1 = perf_counter()
-        tensors, _ = self.processor.preprocess(inputs=im, device="cuda")
+        tensors, _ = self.processor.preprocess(inputs=im, device=self.device)
         # logger.debug(f"Input image size: {im.shape}")
         t2 = perf_counter()
```
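Taken together, these changes resolve the device once in the constructor and reuse it for both preprocessing and the runtime. A minimal usage sketch (assuming `InferModel` is the class defined in `focoos/infer/infer_model.py`, as the PR title suggests; `./exported_model` is a placeholder directory holding the exported model and its model info):

```python
from focoos.infer.infer_model import InferModel  # class name assumed from the PR title
from focoos.ports import RuntimeType

# device="auto" (the new default) resolves via get_device_type():
# "cuda" when torch.cuda.is_available(), "cpu" otherwise.
model = InferModel(model_dir="./exported_model")

# With an explicit device and the ONNX CPU runtime, the constructor
# forces self.device = "cpu", so preprocessing no longer targets the
# previously hard-coded "cuda".
cpu_model = InferModel(
    model_dir="./exported_model",
    runtime_type=RuntimeType.ONNX_CPU,
    device="cpu",
)
```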

focoos/infer/runtimes/load_runtime.py

Lines changed: 4 additions & 1 deletion

```diff
@@ -1,3 +1,5 @@
+from typing import Literal
+
 from focoos.infer.runtimes.base import BaseRuntime
 from focoos.ports import ModelInfo, OnnxRuntimeOpts, RuntimeType, TorchscriptRuntimeOpts
 from focoos.utils.logger import get_logger
@@ -25,6 +27,7 @@ def load_runtime(
     model_path: str,
     model_info: ModelInfo,
     warmup_iter: int = 50,
+    device: Literal["cuda", "cpu", "auto"] = "auto",
 ) -> BaseRuntime:
     """
     Creates and returns a runtime instance based on the specified runtime type.
@@ -57,7 +60,7 @@ def load_runtime(
         from focoos.infer.runtimes.torchscript import TorchscriptRuntime
 
         opts = TorchscriptRuntimeOpts(warmup_iter=warmup_iter)
-        return TorchscriptRuntime(model_path=model_path, opts=opts, model_info=model_info)
+        return TorchscriptRuntime(model_path=model_path, opts=opts, model_info=model_info, device=device)
     else:
         if not ORT_AVAILABLE:
             logger.error(
```
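For reference, a hedged sketch of the updated call, mirroring the keyword-argument style `InferModel` now uses (`model_info` is a placeholder assumed to be loaded elsewhere):

```python
from focoos.infer.runtimes.load_runtime import load_runtime
from focoos.ports import ModelInfo, RuntimeType

# Placeholder: in practice this comes from the exported model's
# metadata (see InferModel._read_model_info above).
model_info: ModelInfo = ...  # type: ignore[assignment]

runtime = load_runtime(
    runtime_type=RuntimeType.TORCHSCRIPT_32,
    model_path="./exported_model/model.pt",
    model_info=model_info,
    warmup_iter=50,
    device="auto",  # in this diff, only the TorchScript branch forwards device
)
```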

focoos/infer/runtimes/torchscript.py

Lines changed: 8 additions & 4 deletions

```diff
@@ -1,13 +1,13 @@
 from time import perf_counter
-from typing import Tuple, Union
+from typing import Literal, Tuple, Union
 
 import numpy as np
 import torch
 
 from focoos.infer.runtimes.base import BaseRuntime
 from focoos.ports import LatencyMetrics, ModelInfo, Task, TorchscriptRuntimeOpts
 from focoos.utils.logger import get_logger
-from focoos.utils.system import get_cpu_name, get_device_name
+from focoos.utils.system import get_cpu_name, get_device_name, get_device_type
 
 logger = get_logger("TorchscriptRuntime")
 
@@ -32,8 +32,12 @@ def __init__(
         model_path: str,
         opts: TorchscriptRuntimeOpts,
         model_info: ModelInfo,
+        device: Literal["cuda", "cpu", "auto"] = "auto",
     ):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if device == "auto":
+            self.device = torch.device(get_device_type())
+        else:
+            self.device = torch.device(device)
         logger.info(f"🔧 Device: {self.device}")
         self.opts = opts
         self.model_info = model_info
@@ -49,7 +53,7 @@ def __init__(
         )
         logger.info(f"⏱️ Warming up model {self.model_info.name} on {self.device}, size: {size}x{size}..")
         with torch.no_grad():
-            np_image = torch.rand(1, 3, size, size, device=self.device)
+            np_image = torch.rand(1, 3, size, size).to(self.device)
             for _ in range(self.opts.warmup_iter):
                 self.model(np_image)
```
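The constructor's device resolution reduces to a single expression; a standalone equivalent for illustration (the `resolve_device` helper is hypothetical, not part of this PR):

```python
import torch

from focoos.utils.system import get_device_type


def resolve_device(device: str = "auto") -> torch.device:
    # Mirrors TorchscriptRuntime.__init__: "auto" defers to
    # get_device_type(); any explicit value is used verbatim.
    return torch.device(get_device_type() if device == "auto" else device)
```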

focoos/models/focoos_model.py

Lines changed: 8 additions & 5 deletions

```diff
@@ -32,7 +32,7 @@
 from focoos.utils.distributed.dist import launch
 from focoos.utils.env import TORCH_VERSION
 from focoos.utils.logger import get_logger
-from focoos.utils.system import get_cpu_name, get_device_name, get_focoos_version, get_system_info
+from focoos.utils.system import get_cpu_name, get_device_name, get_device_type, get_focoos_version, get_system_info
 from focoos.utils.vision import annotate_frame, image_loader
 
 logger = get_logger("FocoosModel")
@@ -393,7 +393,7 @@ def export(
         runtime_type: RuntimeType = RuntimeType.TORCHSCRIPT_32,
         onnx_opset: int = 17,
         out_dir: Optional[str] = None,
-        device: Literal["cuda", "cpu"] = "cuda",
+        device: Literal["cuda", "cpu", "auto"] = "auto",
         overwrite: bool = True,
         image_size: Optional[Union[int, Tuple[int, int]]] = None,
     ) -> InferModel:
@@ -416,9 +416,12 @@ def export(
         Raises:
             ValueError: If unsupported PyTorch version or export format.
         """
-        if device == "cuda" and not torch.cuda.is_available():
-            device = "cpu"
-            logger.warning("CUDA is not available. Using CPU for export.")
+        if device == "auto":
+            device = get_device_type()  # type: ignore
+        else:
+            device = device
+
+        logger.info(f"🔧 Export Device: {device}")
         if out_dir is None:
             out_dir = os.path.join(MODELS_DIR, self.model_info.ref or self.model_info.name)
 
```
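Export now follows the same `"auto"` convention instead of silently falling back from CUDA to CPU. A hedged usage sketch, assuming `model` is a `FocoosModel` instance obtained elsewhere:

```python
from focoos.ports import RuntimeType

# device="auto" (the new default) resolves via get_device_type() and is
# logged as "🔧 Export Device: ..." before the export begins.
infer_model = model.export(
    runtime_type=RuntimeType.TORCHSCRIPT_32,
    device="auto",
)
```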

focoos/utils/system.py

Lines changed: 10 additions & 1 deletion

```diff
@@ -7,7 +7,9 @@
 import time
 import zipfile
 from pathlib import Path
-from typing import List, Optional, Union
+from typing import List, Literal, Optional, Union
+
+import torch
 
 from focoos.ports import GPUInfo
 from focoos.utils.distributed import comm
@@ -413,3 +415,10 @@ def get_device_name() -> str:
     else:
         cpu_name = get_cpu_name()
     return cpu_name if cpu_name is not None else "CPU"
+
+
+def get_device_type() -> Literal["cuda", "cpu"]:
+    if torch.cuda.is_available():
+        return "cuda"
+    else:
+        return "cpu"
```
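`get_device_type()` is the single helper the other call sites now rely on; a quick check of its behavior:

```python
from focoos.utils.system import get_device_type

# "cuda" if torch.cuda.is_available() else "cpu"; note "mps" is not an
# option, consistent with the docs/cli.md change above.
print(get_device_type())  # e.g. "cpu" on a machine without CUDA
```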

tests/test_runtime.py

Lines changed: 13 additions & 5 deletions

```diff
@@ -147,11 +147,19 @@ def test_load_runtime(mocker: MockerFixture, tmp_path, runtime_type, expected_op
 
     # assertions
     assert runtime is not None
-    mock_runtime_class.assert_called_once_with(
-        model_path,
-        expected_opts,
-        mock_model_metadata,
-    )
+    if runtime_type == RuntimeType.TORCHSCRIPT_32:
+        mock_runtime_class.assert_called_once_with(
+            model_path=model_path,
+            opts=expected_opts,
+            model_info=mock_model_metadata,
+            device="auto",
+        )
+    else:
+        mock_runtime_class.assert_called_once_with(
+            model_path,
+            expected_opts,
+            mock_model_metadata,
+        )
 
 
 def test_load_unavailable_runtime(mocker: MockerFixture):
```

uv.lock

Lines changed: 1 addition & 1 deletion (generated file; diff not rendered)
