Skip to content

Commit c51e1c6

Browse files
authored
[onert/python] Support inference benchmark (#15192)
This commit adds inference benchmark sample and latency measurement support. - Added `inference_benchmark.py` sample for measuring inference latency and memory usage - Supports static shape override via `--input-shape` - Measures MODEL_LOAD / PREPARE / EXECUTE / PEAK memory (RSS) and I/O / run latency - Updated `session.infer()` API to optionally return latency metrics using `measure=True` - Fixed potential memory accumulation in `set_inputs()` and `set_outputs()` by clearing internal buffers each call - Added `_time_block()` context manager for clean latency measurement implementation ONE-DCO-1.0-Signed-off-by: ragmani <ragmani0216@gmail.com>
1 parent 83c200e commit c51e1c6

File tree

3 files changed

+169
-8
lines changed

3 files changed

+169
-8
lines changed

runtime/onert/api/python/package/common/basesession.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ def set_inputs(self, size, inputs_array=[]):
9090
raise ValueError(
9191
"Session is not initialized with a model. Please compile with a model before setting inputs."
9292
)
93+
94+
self.inputs = []
9395
for i in range(size):
9496
input_tensorinfo = self.session.input_tensorinfo(i)
9597

@@ -115,6 +117,8 @@ def set_outputs(self, size):
115117
raise ValueError(
116118
"Session is not initialized with a model. Please compile a model before setting outputs."
117119
)
120+
121+
self.outputs = []
118122
for i in range(size):
119123
output_tensorinfo = self.session.output_tensorinfo(i)
120124
output_array = np.zeros((num_elems(output_tensorinfo)),

runtime/onert/api/python/package/infer/session.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
from typing import List, Any
1+
from typing import List, Union, Tuple, Dict
22
import numpy as np
3+
import time
4+
from contextlib import contextmanager
35

46
from ..native.libnnfw_api_pybind import infer, tensorinfo
57
from ..common.basesession import BaseSession
@@ -57,7 +59,12 @@ def update_inputs_tensorinfo(self, new_infos: List[tensorinfo]) -> None:
5759
f"{info.dims[:info.rank]}")
5860
self.session.set_input_tensorinfo(i, info)
5961

60-
def infer(self, inputs_array: List[np.ndarray]) -> List[np.ndarray]:
62+
def infer(
63+
self,
64+
inputs_array: List[np.ndarray],
65+
*,
66+
measure: bool = False
67+
) -> Union[List[np.ndarray], Tuple[List[np.ndarray], Dict[str, float]]]:
6168
"""
6269
Run a complete inference cycle:
6370
- If the session has not been prepared or outputs have not been set, call prepare() and set_outputs().
@@ -72,15 +79,22 @@ def infer(self, inputs_array: List[np.ndarray]) -> List[np.ndarray]:
7279
7380
Args:
7481
inputs_array (list[np.ndarray]): List of numpy arrays representing the input data.
82+
measure (bool): If True, measure prepare/io/run latencies (ms).
7583
7684
Returns:
7785
list[np.ndarray]: A list containing the output numpy arrays.
86+
OR
87+
(outputs, metrics): Tuple where metrics is a dict with keys
88+
'prepare_time_ms', 'io_time_ms', 'run_time_ms'
7889
"""
90+
metrics: Dict[str, float] = {}
91+
7992
# Check if the session is prepared. If not, call prepare() and set_outputs() once.
8093
if not self._prepared:
81-
self.session.prepare()
82-
self.set_outputs(self.session.output_size())
83-
self._prepared = True
94+
with self._time_block(metrics, 'prepare_time_ms', measure):
95+
self.session.prepare()
96+
self.set_outputs(self.session.output_size())
97+
self._prepared = True
8498

8599
# Verify that the number of provided inputs matches the session's expected input count.
86100
expected_input_size: int = self.session.input_size()
@@ -90,8 +104,23 @@ def infer(self, inputs_array: List[np.ndarray]) -> List[np.ndarray]:
90104
)
91105

92106
# Configure input buffers using the current session's input size and provided data.
93-
self.set_inputs(expected_input_size, inputs_array)
107+
with self._time_block(metrics, 'io_time_ms', measure):
108+
self.set_inputs(expected_input_size, inputs_array)
109+
94110
# Execute the inference.
95-
self.session.run()
111+
with self._time_block(metrics, 'run_time_ms', measure):
112+
self.session.run()
113+
114+
# TODO: Support dynamic shapes for outputs.
115+
96116
# Return the output buffers.
97-
return self.outputs
117+
return (self.outputs.copy(), metrics) if measure else self.outputs.copy()
118+
119+
@contextmanager
120+
def _time_block(self, metrics: Dict[str, float], key: str, mesure: bool):
121+
if mesure:
122+
start = time.perf_counter()
123+
yield
124+
metrics[key] = (time.perf_counter() - start) * 1000
125+
else:
126+
yield
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
import argparse
2+
import numpy as np
3+
import psutil
4+
import os
5+
from typing import List
6+
from onert import infer
7+
# TODO: Import tensorinfo from onert
8+
from onert.native.libnnfw_api_pybind import tensorinfo
9+
10+
11+
def get_memory_usage_mb() -> float:
    """Return this process's resident-set size (RSS) in megabytes."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / (1024 * 1024)
15+
16+
17+
def parse_shapes(shape_strs: List[str]) -> List[List[int]]:
    """Parse comma-separated shape strings into lists of ints.

    Args:
        shape_strs: Strings such as "1,224,224,3", one per model input.

    Returns:
        One list of dimension ints per input string.

    Raises:
        ValueError: If any string is not a comma-separated list of ints.
    """
    shapes: List[List[int]] = []
    for s in shape_strs:
        try:
            shapes.append([int(dim) for dim in s.strip().split(",")])
        except ValueError as exc:
            # Narrowed from a blanket `except Exception` (which could mask
            # unrelated bugs) and chained so the original parse error stays
            # visible in the traceback.
            raise ValueError(
                f"Invalid shape string: '{s}' (expected: 1,224,224,3 ...)") from exc
    return shapes
25+
26+
27+
def apply_static_shapes(sess: infer.session,
                        static_shapes: List[List[int]]) -> List[tensorinfo]:
    """Override the model's input shapes with user-supplied static shapes.

    Args:
        sess: Inference session whose input tensorinfos are rewritten.
        static_shapes: One shape (list of dims) per model input.

    Returns:
        The updated tensorinfo list, after applying it to ``sess``.

    Raises:
        ValueError: On input-count mismatch or per-input rank mismatch.
    """
    current = sess.get_inputs_tensorinfo()
    if len(current) != len(static_shapes):
        raise ValueError(
            f"Input count mismatch: model expects {len(current)} inputs, but got {len(static_shapes)} shapes"
        )

    updated: List[tensorinfo] = []
    for idx, (info, dims) in enumerate(zip(current, static_shapes)):
        # Only same-rank overrides are allowed; the dims are replaced in place.
        if len(dims) != info.rank:
            raise ValueError(
                f"Rank mismatch for input {idx}: expected rank {info.rank}, got {len(dims)}"
            )
        info.dims = dims
        info.rank = len(dims)
        updated.append(info)

    sess.update_inputs_tensorinfo(updated)
    return updated
49+
50+
51+
def benchmark_inference(nnpackage_path: str, backends: str, input_shapes: List[List[int]],
                        repeat: int, warmup: int = 3):
    """Benchmark inference latency and RSS memory usage of an nnpackage.

    Args:
        nnpackage_path: Path to the .nnpackage directory.
        backends: Backend string passed to the session (e.g. "cpu").
        input_shapes: Static shapes overriding the model inputs, or a
            falsy value to keep the model's own shapes.
        repeat: Number of measured runs (must be >= 1).
        warmup: Number of un-measured warm-up runs. Defaults to 3, which
            matches the previously hard-coded behavior.

    Raises:
        ValueError: If repeat < 1 (averaging would divide by zero).
    """
    if repeat < 1:
        raise ValueError(f"repeat must be >= 1, got {repeat}")

    mem_before_kb = get_memory_usage_mb() * 1024

    sess = infer.session(path=nnpackage_path, backends=backends)
    model_load_kb = get_memory_usage_mb() * 1024 - mem_before_kb

    input_infos = apply_static_shapes(
        sess, input_shapes) if input_shapes else sess.get_inputs_tensorinfo()

    # Create dummy input arrays matching each input's shape and dtype.
    dummy_inputs = []
    for info in input_infos:
        shape = tuple(info.dims[:info.rank])
        dummy_inputs.append(np.random.rand(*shape).astype(info.dtype))

    prepare = total_io = total_run = 0.0

    # Warm-up runs: the first infer() triggers prepare(), whose latency and
    # memory cost are captured separately from the steady-state runs.
    prepare_kb = 0
    for _ in range(warmup):
        outputs, metrics = sess.infer(dummy_inputs, measure=True)
        del outputs  # release output buffers before the next memory sample
        if "prepare_time_ms" in metrics:
            prepare = metrics["prepare_time_ms"]
            prepare_kb = get_memory_usage_mb() * 1024 - mem_before_kb

    # Measured runs.
    for _ in range(repeat):
        outputs, metrics = sess.infer(dummy_inputs, measure=True)
        del outputs
        total_io += metrics["io_time_ms"]
        total_run += metrics["run_time_ms"]

    execute_kb = get_memory_usage_mb() * 1024 - mem_before_kb

    print("======= Inference Benchmark =======")
    print(f"- Warmup runs : {warmup}")
    print(f"- Measured runs : {repeat}")
    print(f"- Prepare : {prepare:.3f} ms")
    print(f"- Avg I/O : {total_io / repeat:.3f} ms")
    print(f"- Avg Run : {total_run / repeat:.3f} ms")
    print("===================================")
    print("RSS")
    print(f"- MODEL_LOAD : {model_load_kb:.0f} KB")
    print(f"- PREPARE : {prepare_kb:.0f} KB")
    print(f"- EXECUTE : {execute_kb:.0f} KB")
    print(f"- PEAK : {max(model_load_kb, prepare_kb, execute_kb):.0f} KB")
    print("===================================")
101+
102+
# TODO: Support dynamic(on-the-fly) shape
def main():
    """Parse CLI arguments and run the inference benchmark."""
    cli = argparse.ArgumentParser(description="ONERT Inference Benchmark")
    cli.add_argument("nnpackage", type=str, help="Path to .nnpackage directory")
    cli.add_argument("--backends",
                     type=str,
                     default="cpu",
                     help="Backends to use (default: cpu)")
    cli.add_argument("--input-shape",
                     nargs="+",
                     help="Input shapes for each input (e.g. 1,224,224,3 1,10)")
    cli.add_argument("--repeat",
                     type=int,
                     default=5,
                     help="Number of measured inference repetitions")

    args = cli.parse_args()
    # Only parse shapes when the flag was actually given; None keeps the
    # model's own input shapes.
    shapes = parse_shapes(args.input_shape) if args.input_shape else None

    benchmark_inference(nnpackage_path=args.nnpackage,
                        backends=args.backends,
                        input_shapes=shapes,
                        repeat=args.repeat)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)