
Commit 2ca537d

committed
python API samples
1 parent 7a635da commit 2ca537d

File tree

6 files changed (+260 −36 lines changed)


python/README.md

Lines changed: 2 additions & 1 deletion
@@ -2,7 +2,8 @@
 
 ## API
 
-[Run the ONNX Runtime session creation and inference API](https://github.com/microsoft/onnxruntime-inference-examples/tree/main/python/api)
+The [api directory](https://github.com/microsoft/onnxruntime-inference-examples/tree/main/python/api) contains samples that demonstrate how to use the ONNX Runtime Python API.
+These samples show very minimal API usage that is not execution provider specific.
 
 ## OpenVINO Execution Provider
 
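For quick orientation, the minimal pattern these samples build on is: create an `InferenceSession`, then call `run` with a feed dict of NumPy arrays. A short sketch using the add model and the `x`/`y`/`z` tensor names from `getting_started.py` (added later in this commit):

```
import numpy as np
import onnxruntime

# Create a CPU session for an already-exported model and run it with NumPy inputs.
session = onnxruntime.InferenceSession(".model.onnx", providers=["CPUExecutionProvider"])
z = session.run(["z"], {"x": np.float32([1.0, 2.0, 3.0]), "y": np.float32([4.0, 5.0, 6.0])})
print(z[0])  # [5. 7. 9.]
```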

python/api/README.md

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# Python API Samples

This directory contains sample scripts demonstrating various ONNX Runtime Python API features:

- `getting_started.py`
  Introduces the basics of exporting a simple PyTorch model to ONNX, running inference with ONNX Runtime, and handling inputs and outputs as NumPy arrays.

- `compile_api.py`
  Shows how to programmatically compile an ONNX model for a specific execution provider (e.g., TensorRT RTX) into an [EP context](https://onnxruntime.ai/docs/execution-providers/EP-Context-Design.html) ONNX model. The sample measures model load and compile times to demonstrate the performance improvement, and an input model can be specified; see the example invocation after this list.
  - For `NvTensorRTRTXExecutionProvider`, try adding the provider option for a runtime cache (`-p NvTensorRTRTXExecutionProvider -popt "nv_runtime_cache_path=./cache"`), which further increases the load speed of a compiled model.

- `device_bindings.py`
  Demonstrates advanced device bindings, including running ONNX models on CPU or GPU, using ONNX Runtime's `OrtValue` for device memory, and direct inference with PyTorch tensors on the selected device. It also demonstrates how to interact with ORT using DLPack.
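An example invocation of `compile_api.py` with the options described above (the output path and cache directory are only examples) could look like:

```
python compile_api.py -o model_ctx.onnx -p NvTensorRTRTXExecutionProvider -popt "nv_runtime_cache_path=./cache"
```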
Each sample is self-contained and includes comments explaining the main concepts.

### Setup

Besides installing the ONNX Runtime package, there are a few other dependencies required for the samples to work correctly.
Please pick and install your preferred [onnxruntime package](https://onnxruntime.ai/docs/get-started/with-python.html#install-onnx-runtime) manually.
```
pip install -r requirements.txt
# to install ORT GPU with the required CUDA dependencies
pip install onnxruntime-gpu[cuda,cudnn]
```

python/api/compile_api.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
import argparse
import os
import time

import onnxruntime as ort

# Set logger severity to warning level to reduce console output.
ort.set_default_logger_severity(3)

# Default execution provider for NVIDIA GPUs.
TRT_RTX_EP = "NvTensorRTRTXExecutionProvider"


def compile(input_path, output_path, provider, ep_options, embed_mode=False):
    """
    Compiles an ONNX model for a specified execution provider and saves it.

    Args:
        input_path (str): Path to the original ONNX model.
        output_path (str): Path to save the compiled model.
        provider (str): The name of the execution provider.
        ep_options (dict): The execution provider options.
        embed_mode (bool): If True, embeds the compiled binary data into the ONNX file.
    """
    # Remove the output file if it already exists to ensure a clean compilation.
    if os.path.exists(output_path):
        os.remove(output_path)
        print(f"> Previous compiled model at {output_path} removed.")

    # Create session options and add the provider.
    session_options = ort.SessionOptions()
    session_options.add_provider(provider, ep_options)

    # Create a ModelCompiler instance for the input model.
    model_compiler = ort.ModelCompiler(
        session_options,
        input_path,
        embed_compiled_data_into_model=embed_mode
    )

    print(f"\n> Compiling model with '{provider}'...")
    start = time.perf_counter()
    # Execute the compilation process.
    model_compiler.compile_to_file(output_path)
    stop = time.perf_counter()

    if os.path.exists(output_path):
        print("> Compiled successfully!")
        print(f"> Compile time: {stop - start:.3f} sec")
        print(f"> Compiled model saved at {output_path}")


def load_session(model_path, provider, ep_options):
    """
    Loads an ONNX model into an InferenceSession and measures the loading time.

    Args:
        model_path (str): Path to the ONNX model file.
        provider (str): The name of the execution provider.
        ep_options (dict): The execution provider options.
    """
    start = time.perf_counter()
    # Load the model using the specified provider and its options.
    # Equivalent alternative via SessionOptions:
    # session_options = ort.SessionOptions()
    # session_options.add_provider(provider, ep_options)
    # session = ort.InferenceSession(model_path, sess_options=session_options)
    session = ort.InferenceSession(model_path, providers=[(provider, ep_options)])
    stop = time.perf_counter()

    print(f"> Session load time: {stop - start:.3f} sec")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Compile ONNX model with ONNX Runtime")
    parser.add_argument("-i", "--model_path", type=str, default=None, help="Path to the ONNX model file")
    parser.add_argument("-o", "--output_path", type=str, default="model_ctx.onnx",
                        help="Path to save the compiled EP context model")
    parser.add_argument("-p", "--provider", default=TRT_RTX_EP, type=str, help="Execution Provider")
    parser.add_argument("-popt", "--provider_options", default=[], type=str, nargs="+",
                        help="Execution Provider options as key=value pairs")
    # Boolean flag controlling whether compiled binary data is embedded in the EP context node.
    parser.add_argument("--embed", action=argparse.BooleanOptionalAction, help="Binary data embedded within EP context node")
    args = parser.parse_args()

    if args.model_path is None:
        from getting_started import create_model

        args.model_path = create_model()
    ep_options = {}
    for kv_pair in args.provider_options:
        key, value = kv_pair.split("=")
        ep_options[key] = value

    print(f"""
-----------------------------------------------
ONNX Runtime Model Compilation Script
-----------------------------------------------
> Using Execution Provider: {args.provider}
> Using Execution Provider options: {ep_options}
> Embed Mode: {'Embedded' if args.embed else 'External'}
-----------------------------------------------
Available execution provider(s): {ort.get_available_providers()}
""")

    # Load and time the original model.
    print("\n> Loading regular onnx...")
    load_session(args.model_path, args.provider, ep_options=ep_options)

    # Compile the model.
    compile(args.model_path, args.output_path, args.provider,
            ep_options=ep_options, embed_mode=args.embed)

    # Load and time the compiled model.
    print("\n> Loading EP context model...")
    load_session(args.output_path, args.provider, ep_options=ep_options)

    print("\nProgram finished successfully.")

python/api/onnxruntime-python-api.py renamed to python/api/device_bindings.py

Lines changed: 58 additions & 35 deletions
@@ -4,12 +4,15 @@
 
 import numpy as np
 import torch
+import os
+import re
 import onnxruntime
 
 MODEL_FILE = '.model.onnx'
 DEVICE_NAME = 'cuda' if torch.cuda.is_available() else 'cpu'
 DEVICE_INDEX = 0  # Replace this with the index of the device you want to run on
 DEVICE=f'{DEVICE_NAME}:{DEVICE_INDEX}'
+LIB_EXT = 'so' if os.name != 'nt' else 'dll'
 
 # A simple model to calculate addition of two tensors
 def model():
@@ -32,39 +35,38 @@ def create_model(type: torch.dtype = torch.float32):
 
 # Create an ONNX Runtime session with the provided model
 def create_session(model: str) -> onnxruntime.InferenceSession:
+    available_providers = {device.ep_name for device in onnxruntime.get_ep_devices()}
     providers = ['CPUExecutionProvider']
     if torch.cuda.is_available():
-        providers.insert(0, 'CUDAExecutionProvider')
+        if 'CUDAExecutionProvider' in available_providers:
+            providers.insert(0, 'CUDAExecutionProvider')
+        if 'NvTensorRTRTXExecutionProvider' in available_providers:
+            providers.insert(0, 'NvTensorRTRTXExecutionProvider')
     return onnxruntime.InferenceSession(model, providers=providers)
 
-# Run the model on CPU consuming and producing numpy arrays
-def run(x: np.array, y: np.array) -> np.array:
-    session = create_session(MODEL_FILE)
-
-    z = session.run(["z"], {"x": x, "y": y})
-
-    return z[0]
 
 # Run the model on device consuming and producing ORTValues
 def run_with_data_on_device(x: np.array, y: np.array) -> onnxruntime.OrtValue:
     session = create_session(MODEL_FILE)
+    mem_info = session.get_input_memory_infos()[0]
 
-    x_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(x, DEVICE_NAME, DEVICE_INDEX)
-    y_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(y, DEVICE_NAME, DEVICE_INDEX)
+    x_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(x, 'gpu', device_id=mem_info.device_id, vendor_id=mem_info.device_vendor_id)
+    y_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(y, 'gpu', device_id=mem_info.device_id, vendor_id=mem_info.device_vendor_id)
 
     io_binding = session.io_binding()
-    io_binding.bind_input(name='x', device_type=x_ortvalue.device_name(), device_id=0, element_type=x.dtype, shape=x_ortvalue.shape(), buffer_ptr=x_ortvalue.data_ptr())
-    io_binding.bind_input(name='y', device_type=y_ortvalue.device_name(), device_id=0, element_type=y.dtype, shape=y_ortvalue.shape(), buffer_ptr=y_ortvalue.data_ptr())
-    io_binding.bind_output(name='z', device_type=DEVICE_NAME, device_id=DEVICE_INDEX, element_type=x.dtype, shape=x_ortvalue.shape())
+    io_binding.bind_input(name='x', device_type=x_ortvalue.device_name(), device_id=mem_info.device_id, element_type=x.dtype, shape=x_ortvalue.shape(), buffer_ptr=x_ortvalue.data_ptr())
+    io_binding.bind_input(name='y', device_type=y_ortvalue.device_name(), device_id=mem_info.device_id, element_type=y.dtype, shape=y_ortvalue.shape(), buffer_ptr=y_ortvalue.data_ptr())
+    io_binding.bind_output(name='z', device_type=x_ortvalue.device_name(), device_id=mem_info.device_id, element_type=x.dtype, shape=x_ortvalue.shape())
    session.run_with_iobinding(io_binding)
 
     z = io_binding.get_outputs()
 
     return z[0]
 
 # Run the model on device consuming and producing native PyTorch tensors
-def run_with_torch_tensors_on_device(x: torch.Tensor, y: torch.Tensor, np_type: np.dtype = np.float32, torch_type: torch.dtype = torch.float32) -> torch.Tensor:
+def run_with_torch_tensors_on_device(x: torch.Tensor, y: torch.Tensor, np_type: np.dtype = np.float32, torch_type: torch.dtype = torch.float32, dlpack=False) -> torch.Tensor:
     session = create_session(MODEL_FILE)
+    mem_info = session.get_input_memory_infos()[0]
 
     binding = session.io_binding()
 
@@ -73,48 +75,69 @@ def run_with_torch_tensors_on_device(x: torch.Tensor, y: torch.Tensor, np_type:
 
     binding.bind_input(
         name='x',
-        device_type=DEVICE_NAME,
-        device_id=DEVICE_INDEX,
+        device_type="gpu",
+        device_id=mem_info.device_id,
         element_type=np_type,
         shape=tuple(x_tensor.shape),
         buffer_ptr=x_tensor.data_ptr(),
     )
 
     binding.bind_input(
         name='y',
-        device_type=DEVICE_NAME,
-        device_id=DEVICE_INDEX,
+        device_type="gpu",
+        device_id=mem_info.device_id,
         element_type=np_type,
         shape=tuple(y_tensor.shape),
         buffer_ptr=y_tensor.data_ptr(),
     )
-
-    ## Allocate the PyTorch tensor for the model output
-    z_tensor = torch.empty(x_tensor.shape, dtype=torch_type, device=DEVICE).contiguous()
-    binding.bind_output(
-        name='z',
-        device_type=DEVICE_NAME,
-        device_id=DEVICE_INDEX,
-        element_type=np_type,
-        shape=tuple(z_tensor.shape),
-        buffer_ptr=z_tensor.data_ptr(),
-    )
+    if dlpack:
+        binding.bind_output(
+            name='z',
+            device_type="gpu",
+        )
+    else:
+        ## Allocate the PyTorch tensor for the model output
+        z_tensor = torch.empty(x_tensor.shape, dtype=torch_type, device=DEVICE).contiguous()
+        binding.bind_output(
+            name='z',
+            device_type="gpu",
+            device_id=mem_info.device_id,
+            element_type=np_type,
+            shape=tuple(z_tensor.shape),
+            buffer_ptr=z_tensor.data_ptr(),
+        )
 
     session.run_with_iobinding(binding)
-
-    return z_tensor
+    if dlpack:
+        from onnxruntime.capi import _pybind_state as C
+        outputs = binding.get_outputs()
+        return torch.tensor(C.OrtValue.from_dlpack(outputs[0]._ortvalue.to_dlpack(), False))
+    else:
+        return z_tensor
 
 
 def main():
-    create_model()
+    # check if plugin based providers are available and register them
+    ort_capi_dir = os.path.dirname(onnxruntime.capi.__file__)
+    for p in os.listdir(ort_capi_dir):
+        match = re.match(r".*onnxruntime_providers_(.*)\."+LIB_EXT, p)
+        if match is not None:
+            ep_name = match.group(1)
+            if ep_name == 'shared': continue
+            onnxruntime.register_execution_provider_library(ep_name, os.path.join(ort_capi_dir, p))
+            print(f"Registered execution provider {ep_name} with library: {p}")
 
-    print(run(x=np.float32([1.0, 2.0, 3.0]),y=np.float32([4.0, 5.0, 6.0])))
-    # [array([5., 7., 9.], dtype=float32)]
+    create_model()
 
     print(run_with_data_on_device(x=np.float32([1.0, 2.0, 3.0, 4.0, 5.0]), y=np.float32([1.0, 2.0, 3.0, 4.0, 5.0])).numpy())
     # [ 2.  4.  6.  8. 10.]
 
-    print(run_with_torch_tensors_on_device(torch.rand(5).to(DEVICE), torch.rand(5).to(DEVICE)))
+    x = torch.rand(5).to(DEVICE)
+    y = torch.rand(5).to(DEVICE)
+    print(run_with_torch_tensors_on_device(x, y, dlpack=True))
+    # tensor([0.7023, 1.3127, 1.7289, 0.3982, 0.8386])
+
+    print(run_with_torch_tensors_on_device(x, y, dlpack=False))
     # tensor([0.7023, 1.3127, 1.7289, 0.3982, 0.8386])
 
     create_model(torch.int64)
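For reference, the DLPack capsule produced by `outputs[0]._ortvalue.to_dlpack()` above can also be consumed with `torch.utils.dlpack` instead of the private `onnxruntime.capi._pybind_state` binding. A minimal alternative sketch (not part of this commit):

```
# Alternative sketch: wrap the bound output in a torch tensor via torch.utils.dlpack.
from torch.utils import dlpack as torch_dlpack

outputs = binding.get_outputs()
z = torch_dlpack.from_dlpack(outputs[0]._ortvalue.to_dlpack())
```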

python/api/getting_started.py

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
# A set of code samples showing different usage of the ONNX Runtime Python API
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import numpy as np
import torch
import onnxruntime

MODEL_FILE = '.model.onnx'
DEVICE_NAME = 'cuda' if torch.cuda.is_available() else 'cpu'

# A simple model to calculate addition of two tensors
def model():
    class Model(torch.nn.Module):
        def __init__(self):
            super(Model, self).__init__()

        def forward(self, x, y):
            return x.add(y)

    return Model()

# Create an instance of the model and export it to ONNX graph format, with dynamic size for the data
def create_model(type: torch.dtype = torch.float32):
    sample_x = torch.ones(3, dtype=type)
    sample_y = torch.zeros(3, dtype=type)

    torch.onnx.export(model(), (sample_x, sample_y), MODEL_FILE, input_names=["x", "y"], output_names=["z"],
                      dynamic_axes={"x": {0: "array_length_x"}, "y": {0: "array_length_y"}})
    return MODEL_FILE

# Create an ONNX Runtime session with the provided model
def create_session(model: str) -> onnxruntime.InferenceSession:
    providers = ['CPUExecutionProvider']
    if torch.cuda.is_available():
        providers.insert(0, 'CUDAExecutionProvider')
    return onnxruntime.InferenceSession(model, providers=providers)

# Run the model on CPU consuming and producing numpy arrays
def run(x: np.array, y: np.array) -> np.array:
    session = create_session(MODEL_FILE)

    z = session.run(["z"], {"x": x, "y": y})

    return z[0]

def main():
    create_model()

    print(run(x=np.float32([1.0, 2.0, 3.0]), y=np.float32([4.0, 5.0, 6.0])))
    # [array([5., 7., 9.], dtype=float32)]

if __name__ == "__main__":
    main()
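After installing the dependencies listed in `requirements.txt`, the sample can be run directly; the expected output matches the comment in `main()`:

```
python getting_started.py
# [array([5., 7., 9.], dtype=float32)]
```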

python/api/requirements.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
numpy
torch
onnx
--extra-index-url https://download.pytorch.org/whl/cu128

0 commit comments
