[aoti-et] Store symbols from dlopen into AOTIDelegateHandle (#15172)

larryliu0820 · web-flow · commit 2bea318430e2 · 2025-10-15T16:54:53.000-07:00
This pull request refactors how function pointers for AOTI model
container operations are managed and loaded in the CUDA backend. Instead
of relying on global/static function pointers, function pointers are now
stored per-instance in the `AOTIDelegateHandle` structure. This change
enables safe handling of multiple shared libraries within the same
process and improves encapsulation and maintainability.

**Refactoring function pointer management:**

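* Removed global function pointers for AOTI model container operations
from `aoti_model_container.cpp` and `aoti_model_container.h` (the header
is now `aoti_delegate_handle.h`), and moved them into the
`AOTIDelegateHandle` struct as per-instance members. Only the five
pointers loaded by the CUDA backend (`create_with_device`,
`delete_container`, `get_num_inputs`, `get_num_outputs`, `run`) are stored
on the handle; the `AOTInductorModelContainerGetInputName` and
`AOTInductorModelContainerGetNumConstants` globals are removed without a
per-handle replacement.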
[[1]](diffhunk://#diff-32ff58ae0581446607da6874fa62b366ba18bcff4d621b16987fda78312244a6L1-L39)
[[2]](diffhunk://#diff-84caca41e72ad693665c930ab7d0c31e05f64b268f4d7ac37c17869149fad0c7L63-L92)
[[3]](diffhunk://#diff-84caca41e72ad693665c930ab7d0c31e05f64b268f4d7ac37c17869149fad0c7R83-R89)

**CUDA backend updates:**

* Updated the CUDA backend (`cuda_backend.cpp`) to load function
pointers into each `AOTIDelegateHandle` instance using a new
`load_function_pointers_into_handle` method, replacing the previous
global symbol registration logic (`register_shared_library_functions`),
including the re-registration that was previously performed before each
execution. All calls to model container functions now use the handle's
member function pointers.
[[1]](diffhunk://#diff-a4b17eccf1aa933837671c5184e02bc815d934a362344bb2b17b789cdfaa5375L31-R35)
[[2]](diffhunk://#diff-a4b17eccf1aa933837671c5184e02bc815d934a362344bb2b17b789cdfaa5375L60-R84)
[[3]](diffhunk://#diff-a4b17eccf1aa933837671c5184e02bc815d934a362344bb2b17b789cdfaa5375L138-L150)
[[4]](diffhunk://#diff-a4b17eccf1aa933837671c5184e02bc815d934a362344bb2b17b789cdfaa5375L168-R194)
[[5]](diffhunk://#diff-a4b17eccf1aa933837671c5184e02bc815d934a362344bb2b17b789cdfaa5375L264-R277)

**Build system adjustments:**

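* Removed `aoti_model_container.cpp` from the build targets and library
sources, as global function pointer definitions are no longer needed. In
`targets.bzl` the `model_container` target is renamed to `delegate_handle`,
drops its `srcs`, and exports `aoti_delegate_handle.h`; `aoti_common` now
depends on `:delegate_handle`.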
[[1]](diffhunk://#diff-c95a0b47f516c30f4b2e384b88c94c088d1031e6df7af66678a6fc9d3fb1a1a5L29-L31)
[[2]](diffhunk://#diff-c3d5933d211acc568c9bdf8e08d0ca99b01e50bca113307fbab4cbc4018fdf55L29-R29)
diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt
@@ -26,7 +26,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
 # Common AOTI functionality - combines all AOTI common components
-set(_aoti_common_sources aoti_model_container.cpp common_shims.cpp)
+set(_aoti_common_sources common_shims.cpp)
 add_library(aoti_common STATIC ${_aoti_common_sources})
 target_include_directories(
   aoti_common
diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h
@@ -60,36 +60,17 @@ using AOTInductorModelContainerRunFunc = AOTIRuntimeError (*)(
     AOTInductorStreamHandle stream_handle,
     AOTIProxyExecutorHandle proxy_executor_handle);
 
-// Global function pointers (will be loaded dynamically)
-extern AOTInductorModelContainerCreateWithDeviceFunc
-    AOTInductorModelContainerCreateWithDevice;
-extern AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete;
-extern AOTInductorModelContainerGetNumInputsFunc
-    AOTInductorModelContainerGetNumInputs;
-extern AOTInductorModelContainerGetNumOutputsFunc
-    AOTInductorModelContainerGetNumOutputs;
-extern AOTInductorModelContainerRunFunc AOTInductorModelContainerRun;
-
 // Retrieves the name of an input tensor by index from the AOTI model container.
-// Needed by Metal backend
 using AOTInductorModelContainerGetInputNameFunc = AOTIRuntimeError (*)(
     AOTInductorModelContainerHandle container_handle,
     size_t input_idx,
     const char** input_name);
 
 // Retrieves the number of constants from the AOTI model container.
-// Needed by Metal backend
 using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)(
     AOTInductorModelContainerHandle container_handle,
     size_t* num_constants);
 
-// Global function pointers (will be loaded dynamically).
-// Needed by Metal backend
-extern AOTInductorModelContainerGetInputNameFunc
-    AOTInductorModelContainerGetInputName;
-extern AOTInductorModelContainerGetNumConstantsFunc
-    AOTInductorModelContainerGetNumConstants;
-
 } // extern "C"
 
 // AOTI Delegate Handle structure
@@ -99,6 +80,13 @@ struct AOTIDelegateHandle {
   AOTInductorModelContainerHandle container_handle;
   void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header
                      // dependency
+
+  // Function pointers specific to this handle's shared library
+  AOTInductorModelContainerCreateWithDeviceFunc create_with_device;
+  AOTInductorModelContainerDeleteFunc delete_container;
+  AOTInductorModelContainerGetNumInputsFunc get_num_inputs;
+  AOTInductorModelContainerGetNumOutputsFunc get_num_outputs;
+  AOTInductorModelContainerRunFunc run;
 };
 
 } // namespace aoti
diff --git a/backends/aoti/aoti_model_container.cpp b/backends/aoti/aoti_model_container.cpp
diff --git a/backends/aoti/targets.bzl b/backends/aoti/targets.bzl
@@ -25,12 +25,9 @@ def define_common_targets():
 
     # AOTI model container functionality
     runtime.cxx_library(
-        name = "model_container",
-        srcs = [
-            "aoti_model_container.cpp",
-        ],
+        name = "delegate_handle",
         headers = [
-            "aoti_model_container.h",
+            "aoti_delegate_handle.h",
         ],
         # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
         link_whole = True,
@@ -44,7 +41,7 @@ def define_common_targets():
         ],
     )
 
-    # Common AOTI functionality (combining both common_shims and model_container)
+    # Common AOTI functionality (combining both common_shims and delegate_handle)
     runtime.cxx_library(
         name = "aoti_common",
         # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
@@ -53,6 +50,6 @@ def define_common_targets():
         visibility = ["@EXECUTORCH_CLIENTS"],
         exported_deps = [
             ":common_shims",
-            ":model_container",
+            ":delegate_handle",
         ],
     )
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
@@ -21,18 +21,18 @@
 #include <vector>
 
 // Include our shim layer headers
-#include <executorch/backends/aoti/aoti_model_container.h>
+#include <executorch/backends/aoti/aoti_delegate_handle.h>
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/utils.h>
 
 namespace executorch::backends::cuda {
 
-#define LOAD_SYMBOL(name, handle)                                \
-  do {                                                           \
-    name = reinterpret_cast<name##Func>(dlsym(handle, #name));   \
-    ET_CHECK_OR_RETURN_ERROR(                                    \
-        name != nullptr, AccessFailed, "Failed to load " #name); \
+#define LOAD_SYMBOL(handle, member, name, so_handle)                        \
+  do {                                                                      \
+    handle->member = reinterpret_cast<name##Func>(dlsym(so_handle, #name)); \
+    ET_CHECK_OR_RETURN_ERROR(                                               \
+        handle->member != nullptr, AccessFailed, "Failed to load " #name);  \
   } while (0)
 
 using namespace std;
@@ -57,12 +57,31 @@ using executorch::runtime::etensor::Tensor;
 class ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
  private:
-  Error register_shared_library_functions(void* so_handle) const {
-    LOAD_SYMBOL(AOTInductorModelContainerCreateWithDevice, so_handle);
-    LOAD_SYMBOL(AOTInductorModelContainerDelete, so_handle);
-    LOAD_SYMBOL(AOTInductorModelContainerGetNumInputs, so_handle);
-    LOAD_SYMBOL(AOTInductorModelContainerGetNumOutputs, so_handle);
-    LOAD_SYMBOL(AOTInductorModelContainerRun, so_handle);
+  Error load_function_pointers_into_handle(
+      void* so_handle,
+      AOTIDelegateHandle* handle) const {
+    LOAD_SYMBOL(
+        handle,
+        create_with_device,
+        AOTInductorModelContainerCreateWithDevice,
+        so_handle);
+
+    LOAD_SYMBOL(
+        handle, delete_container, AOTInductorModelContainerDelete, so_handle);
+
+    LOAD_SYMBOL(
+        handle,
+        get_num_inputs,
+        AOTInductorModelContainerGetNumInputs,
+        so_handle);
+
+    LOAD_SYMBOL(
+        handle,
+        get_num_outputs,
+        AOTInductorModelContainerGetNumOutputs,
+        so_handle);
+
+    LOAD_SYMBOL(handle, run, AOTInductorModelContainerRun, so_handle);
 
     return Error::Ok;
   }
@@ -135,19 +154,22 @@ class ET_EXPERIMENTAL CudaBackend final
 
     processed->Free();
 
-    // Register all shared library functions
-    ET_CHECK_OK_OR_RETURN_ERROR(register_shared_library_functions(so_handle));
+    // Create handle and load function pointers into it
+    AOTIDelegateHandle* handle = new AOTIDelegateHandle();
+    handle->so_handle = so_handle;
+    handle->so_path = so_path.string();
+
+    // Load function pointers specific to this handle's shared library
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        load_function_pointers_into_handle(so_handle, handle));
 
     AOTInductorModelContainerHandle container_handle = nullptr;
 
-    ET_CHECK_OK_OR_RETURN_ERROR(AOTInductorModelContainerCreateWithDevice(
-        &container_handle, 1, "cuda", nullptr));
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        handle->create_with_device(&container_handle, 1, "cuda", nullptr));
 
     ET_LOG(Info, "container_handle = %p", container_handle);
 
-    AOTIDelegateHandle* handle = new AOTIDelegateHandle();
-    handle->so_handle = so_handle;
-    handle->so_path = so_path.string();
     handle->container_handle = container_handle;
 
     // Create a CUDA stream for asynchronous execution
@@ -165,20 +187,11 @@ class ET_EXPERIMENTAL CudaBackend final
       Span<EValue*> args) const override {
     AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
 
-    // Need to re-register all the symbols from the so_handle hosted by this
-    // CudaBackend instance. The reason is that these symbols are
-    // static/singleton across the whole process. When we share multiple methods
-    // (meaning multiple so_handle) in the same process, we need to re-register
-    // the symbols from the so_handle that is being used in this execution.
-    ET_CHECK_OK_OR_RETURN_ERROR(
-        register_shared_library_functions(handle->so_handle));
-
     size_t n_inputs;
-    AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs);
+    handle->get_num_inputs(handle->container_handle, &n_inputs);
 
     size_t n_outputs;
-    AOTInductorModelContainerGetNumOutputs(
-        handle->container_handle, &n_outputs);
+    handle->get_num_outputs(handle->container_handle, &n_outputs);
 
     ET_CHECK_OR_RETURN_ERROR(
         n_inputs + n_outputs == args.size(),
@@ -261,7 +274,7 @@ class ET_EXPERIMENTAL CudaBackend final
       gpu_outputs[i] = gpu_output_handle;
     }
     // Run AOTI container with GPU tensors
-    AOTIRuntimeError error = AOTInductorModelContainerRun(
+    AOTIRuntimeError error = handle->run(
         handle->container_handle,
         gpu_inputs.data(), // Use GPU input tensors
         n_inputs,