use ptd pipeline on .so file

Gasoonjia · Gasoonjia · commit 518f1345ba97 · 2025-09-09T16:32:38.000-07:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -107,6 +107,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_SKIP_BUILD_RPATH OFF)
 # Don't use the install-rpath during the build phase
 set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
+
 # Automatically add all linked folders that are NOT in the build directory to
 # the rpath (per library?)
 #
@@ -984,6 +985,11 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
                             extension_runner_util gflags executorch_backends
   )
 
+  # Add flat tensor extension if it's built
+  if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
+    list(APPEND _executor_runner_libs extension_flat_tensor)
+  endif()
+
   if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
     list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
   elseif(EXECUTORCH_BUILD_CADENCE)
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
@@ -7,13 +7,13 @@
 import contextlib
 import copy
 import os
-import shutil
 import typing
 
 from subprocess import check_call
 from typing import Any, Dict, final, List, Optional, Set
 
 import torch
+from executorch.exir._serialize._named_data_store import NamedDataStore
 from executorch.exir.backend.backend_details import (
     BackendDetails,
     ExportedProgram,
@@ -72,6 +72,7 @@ def preprocess(
         compile_specs: List[CompileSpec],
     ) -> PreprocessResult:
         print("entering  the lowerable parts in AotiBackend.preprocess....")
+        named_data_store = NamedDataStore()
 
         # print("here", edge_program.example_inputs)
         copy_edge_program = copy.deepcopy(edge_program)
@@ -88,6 +89,7 @@ def preprocess(
         options: dict[str, typing.Any] = {
             "aot_inductor.package_constants_in_so": True,
             "aot_inductor.output_path": output_path,
+            "aot_inductor.force_mmap_weights": False,
             "max_autotune": True,
             "max_autotune_gemm_backends": "TRITON",
             "max_autotune_conv_backends": "TRITON",
@@ -111,4 +113,13 @@ def preprocess(
 
         print("so_path", so_path)
 
-        return PreprocessResult(so_path.encode("utf-8"))
+        with open(so_path, "rb") as f:
+            so_data = f.read()
+
+        named_data_store.add_named_data("so_blob", so_data, 1, "aoti_cuda_blob")
+
+        return PreprocessResult(
+            processed_bytes=b"",
+            debug_handle_map={},
+            data_store_output=named_data_store.get_named_data_store_output(),
+        )
diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp
@@ -51,6 +51,7 @@ using executorch::runtime::Error;
 using executorch::runtime::EValue;
 using executorch::runtime::FreeableBuffer;
 using executorch::runtime::MemoryAllocator;
+using executorch::runtime::NamedDataMap;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;
@@ -69,15 +70,34 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
   // Once per loaded binary blob
   Result<DelegateHandle*> init(
       BackendInitContext& context,
-      FreeableBuffer* processed, // This will be the buffer from aoti_backend
+      FreeableBuffer* processed, // This will be an empty buffer
       ArrayRef<CompileSpec> compile_specs // This will be my empty list
   ) const override {
-    const char* so_path = static_cast<const char*>(processed->data());
+    // const char* so_path = static_cast<const char*>(processed->data());
 
-    printf("so path: %s\n", so_path);
+    // printf("so path: %s\n", so_path);
+
+    // Look up the AOTI shared object that preprocess() stored under "so_blob".
+    const NamedDataMap* named_data_map = context.get_named_data_map();
+    std::string so_path = "/tmp/test.so";
+    std::string so_blob_key = "so_blob";
+    Result<FreeableBuffer> aoti_cuda_buffer =
+        named_data_map->get_data(so_blob_key.c_str());
+    if (!aoti_cuda_buffer.ok()) {
+      return aoti_cuda_buffer.error();
+    }
+
+    // Materialize the blob on disk because dlopen below takes a file path.
+    std::ofstream outfile(so_path.c_str(), std::ios::binary);
+    // FreeableBuffer::size() is already a byte count; do not scale it.
+    outfile.write(
+        static_cast<const char*>(aoti_cuda_buffer->data()),
+        aoti_cuda_buffer->size());
+    // Finish writing the file to disk
+    outfile.close();
 
     // Load the ELF using dlopen
-    void* so_handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL);
+    void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
     if (so_handle == nullptr) {
       std::cout << dlerror() << std::endl;
       return Error::AccessFailed;
diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
@@ -26,6 +26,7 @@
 
 #include <executorch/extension/data_loader/file_data_loader.h>
 #include <executorch/extension/evalue_util/print_evalue.h>
+#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
 #include <executorch/extension/runner_util/inputs.h>
 #include <executorch/runtime/core/event_tracer.h>
 #include <executorch/runtime/executor/method.h>
@@ -50,6 +51,10 @@ DEFINE_string(
     model_path,
     "model.pte",
     "Model serialized in flatbuffer format.");
+DEFINE_string(
+    data_path,
+    "",
+    "Path to external tensor data file (.ptd format). Optional.");
 DEFINE_uint32(num_executions, 1, "Number of times to run the model.");
 #ifdef ET_EVENT_TRACER_ENABLED
 DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path.");
@@ -60,6 +65,7 @@ DEFINE_int32(
     "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
 
 using executorch::extension::FileDataLoader;
+using executorch::extension::FlatTensorDataMap;
 using executorch::runtime::Error;
 using executorch::runtime::EValue;
 using executorch::runtime::EventTracer;
@@ -242,8 +248,43 @@ int main(int argc, char** argv) {
   // be used by a single thread at at time, but it can be reused.
   //
   EventTraceManager tracer;
+
+  // Handle optional external tensor data loading
+  std::unique_ptr<FileDataLoader> data_loader;
+  std::unique_ptr<FlatTensorDataMap> data_map;
+
+  if (!FLAGS_data_path.empty()) {
+    ET_LOG(
+        Info, "Loading external tensor data from %s", FLAGS_data_path.c_str());
+
+    // Create FileDataLoader for the PTD file
+    Result<FileDataLoader> data_loader_result =
+        FileDataLoader::from(FLAGS_data_path.c_str());
+    ET_CHECK_MSG(
+        data_loader_result.ok(),
+        "Failed to create FileDataLoader for data path %s: 0x%" PRIx32,
+        FLAGS_data_path.c_str(),
+        (uint32_t)data_loader_result.error());
+
+    data_loader =
+        std::make_unique<FileDataLoader>(std::move(data_loader_result.get()));
+
+    // Create FlatTensorDataMap from the loaded blob
+    Result<FlatTensorDataMap> data_map_result =
+        FlatTensorDataMap::load(data_loader.get());
+    ET_CHECK_MSG(
+        data_map_result.ok(),
+        "Failed to load FlatTensorDataMap from %s: 0x%" PRIx32,
+        FLAGS_data_path.c_str(),
+        (uint32_t)data_map_result.error());
+
+    data_map =
+        std::make_unique<FlatTensorDataMap>(std::move(data_map_result.get()));
+    ET_LOG(Info, "External tensor data loaded successfully");
+  }
+
   Result<Method> method = program->load_method(
-      method_name, &memory_manager, tracer.get_event_tracer());
+      method_name, &memory_manager, tracer.get_event_tracer(), data_map.get());
   ET_CHECK_MSG(
       method.ok(),
       "Loading of method %s failed with status 0x%" PRIx32,
diff --git a/examples/portable/executor_runner/targets.bzl b/examples/portable/executor_runner/targets.bzl
@@ -19,6 +19,7 @@ def define_common_targets():
             "//executorch/devtools/etdump:etdump_flatcc",
             "//executorch/extension/data_loader:file_data_loader",
             "//executorch/extension/evalue_util:print_evalue",
+            "//executorch/extension/flat_tensor:flat_tensor_data_map",
             "//executorch/extension/runner_util:inputs",
         ],
         external_deps = [
@@ -38,6 +39,7 @@ def define_common_targets():
             "//executorch/runtime/executor:program",
             "//executorch/extension/data_loader:file_data_loader",
             "//executorch/extension/evalue_util:print_evalue",
+            "//executorch/extension/flat_tensor:flat_tensor_data_map",
             "//executorch/extension/runner_util:inputs",
             "//executorch/extension/threadpool:cpuinfo_utils",
             "//executorch/extension/threadpool:threadpool",
diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh
@@ -141,6 +141,8 @@ build_runtime() {
               -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
               -DEXECUTORCH_LOG_LEVEL=Debug \
               -DCMAKE_BUILD_TYPE=Debug \
+              -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
               ..
     else
         echo "Building with release configuration..."
@@ -149,6 +151,8 @@ build_runtime() {
               -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
               -DEXECUTORCH_LOG_LEVEL=Info \
               -DCMAKE_BUILD_TYPE=Release \
+              -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
               ..
     fi
 
@@ -158,7 +162,7 @@ build_runtime() {
 
 run_inference() {
     echo "Running executor_runner with debug logging enabled..."
-    ./cmake-out/executor_runner --model_path aoti_model.pte
+    ./cmake-out/executor_runner --model_path aoti_model.pte --data_path aoti_cuda_blob.ptd
 }
 
 compare_outputs() {
diff --git a/export_aoti.py b/export_aoti.py