Add .ptd support to portable executor runner (pytorch#14833)
This pull request adds support for loading and using `.ptd` (portable tensor data) files to the `executor_runner` example. This lets the runner ingest tensor data that is serialized separately from the `.pte` program. The changes cover both the build configuration and the main runner logic.
**Support for .ptd file loading and usage:**
* Added a new command-line flag `data_path` to specify the path to a
`.ptd` data file in `executor_runner.cpp` and integrated logic to load
this file and parse its contents using `FlatTensorDataMap`.
* Updated the runner to pass the loaded tensor data map to the model
method loader, allowing methods to resolve tensor data stored externally
in the `.ptd` file.
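As a sketch of how these pieces fit together (API names are assumed from ExecuTorch's `FileDataLoader` and `FlatTensorDataMap` extensions, and error handling is abbreviated; this is not the literal diff):

```cpp
// Sketch only: wiring a .ptd file into the runner.
#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>

using executorch::extension::FileDataLoader;
using executorch::extension::FlatTensorDataMap;

// If --data_path was given, open the .ptd file and parse it into a
// named data map. The loader must outlive the data map.
auto ptd_loader = FileDataLoader::from(FLAGS_data_path.c_str());
ET_CHECK_MSG(ptd_loader.ok(), "Failed to open %s", FLAGS_data_path.c_str());
auto data_map = FlatTensorDataMap::load(&ptd_loader.get());
ET_CHECK_MSG(data_map.ok(), "Failed to parse %s", FLAGS_data_path.c_str());

// Pass the data map through load_method() so the method can resolve
// externally stored tensors by name.
Result<Method> method = program->load_method(
    method_name, &memory_manager, /*event_tracer=*/nullptr, &data_map.get());
```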
**Build and dependency updates:**
* Included `flat_tensor_data_map` as a dependency in both the Bazel
build targets and CMake build configuration to ensure the new
functionality is available during compilation.
* Added the necessary header include for `flat_tensor_data_map` in
`executor_runner.cpp` and updated the relevant namespace usage.
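On the CMake side, the dependency change might look roughly like this (the `extension_flat_tensor` library name is an assumption based on the ExecuTorch tree, not copied from the diff):

```cmake
# Hypothetical fragment: link the flat tensor extension, which provides
# FlatTensorDataMap, into the runner. The real diff may use a different
# target name.
target_link_libraries(executor_runner PRIVATE extension_flat_tensor)
```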
## Test Plan:
Tested with a `.pte` model and a `.ptd` data file on the CUDA backend. First, export the model:
```
python -m executorch.examples.cuda.scripts.export --model_name linear --output_dir ./
```
Confirm that `linear.pte` and `aoti_cuda_blob.ptd` were produced.
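A quick shell check (filenames taken from the export step above) confirms both artifacts exist before building:

```shell
# Check that the export step produced both artifacts.
for f in linear.pte aoti_cuda_blob.ptd; do
  if [ -f "$f" ]; then
    echo "found: $f"
  else
    echo "missing: $f"
  fi
done
```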
Build the executor runner with the following options:
```
cmake -DCMAKE_BUILD_TYPE=Debug \
  -DEXECUTORCH_BUILD_CUDA=ON \
  -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
  -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
  -S. -Bcmake-out
```
Then build:
```
cmake --build cmake-out -j8
```
Then we can run:
```
cmake-out/executor_runner --model_path linear.pte --ptd_path aoti_cuda_blob.ptd
I 00:00:00.000594 executorch:executor_runner.cpp:189] PTD file aoti_cuda_blob.ptd is loaded.
I 00:00:00.000671 executorch:executor_runner.cpp:199] PTD data map created with 1 keys.
I 00:00:00.000749 executorch:executor_runner.cpp:249] Model file linear.pte is loaded.
I 00:00:00.000758 executorch:executor_runner.cpp:258] Using method forward
I 00:00:00.000770 executorch:executor_runner.cpp:309] Setting up planned buffer 0, size 96.
I 00:00:00.002908 executorch:cuda_backend.cpp:140] Writing 394624 bytes to /tmp/linear_so_blob844427.so
I 00:00:00.324783 executorch:cuda_backend.cpp:174] container_handle = 0x26a71b0
I 00:00:00.324867 executorch:executor_runner.cpp:337] Method loaded.
I 00:00:00.325796 executorch:cuda_backend.cpp:249] Inputs copied to GPU
I 00:00:00.325829 executorch:cuda_backend.cpp:278] Outputs created on GPU
E 00:00:00.326623 executorch:memory.cpp:286] Cannot delete null tensor
I 00:00:00.326678 executorch:executor_runner.cpp:374] Model executed successfully 1 time(s) in 1.777041 ms.
I 00:00:00.326691 executorch:executor_runner.cpp:383] 1 outputs:
Output 0: tensor(sizes=[3, 3], [-0.199237, 0.550725, 0.0830356, -0.199237, 0.550725, 0.0830356, -0.199237, 0.550725, 0.0830356])
E 00:00:00.328474 executorch:memory.cpp:299] Didn't find tensor 0x699a3d0
```