Skip to content

Commit 518f134

Browse files
committed
use ptd pipeline on .so file
1 parent e5dc5ef commit 518f134

File tree

7 files changed

+209
-26
lines changed

7 files changed

+209
-26
lines changed

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
107107
set(CMAKE_SKIP_BUILD_RPATH OFF)
108108
# Don't use the install-rpath during the build phase
109109
set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
110+
110111
# Automatically add all linked folders that are NOT in the build directory to
111112
# the rpath (per library?)
112113
#
@@ -984,6 +985,11 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
984985
extension_runner_util gflags executorch_backends
985986
)
986987

988+
# Add flat tensor extension if it's built
989+
if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
990+
list(APPEND _executor_runner_libs extension_flat_tensor)
991+
endif()
992+
987993
if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
988994
list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
989995
elseif(EXECUTORCH_BUILD_CADENCE)

backends/aoti/aoti_backend.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@
77
import contextlib
88
import copy
99
import os
10-
import shutil
1110
import typing
1211

1312
from subprocess import check_call
1413
from typing import Any, Dict, final, List, Optional, Set
1514

1615
import torch
16+
from executorch.exir._serialize._named_data_store import NamedDataStore
1717
from executorch.exir.backend.backend_details import (
1818
BackendDetails,
1919
ExportedProgram,
@@ -72,6 +72,7 @@ def preprocess(
7272
compile_specs: List[CompileSpec],
7373
) -> PreprocessResult:
7474
print("entering the lowerable parts in AotiBackend.preprocess....")
75+
named_data_store = NamedDataStore()
7576

7677
# print("here", edge_program.example_inputs)
7778
copy_edge_program = copy.deepcopy(edge_program)
@@ -88,6 +89,7 @@ def preprocess(
8889
options: dict[str, typing.Any] = {
8990
"aot_inductor.package_constants_in_so": True,
9091
"aot_inductor.output_path": output_path,
92+
"aot_inductor.force_mmap_weights": False,
9193
"max_autotune": True,
9294
"max_autotune_gemm_backends": "TRITON",
9395
"max_autotune_conv_backends": "TRITON",
@@ -111,4 +113,13 @@ def preprocess(
111113

112114
print("so_path", so_path)
113115

114-
return PreprocessResult(so_path.encode("utf-8"))
116+
with open(so_path, "rb") as f:
117+
so_data = f.read()
118+
119+
named_data_store.add_named_data("so_blob", so_data, 1, "aoti_cuda_blob")
120+
121+
return PreprocessResult(
122+
processed_bytes=b"",
123+
debug_handle_map={},
124+
data_store_output=named_data_store.get_named_data_store_output(),
125+
)

backends/aoti/runtime/aoti_backend.cpp

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ using executorch::runtime::Error;
5151
using executorch::runtime::EValue;
5252
using executorch::runtime::FreeableBuffer;
5353
using executorch::runtime::MemoryAllocator;
54+
using executorch::runtime::NamedDataMap;
5455
using executorch::runtime::Result;
5556
using executorch::runtime::Span;
5657
using executorch::runtime::etensor::Tensor;
@@ -69,15 +70,34 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
6970
// Once per loaded binary blob
7071
Result<DelegateHandle*> init(
7172
BackendInitContext& context,
72-
FreeableBuffer* processed, // This will be the buffer from aoti_backend
73+
FreeableBuffer* processed, // This will be an empty buffer
7374
ArrayRef<CompileSpec> compile_specs // This will be my empty list
7475
) const override {
75-
const char* so_path = static_cast<const char*>(processed->data());
76+
// const char* so_path = static_cast<const char*>(processed->data());
7677

77-
printf("so path: %s\n", so_path);
78+
// printf("so path: %s\n", so_path);
79+
80+
const NamedDataMap* named_data_map = context.get_named_data_map();
81+
82+
std::string so_path = "/tmp/test.so";
83+
std::string so_blob_key = "so_blob";
84+
85+
Result<FreeableBuffer> aoti_cuda_buffer =
86+
named_data_map->get_data(aoti_cuda_blob_name.c_str());
87+
88+
// Create the output file at so_path (a fixed path, not a unique temporary file)
89+
std::ofstream outfile(so_path.c_str(), std::ios::binary);
90+
91+
// Write the ELF buffer to the temporary file
92+
outfile.write(
93+
(char*)aoti_cuda_buffer->data(),
94+
sizeof(void*) * aoti_cuda_buffer->size());
95+
96+
// Finish writing the file to disk
97+
outfile.close();
7898

7999
// Load the ELF using dlopen
80-
void* so_handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL);
100+
void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
81101
if (so_handle == nullptr) {
82102
std::cout << dlerror() << std::endl;
83103
return Error::AccessFailed;

examples/portable/executor_runner/executor_runner.cpp

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
#include <executorch/extension/data_loader/file_data_loader.h>
2828
#include <executorch/extension/evalue_util/print_evalue.h>
29+
#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
2930
#include <executorch/extension/runner_util/inputs.h>
3031
#include <executorch/runtime/core/event_tracer.h>
3132
#include <executorch/runtime/executor/method.h>
@@ -50,6 +51,10 @@ DEFINE_string(
5051
model_path,
5152
"model.pte",
5253
"Model serialized in flatbuffer format.");
54+
DEFINE_string(
55+
data_path,
56+
"",
57+
"Path to external tensor data file (.ptd format). Optional.");
5358
DEFINE_uint32(num_executions, 1, "Number of times to run the model.");
5459
#ifdef ET_EVENT_TRACER_ENABLED
5560
DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path.");
@@ -60,6 +65,7 @@ DEFINE_int32(
6065
"Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
6166

6267
using executorch::extension::FileDataLoader;
68+
using executorch::extension::FlatTensorDataMap;
6369
using executorch::runtime::Error;
6470
using executorch::runtime::EValue;
6571
using executorch::runtime::EventTracer;
@@ -242,8 +248,43 @@ int main(int argc, char** argv) {
242248
// be used by a single thread at a time, but it can be reused.
243249
//
244250
EventTraceManager tracer;
251+
252+
// Handle optional external tensor data loading
253+
std::unique_ptr<FileDataLoader> data_loader;
254+
std::unique_ptr<FlatTensorDataMap> data_map;
255+
256+
if (!FLAGS_data_path.empty()) {
257+
ET_LOG(
258+
Info, "Loading external tensor data from %s", FLAGS_data_path.c_str());
259+
260+
// Create FileDataLoader for the PTD file
261+
Result<FileDataLoader> data_loader_result =
262+
FileDataLoader::from(FLAGS_data_path.c_str());
263+
ET_CHECK_MSG(
264+
data_loader_result.ok(),
265+
"Failed to create FileDataLoader for data path %s: 0x%" PRIx32,
266+
FLAGS_data_path.c_str(),
267+
(uint32_t)data_loader_result.error());
268+
269+
data_loader =
270+
std::make_unique<FileDataLoader>(std::move(data_loader_result.get()));
271+
272+
// Create FlatTensorDataMap from the loaded blob
273+
Result<FlatTensorDataMap> data_map_result =
274+
FlatTensorDataMap::load(data_loader.get());
275+
ET_CHECK_MSG(
276+
data_map_result.ok(),
277+
"Failed to load FlatTensorDataMap from %s: 0x%" PRIx32,
278+
FLAGS_data_path.c_str(),
279+
(uint32_t)data_map_result.error());
280+
281+
data_map =
282+
std::make_unique<FlatTensorDataMap>(std::move(data_map_result.get()));
283+
ET_LOG(Info, "External tensor data loaded successfully");
284+
}
285+
245286
Result<Method> method = program->load_method(
246-
method_name, &memory_manager, tracer.get_event_tracer());
287+
method_name, &memory_manager, tracer.get_event_tracer(), data_map.get());
247288
ET_CHECK_MSG(
248289
method.ok(),
249290
"Loading of method %s failed with status 0x%" PRIx32,

examples/portable/executor_runner/targets.bzl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def define_common_targets():
1919
"//executorch/devtools/etdump:etdump_flatcc",
2020
"//executorch/extension/data_loader:file_data_loader",
2121
"//executorch/extension/evalue_util:print_evalue",
22+
"//executorch/extension/flat_tensor:flat_tensor_data_map",
2223
"//executorch/extension/runner_util:inputs",
2324
],
2425
external_deps = [
@@ -38,6 +39,7 @@ def define_common_targets():
3839
"//executorch/runtime/executor:program",
3940
"//executorch/extension/data_loader:file_data_loader",
4041
"//executorch/extension/evalue_util:print_evalue",
42+
"//executorch/extension/flat_tensor:flat_tensor_data_map",
4143
"//executorch/extension/runner_util:inputs",
4244
"//executorch/extension/threadpool:cpuinfo_utils",
4345
"//executorch/extension/threadpool:threadpool",

export_and_run_aoti.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ build_runtime() {
141141
-DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
142142
-DEXECUTORCH_LOG_LEVEL=Debug \
143143
-DCMAKE_BUILD_TYPE=Debug \
144+
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
145+
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
144146
..
145147
else
146148
echo "Building with release configuration..."
@@ -149,6 +151,8 @@ build_runtime() {
149151
-DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
150152
-DEXECUTORCH_LOG_LEVEL=Info \
151153
-DCMAKE_BUILD_TYPE=Release \
154+
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
155+
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
152156
..
153157
fi
154158

@@ -158,7 +162,7 @@ build_runtime() {
158162

159163
run_inference() {
160164
echo "Running executor_runner with debug logging enabled..."
161-
./cmake-out/executor_runner --model_path aoti_model.pte
165+
./cmake-out/executor_runner --model_path aoti_model.pte --data_path aoti_cuda_blob.ptd
162166
}
163167

164168
compare_outputs() {

0 commit comments

Comments
 (0)