
Commit a90e907

Support weight sharing in PTE (#13941)

### Summary

1. Support weight sharing with a compile spec.
2. Add weight sharing support to the llama export script and runner.
3. Optimize llama performance.

1 parent 61b0070 commit a90e907

File tree

14 files changed: +642 -204 lines changed


backends/mediatek/preprocess.py

Lines changed: 101 additions & 3 deletions

```diff
@@ -4,14 +4,16 @@
 # except in compliance with the License. See the license file in the root
 # directory of this source tree for more details.

+import collections
 import contextlib
 import struct

-from typing import final, List
+from typing import Dict, final, List

 import mtk_converter
 import mtk_neuron
 import torch
+from executorch.exir._serialize._named_data_store import NamedDataStore
 from executorch.exir.backend.backend_details import (
     BackendDetails,
     ExportedProgram,
@@ -20,6 +22,9 @@
 from executorch.exir.backend.compile_spec_schema import CompileSpec

 SKIP_COMPILE_SPEC_KEYS = {"ImportForever"}
+EXTRACT_SHARED_BLOB_KEY = "ExtractSharedBlobKey"
+HEADER_SIZE = 13
+HEADER_VERSION = 1
 REQUIRED_COMPILE_SPEC_KEYS = {"platform-config"}
 SUPPORTED_PLATFORM_CONFIGS = {"mt6989", "mt6991"}

@@ -41,6 +46,21 @@ def assert_default_dim_order(edge_graph_module: torch.fx.GraphModule) -> None:
     )


+def _pack_header(num_inputs, num_outputs, model_bytes_size):
+    header_bytes = struct.pack(
+        "<BIII", HEADER_VERSION, num_inputs, num_outputs, model_bytes_size
+    )
+    assert len(header_bytes) == HEADER_SIZE
+    return header_bytes
+
+
+def _unpack_header(header_bytes):
+    assert len(header_bytes) == HEADER_SIZE
+    version, num_inputs, num_outputs, buffer_size = struct.unpack("<BIII", header_bytes)
+    assert version == HEADER_VERSION
+    return num_inputs, num_outputs, buffer_size
+
+
 @final
 class NeuropilotBackend(BackendDetails):

@@ -90,8 +110,14 @@ def preprocess(

         compile_options = ["--relax-fp32", "--opt=3"]
         for spec in module_compile_spec:
+            # Special compile spec handling
             if spec.key in SKIP_COMPILE_SPEC_KEYS:
                 continue
+            if spec.key == EXTRACT_SHARED_BLOB_KEY:
+                compile_options.append("--dla-opt=0")
+                continue
+
+            # General compile spec handling
             if spec.value == b"":
                 compile_options.append(f"--{spec.key}")
             else:
```
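
The new `_pack_header`/`_unpack_header` helpers formalize the 13-byte payload header that `preprocess` prepends to the compiled model: a little-endian `<BIII` record, i.e. one `uint8` version byte followed by three `uint32` fields for input count, output count, and model size. A minimal, standalone round-trip sketch of that layout (not the backend code itself):

```python
import struct

HEADER_VERSION = 1
HEADER_SIZE = 13  # 1 + 4 + 4 + 4 bytes; "<" means little-endian with no padding

def pack_header(num_inputs: int, num_outputs: int, model_size: int) -> bytes:
    # B = uint8 version, I = uint32 input count / output count / model byte size
    return struct.pack("<BIII", HEADER_VERSION, num_inputs, num_outputs, model_size)

def unpack_header(header: bytes) -> tuple:
    version, num_inputs, num_outputs, model_size = struct.unpack("<BIII", header)
    assert version == HEADER_VERSION
    return num_inputs, num_outputs, model_size

header = pack_header(2, 1, 4096)
assert len(header) == HEADER_SIZE
assert unpack_header(header) == (2, 1, 4096)
```
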
```diff
@@ -112,5 +138,77 @@ def preprocess(

         num_inputs = len(input_names)
         num_outputs = len(output_names)
-        header = struct.pack("<BIII", 1, num_inputs, num_outputs, len(model_bytes))
-        return PreprocessResult(processed_bytes=bytes(header + model_bytes))
+        header_bytes = _pack_header(num_inputs, num_outputs, len(model_bytes))
+        return PreprocessResult(processed_bytes=bytes(header_bytes + model_bytes))
+
+    @classmethod
+    def preprocess_multimethod(
+        cls,
+        edge_programs: Dict[str, List[ExportedProgram]],
+        compile_specs: Dict[str, List[List[CompileSpec]]],
+    ) -> Dict[str, list[PreprocessResult]]:
+
+        # Follow the default behavior of `preprocess_multimethod`
+        preprocess_results = {}
+        for method_name, programs in edge_programs.items():
+            assert (
+                method_name in compile_specs
+            ), f"Error: missing compile specs for {method_name}"
+            compile_specs_for_method = compile_specs[method_name]
+            assert len(compile_specs_for_method) == len(
+                programs
+            ), f"Error: method {method_name} has {len(programs)} partitions but only {len(compile_specs_for_method)}"
+            results_for_method = []
+            for program, compile_spec_for_program in zip(
+                programs, compile_specs_for_method
+            ):
+                preprocess_result = cls.preprocess(program, compile_spec_for_program)
+                results_for_method.append(preprocess_result)
+
+            preprocess_results[method_name] = results_for_method
+
+        # Try extract shared data blob if necessary
+        infos_dict = collections.defaultdict(list)
+        models_dict = collections.defaultdict(list)
+        result_dict = collections.defaultdict(list)
+        for method_name, method_results in preprocess_results.items():
+            for idx, result in enumerate(method_results):
+                shared_blob_key = None
+                for spec in compile_specs[method_name][idx]:
+                    if spec.key == EXTRACT_SHARED_BLOB_KEY:
+                        shared_blob_key = spec.value.decode("utf-8")
+
+                if shared_blob_key is None:
+                    continue
+
+                header_bytes = result.processed_bytes[:HEADER_SIZE]
+                model_bytes = result.processed_bytes[HEADER_SIZE:]
+                num_inputs, num_outputs, model_bytes_size = _unpack_header(header_bytes)
+                assert len(model_bytes) == model_bytes_size
+                infos_dict[shared_blob_key].append((num_inputs, num_outputs))
+                models_dict[shared_blob_key].append(model_bytes)
+                result_dict[shared_blob_key].append(result)
+
+        data_store_output_dict = {}
+        for key, models in models_dict.items():
+            ndm = NamedDataStore()
+            blob, new_models = mtk_neuron.extract_shared_data(
+                models, options="-e union"
+            )
+            ndm.add_named_data(key, bytes(blob))
+            data_store_output_dict[key] = ndm.get_named_data_store_output()
+            models.clear()
+            models.extend(new_models)
+
+        for key, data_store_output in data_store_output_dict.items():
+            for idx, (model_info, model_bytes) in enumerate(
+                zip(infos_dict[key], models_dict[key])
+            ):
+                num_inputs, num_outputs = model_info
+                header_bytes = _pack_header(num_inputs, num_outputs, len(model_bytes))
+                result_dict[key][idx].data_store_output = data_store_output
+                result_dict[key][idx].processed_bytes = bytes(
+                    header_bytes + model_bytes
+                )
+
+        return preprocess_results
```
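
To opt in from an export script, each lowered partition carries an `ExtractSharedBlobKey` compile spec whose value names the shared blob; `preprocess_multimethod` then groups the compiled models by that key, calls `mtk_neuron.extract_shared_data` on each group, and stores the extracted blob once in the PTE's named data store. A hedged sketch of how such a spec list might be built (the blob name is illustrative, not part of this change):

```python
from executorch.exir.backend.compile_spec_schema import CompileSpec

SHARED_BLOB_NAME = b"llama_shared_weights"  # illustrative key; any stable name works

# One spec list per lowered partition; every partition/method that passes the
# same ExtractSharedBlobKey value ends up sharing a single weight blob in the PTE.
neuropilot_compile_specs = [
    CompileSpec("platform-config", b"mt6991"),              # required platform config
    CompileSpec("ExtractSharedBlobKey", SHARED_BLOB_NAME),  # opt in to weight sharing
]
```
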

backends/mediatek/runtime/NeuronBackend.cpp

Lines changed: 85 additions & 36 deletions

```diff
@@ -12,8 +12,8 @@
 #include "NeuronPayloadHeader.h"
 #include "api/NeuronAdapter.h"

+#include <executorch/runtime/executor/pte_data_map.h>
 #include "executorch/runtime/core/error.h"
-#include "executorch/runtime/core/exec_aten/util/dim_order_util.h"

 #include <algorithm>
 #include <memory>
@@ -24,6 +24,7 @@ namespace executorch {
 namespace backends {
 namespace neuron {

+using executorch::ET_RUNTIME_NAMESPACE::NamedDataMap;
 using executorch::runtime::ArrayRef;
 using executorch::runtime::BackendExecutionContext;
 using executorch::runtime::BackendInitContext;
@@ -38,12 +39,22 @@ using executorch::runtime::Span;

 const char kHighAddrKey[] = "HighAddr";
 const char kImportForeverKey[] = "ImportForever";
+const char kSharedWeightsKey[] = "ExtractSharedBlobKey";

 Result<DelegateHandle*> NeuronBackend::init(
     BackendInitContext& context,
     FreeableBuffer* processed,
     ArrayRef<CompileSpec> compile_specs) const {
   NeuronDelegateSetting setting;
+  MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
+  NeuronExecuTorchDelegate* delegate =
+      runtime_allocator->allocateInstance<NeuronExecuTorchDelegate>();
+  if (delegate == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+
+  new (delegate) NeuronExecuTorchDelegate();
+
   for (auto& compile_spec : compile_specs) {
     if (std::strcmp(compile_spec.key, kHighAddrKey) == 0) {
       setting.mHighAddr = *static_cast<char*>(compile_spec.value.buffer);
@@ -54,11 +65,62 @@ Result<DelegateHandle*> NeuronBackend::init(
         "NeuronBackend",
         "IsImportForever Enable : %d",
         setting.mImportForever);
+    } else if (std::strcmp(compile_spec.key, kSharedWeightsKey) == 0) {
+      setting.mSharedWeights = true;
+      std::string shared_weights_key(
+          static_cast<char*>(compile_spec.value.buffer),
+          compile_spec.value.nbytes);
+      LogInfo(
+          "NeuronBackend",
+          "SharedWeights Enabled for %s",
+          shared_weights_key.c_str());
+      std::shared_ptr<NeuronSharedWeights> neuron_shared_weights;
+      if (neuron_shared_weights_cache_.find(shared_weights_key) !=
+          neuron_shared_weights_cache_.end()) {
+        neuron_shared_weights =
+            neuron_shared_weights_cache_.at(shared_weights_key).lock();
+        if (neuron_shared_weights) {
+          LogInfo(
+              "NeuronBackend",
+              "Reusing cached shared weights with key %s",
+              shared_weights_key.c_str());
+          delegate->SetSharedWeights(neuron_shared_weights);
+          continue;
+        } else {
+          LogInfo(
+              "NeuronBackend",
+              "Shared weights cache expired: %s",
+              shared_weights_key.c_str());
+          neuron_shared_weights_cache_.erase(shared_weights_key); // Expired
+        }
+      }
+      const NamedDataMap* named_data_map = context.get_named_data_map();
+      Result<FreeableBuffer> shared_weights =
+          named_data_map->get_data(shared_weights_key.c_str());
+
+      if (shared_weights.ok()) {
+        LogInfo(
+            "NeuronBackend",
+            "Loaded shared weights from named_data_map. Size: %zu",
+            shared_weights.get().size());
+        FreeableBuffer& buffer = shared_weights.get();
+        neuron_shared_weights =
+            std::make_shared<NeuronSharedWeights>(std::move(buffer));
+        delegate->SetSharedWeights(neuron_shared_weights);
+        neuron_shared_weights_cache_[shared_weights_key] =
+            neuron_shared_weights;
+      } else {
+        LogError(
+            "NeuronBackend",
+            "Failed to load shared weights from named_data_map.");
+        return Error::Internal;
+      }
     } else {
       LogWarn("NeuronBackend", "unknown compile spec: %s", compile_spec.key);
     }
   }
   auto Payload = NeuronPayload(processed->data(), processed->size());
+
   LogInfo(
       "NeuronBackend",
       "version %u, input %u, output %u, length %u, payload size: %zu",
@@ -68,19 +130,7 @@ Result<DelegateHandle*> NeuronBackend::init(
       Payload.Header.DataLen,
       processed->size());

-  MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
-  NeuronExecuTorchDelegate* delegate =
-      runtime_allocator->allocateInstance<NeuronExecuTorchDelegate>();
-  if (delegate == nullptr) {
-    return Error::MemoryAllocationFailed;
-  }
-
-  new (delegate) NeuronExecuTorchDelegate();
-
-  if (delegate == nullptr) {
-    return nullptr;
-  }
-  auto res = delegate->LoadCompiledNetwork(Payload, setting);
+  int res = delegate->LoadCompiledNetwork(Payload, setting);
   return res == NEURON_NO_ERROR ? delegate : nullptr;
 }

```
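In `NeuronBackend::init`, shared weights are cached per blob key as weak references (`neuron_shared_weights_cache_`): a later method initialized with the same key reuses the still-live `NeuronSharedWeights`, an expired entry is evicted, and otherwise the blob is fetched from the program's named data map. The same lifetime pattern, sketched in Python with `weakref` purely for illustration (names are hypothetical; this is not the backend's code):

```python
import weakref

_shared_weights_cache = {}  # blob key -> weakref.ref to a SharedWeights instance

class SharedWeights:
    def __init__(self, blob: bytes):
        self.blob = blob

def get_shared_weights(key: str, load_blob) -> SharedWeights:
    ref = _shared_weights_cache.get(key)
    live = ref() if ref is not None else None
    if live is not None:
        return live                              # reuse the cached, still-alive blob
    _shared_weights_cache.pop(key, None)         # drop an expired cache entry
    weights = SharedWeights(load_blob(key))      # e.g. read from the named data map
    _shared_weights_cache[key] = weakref.ref(weights)  # cache weakly; freed when unused
    return weights
```
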
```diff
@@ -112,21 +162,22 @@ Error NeuronExecuTorchDelegate::execute(
     return Error::InvalidState;
   };

+  ET_CHECK_OR_RETURN_ERROR(
+      CheckDimOrder(args) == NEURON_NO_ERROR,
+      Internal,
+      "Expecting default dim_order but got a non default dim_order tensor input");
+
+  PrepareInputsOuputs(args);
+
   auto allocator =
       dynamic_cast<neuron::BufferAllocator*>(context.get_temp_allocator());
-  size_t inputCount = mInputSizes.size(), outputCount = mOutputSizes.size();
-
-  for (int i = 0; i < inputCount; i++) {
-    auto tensor_in = args[i]->toTensor();
-    ET_CHECK_OR_RETURN_ERROR(
-        runtime::is_contiguous_dim_order(
-            tensor_in.dim_order().data(), tensor_in.dim()),
-        Internal,
-        "Expecting default dim_order but got a non default dim_order tensor for external input %u",
-        i);
-
-    auto data_ptr = args[i]->toTensor().data_ptr();
-    auto data_size = args[i]->toTensor().nbytes();
+
+  size_t inputCount = mInputSizes.size() + neuron_shared_weights_.size();
+  size_t outputCount = mOutputSizes.size();
+
+  for (size_t i = 0; i < inputCount; i++) {
+    auto data_ptr = mPreparedInputs[i].data_ptr;
+    auto data_size = mPreparedInputs[i].size;
     if (IsCached</*isInput=*/true>(i, data_ptr)) {
       continue;
     };
@@ -141,22 +192,20 @@ Error NeuronExecuTorchDelegate::execute(
     }
   }

-  for (int o = inputCount; o < inputCount + outputCount; o++) {
-    auto data_ptr = args[o]->toTensor().data_ptr();
-    auto data_size = args[o]->toTensor().nbytes();
-    auto output_index = o - inputCount;
-    if (IsCached</*isInput=*/false>(output_index, data_ptr)) {
+  for (size_t o = 0; o < outputCount; o++) {
+    auto data_ptr = mPreparedOutputs[o].data_ptr;
+    auto data_size = mPreparedOutputs[o].size;
+    if (IsCached</*isInput=*/false>(o, data_ptr)) {
       continue;
     };
     auto unit = allocator != nullptr ? allocator->Find(data_ptr) : nullptr;
     if (unit) {
-      UpdateCache</*isInput=*/false>(output_index, data_ptr);
+      UpdateCache</*isInput=*/false>(o, data_ptr);
       size_t offset = (char*)data_ptr - (char*)unit->GetAddress();
       mExecutor.SetInputOutputFromMemory</*isInput*/ false>(
-          output_index, unit->GetNeuronMemory(), offset, data_size);
+          o, unit->GetNeuronMemory(), offset, data_size);
     } else {
-      mExecutor.SetInputOutput</*isInput=*/false>(
-          output_index, data_ptr, data_size);
+      mExecutor.SetInputOutput</*isInput=*/false>(o, data_ptr, data_size);
     }
   }

```