#include "NeuronPayloadHeader.h"
#include "api/NeuronAdapter.h"

+#include <executorch/runtime/executor/pte_data_map.h>
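+// Added for the named-data-map support used below to fetch shared weight blobs.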
#include "executorch/runtime/core/error.h"
-#include "executorch/runtime/core/exec_aten/util/dim_order_util.h"

#include <algorithm>
#include <memory>
@@ -24,6 +24,7 @@ namespace executorch {
namespace backends {
namespace neuron {

+using executorch::ET_RUNTIME_NAMESPACE::NamedDataMap;
using executorch::runtime::ArrayRef;
using executorch::runtime::BackendExecutionContext;
using executorch::runtime::BackendInitContext;
@@ -38,12 +39,22 @@ using executorch::runtime::Span;

const char kHighAddrKey[] = "HighAddr";
const char kImportForeverKey[] = "ImportForever";
+const char kSharedWeightsKey[] = "ExtractSharedBlobKey";

Result<DelegateHandle*> NeuronBackend::init(
    BackendInitContext& context,
    FreeableBuffer* processed,
    ArrayRef<CompileSpec> compile_specs) const {
  NeuronDelegateSetting setting;
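+  // The delegate is now allocated before the compile specs are parsed so that
+  // shared weights discovered below can be attached via SetSharedWeights().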
+  MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
+  NeuronExecuTorchDelegate* delegate =
+      runtime_allocator->allocateInstance<NeuronExecuTorchDelegate>();
+  if (delegate == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+
+  new (delegate) NeuronExecuTorchDelegate();
+
  for (auto& compile_spec : compile_specs) {
    if (std::strcmp(compile_spec.key, kHighAddrKey) == 0) {
      setting.mHighAddr = *static_cast<char*>(compile_spec.value.buffer);
@@ -54,11 +65,62 @@ Result<DelegateHandle*> NeuronBackend::init(
        "NeuronBackend",
        "IsImportForever Enable : %d",
        setting.mImportForever);
+    } else if (std::strcmp(compile_spec.key, kSharedWeightsKey) == 0) {
+      setting.mSharedWeights = true;
+      std::string shared_weights_key(
+          static_cast<char*>(compile_spec.value.buffer),
+          compile_spec.value.nbytes);
+      LogInfo(
+          "NeuronBackend",
+          "SharedWeights Enabled for %s",
+          shared_weights_key.c_str());
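+      // neuron_shared_weights_cache_ holds weak_ptrs keyed by blob name: a blob
+      // still alive elsewhere is reused, while an expired entry is dropped and
+      // the blob is reloaded from the named data map.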
+      std::shared_ptr<NeuronSharedWeights> neuron_shared_weights;
+      if (neuron_shared_weights_cache_.find(shared_weights_key) !=
+          neuron_shared_weights_cache_.end()) {
+        neuron_shared_weights =
+            neuron_shared_weights_cache_.at(shared_weights_key).lock();
+        if (neuron_shared_weights) {
+          LogInfo(
+              "NeuronBackend",
+              "Reusing cached shared weights with key %s",
+              shared_weights_key.c_str());
+          delegate->SetSharedWeights(neuron_shared_weights);
+          continue;
+        } else {
+          LogInfo(
+              "NeuronBackend",
+              "Shared weights cache expired: %s",
+              shared_weights_key.c_str());
+          neuron_shared_weights_cache_.erase(shared_weights_key); // Expired
+        }
+      }
+      const NamedDataMap* named_data_map = context.get_named_data_map();
+      Result<FreeableBuffer> shared_weights =
+          named_data_map->get_data(shared_weights_key.c_str());
+
+      if (shared_weights.ok()) {
+        LogInfo(
+            "NeuronBackend",
+            "Loaded shared weights from named_data_map. Size: %zu",
+            shared_weights.get().size());
+        FreeableBuffer& buffer = shared_weights.get();
+        neuron_shared_weights =
+            std::make_shared<NeuronSharedWeights>(std::move(buffer));
+        delegate->SetSharedWeights(neuron_shared_weights);
+        neuron_shared_weights_cache_[shared_weights_key] =
+            neuron_shared_weights;
+      } else {
+        LogError(
+            "NeuronBackend",
+            "Failed to load shared weights from named_data_map.");
+        return Error::Internal;
+      }
    } else {
      LogWarn("NeuronBackend", "unknown compile spec: %s", compile_spec.key);
    }
  }
  auto Payload = NeuronPayload(processed->data(), processed->size());
+
  LogInfo(
      "NeuronBackend",
      "version %u, input %u, output %u, length %u, payload size: %zu",
@@ -68,19 +130,7 @@ Result<DelegateHandle*> NeuronBackend::init(
      Payload.Header.DataLen,
      processed->size());

-  MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
-  NeuronExecuTorchDelegate* delegate =
-      runtime_allocator->allocateInstance<NeuronExecuTorchDelegate>();
-  if (delegate == nullptr) {
-    return Error::MemoryAllocationFailed;
-  }
-
-  new (delegate) NeuronExecuTorchDelegate();
-
-  if (delegate == nullptr) {
-    return nullptr;
-  }
-  auto res = delegate->LoadCompiledNetwork(Payload, setting);
+  int res = delegate->LoadCompiledNetwork(Payload, setting);
  return res == NEURON_NO_ERROR ? delegate : nullptr;
}

@@ -112,21 +162,22 @@ Error NeuronExecuTorchDelegate::execute(
    return Error::InvalidState;
  };

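+  // Dim-order validation and input/output gathering are now centralized in
+  // CheckDimOrder() and PrepareInputsOuputs() instead of being repeated per
+  // tensor in the loops below; shared weights count as additional inputs.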
+  ET_CHECK_OR_RETURN_ERROR(
+      CheckDimOrder(args) == NEURON_NO_ERROR,
+      Internal,
+      "Expecting default dim_order but got a non default dim_order tensor input");
+
+  PrepareInputsOuputs(args);
+
  auto allocator =
      dynamic_cast<neuron::BufferAllocator*>(context.get_temp_allocator());
-  size_t inputCount = mInputSizes.size(), outputCount = mOutputSizes.size();
-
-  for (int i = 0; i < inputCount; i++) {
-    auto tensor_in = args[i]->toTensor();
-    ET_CHECK_OR_RETURN_ERROR(
-        runtime::is_contiguous_dim_order(
-            tensor_in.dim_order().data(), tensor_in.dim()),
-        Internal,
-        "Expecting default dim_order but got a non default dim_order tensor for external input %u",
-        i);
-
-    auto data_ptr = args[i]->toTensor().data_ptr();
-    auto data_size = args[i]->toTensor().nbytes();
+
+  size_t inputCount = mInputSizes.size() + neuron_shared_weights_.size();
+  size_t outputCount = mOutputSizes.size();
+
+  for (size_t i = 0; i < inputCount; i++) {
+    auto data_ptr = mPreparedInputs[i].data_ptr;
+    auto data_size = mPreparedInputs[i].size;
    if (IsCached</*isInput=*/true>(i, data_ptr)) {
      continue;
    };
@@ -141,22 +192,20 @@ Error NeuronExecuTorchDelegate::execute(
    }
  }

-  for (int o = inputCount; o < inputCount + outputCount; o++) {
-    auto data_ptr = args[o]->toTensor().data_ptr();
-    auto data_size = args[o]->toTensor().nbytes();
-    auto output_index = o - inputCount;
-    if (IsCached</*isInput=*/false>(output_index, data_ptr)) {
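+  // Outputs are now taken from mPreparedOutputs and indexed from 0, so the old
+  // output_index = o - inputCount bookkeeping is no longer needed.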
+  for (size_t o = 0; o < outputCount; o++) {
+    auto data_ptr = mPreparedOutputs[o].data_ptr;
+    auto data_size = mPreparedOutputs[o].size;
+    if (IsCached</*isInput=*/false>(o, data_ptr)) {
      continue;
    };
    auto unit = allocator != nullptr ? allocator->Find(data_ptr) : nullptr;
    if (unit) {
-      UpdateCache</*isInput=*/false>(output_index, data_ptr);
+      UpdateCache</*isInput=*/false>(o, data_ptr);
      size_t offset = (char*)data_ptr - (char*)unit->GetAddress();
      mExecutor.SetInputOutputFromMemory</*isInput*/false>(
-          output_index, unit->GetNeuronMemory(), offset, data_size);
+          o, unit->GetNeuronMemory(), offset, data_size);
    } else {
-      mExecutor.SetInputOutput</*isInput=*/false>(
-          output_index, data_ptr, data_size);
+      mExecutor.SetInputOutput</*isInput=*/false>(o, data_ptr, data_size);
    }
  }
