 #include "NeuronPayloadHeader.h"
 #include "api/NeuronAdapter.h"

+#include <executorch/runtime/executor/pte_data_map.h>
 #include "executorch/runtime/core/error.h"
-#include "executorch/runtime/core/exec_aten/util/dim_order_util.h"

 #include <algorithm>
 #include <memory>
@@ -24,6 +24,7 @@ namespace executorch {
 namespace backends {
 namespace neuron {

+using executorch::ET_RUNTIME_NAMESPACE::NamedDataMap;
 using executorch::runtime::ArrayRef;
 using executorch::runtime::BackendExecutionContext;
 using executorch::runtime::BackendInitContext;
@@ -38,12 +39,22 @@ using executorch::runtime::Span;

 const char kHighAddrKey[] = "HighAddr";
 const char kImportForeverKey[] = "ImportForever";
+const char kSharedWeightsKey[] = "ExtractSharedBlobKey";

 Result<DelegateHandle*> NeuronBackend::init(
     BackendInitContext& context,
     FreeableBuffer* processed,
     ArrayRef<CompileSpec> compile_specs) const {
   NeuronDelegateSetting setting;
+  MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
+  NeuronExecuTorchDelegate* delegate =
+      runtime_allocator->allocateInstance<NeuronExecuTorchDelegate>();
+  if (delegate == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+
+  new (delegate) NeuronExecuTorchDelegate();
+
   for (auto& compile_spec : compile_specs) {
     if (std::strcmp(compile_spec.key, kHighAddrKey) == 0) {
       setting.mHighAddr = *static_cast<char*>(compile_spec.value.buffer);
@@ -54,11 +65,62 @@ Result<DelegateHandle*> NeuronBackend::init(
           "NeuronBackend",
           "IsImportForever Enable : %d",
           setting.mImportForever);
+    } else if (std::strcmp(compile_spec.key, kSharedWeightsKey) == 0) {
+      setting.mSharedWeights = true;
+      std::string shared_weights_key(
+          static_cast<char*>(compile_spec.value.buffer),
+          compile_spec.value.nbytes);
+      LogInfo(
+          "NeuronBackend",
+          "SharedWeights Enabled for %s",
+          shared_weights_key.c_str());
+      std::shared_ptr<NeuronSharedWeights> neuron_shared_weights;
+      if (neuron_shared_weights_cache_.find(shared_weights_key) !=
+          neuron_shared_weights_cache_.end()) {
+        neuron_shared_weights =
+            neuron_shared_weights_cache_.at(shared_weights_key).lock();
+        if (neuron_shared_weights) {
+          LogInfo(
+              "NeuronBackend",
+              "Reusing cached shared weights with key %s",
+              shared_weights_key.c_str());
+          delegate->SetSharedWeights(neuron_shared_weights);
+          continue;
+        } else {
+          LogInfo(
+              "NeuronBackend",
+              "Shared weights cache expired: %s",
+              shared_weights_key.c_str());
+          neuron_shared_weights_cache_.erase(shared_weights_key); // Expired
+        }
+      }
+      const NamedDataMap* named_data_map = context.get_named_data_map();
+      Result<FreeableBuffer> shared_weights =
+          named_data_map->get_data(shared_weights_key.c_str());
+
+      if (shared_weights.ok()) {
+        LogInfo(
+            "NeuronBackend",
+            "Loaded shared weights from named_data_map. Size: %zu",
+            shared_weights.get().size());
+        FreeableBuffer& buffer = shared_weights.get();
+        neuron_shared_weights =
+            std::make_shared<NeuronSharedWeights>(std::move(buffer));
+        delegate->SetSharedWeights(neuron_shared_weights);
+        neuron_shared_weights_cache_[shared_weights_key] =
+            neuron_shared_weights;
+      } else {
+        LogError(
+            "NeuronBackend",
+            "Failed to load shared weights from named_data_map.");
+        return Error::Internal;
+      }
     } else {
       LogWarn("NeuronBackend", "unknown compile spec: %s", compile_spec.key);
     }
   }
   auto Payload = NeuronPayload(processed->data(), processed->size());
+
   LogInfo(
       "NeuronBackend",
       "version %u, input %u, output %u, length %u, payload size: %zu",
@@ -68,19 +130,7 @@ Result<DelegateHandle*> NeuronBackend::init(
       Payload.Header.DataLen,
       processed->size());

-  MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
-  NeuronExecuTorchDelegate* delegate =
-      runtime_allocator->allocateInstance<NeuronExecuTorchDelegate>();
-  if (delegate == nullptr) {
-    return Error::MemoryAllocationFailed;
-  }
-
-  new (delegate) NeuronExecuTorchDelegate();
-
-  if (delegate == nullptr) {
-    return nullptr;
-  }
-  auto res = delegate->LoadCompiledNetwork(Payload, setting);
+  int res = delegate->LoadCompiledNetwork(Payload, setting);
   return res == NEURON_NO_ERROR ? delegate : nullptr;
 }

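The shared-weights handling added to init() above follows a weak_ptr caching pattern: the backend-level cache stores only std::weak_ptr entries, so a weight blob loaded through the NamedDataMap stays resident exactly as long as at least one delegate still holds its shared_ptr, and an expired entry is evicted and reloaded on the next request. A minimal standalone sketch of that pattern follows; SharedBlob and BlobCache are hypothetical stand-ins, not the actual NeuronSharedWeights implementation.

// Illustrative sketch of the weak_ptr cache pattern used above.
// SharedBlob and BlobCache are hypothetical stand-ins for NeuronSharedWeights
// and the backend's neuron_shared_weights_cache_ member.
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct SharedBlob {
  std::vector<uint8_t> data; // stand-in for the underlying weight buffer
};

class BlobCache {
 public:
  // Returns a live entry if some owner still holds it; otherwise calls the
  // loader and registers a new weak reference under the same key.
  template <typename Loader>
  std::shared_ptr<SharedBlob> GetOrLoad(const std::string& key, Loader load) {
    auto it = cache_.find(key);
    if (it != cache_.end()) {
      if (auto alive = it->second.lock()) {
        return alive; // reuse: another delegate keeps the blob alive
      }
      cache_.erase(it); // expired: the last owner already released the blob
    }
    std::shared_ptr<SharedBlob> fresh = load(key);
    if (fresh) {
      cache_[key] = fresh; // store a weak_ptr; the cache never owns the blob
    }
    return fresh;
  }

 private:
  // Weak entries only: lifetime is controlled by the delegates, not the cache.
  std::unordered_map<std::string, std::weak_ptr<SharedBlob>> cache_;
};

With this shape, multiple delegates initialized with the same key end up sharing a single resident copy of the weights, which is the effect the ExtractSharedBlobKey handling above is after.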
@@ -112,21 +162,22 @@ Error NeuronExecuTorchDelegate::execute(
     return Error::InvalidState;
   };

+  ET_CHECK_OR_RETURN_ERROR(
+      CheckDimOrder(args) == NEURON_NO_ERROR,
+      Internal,
+      "Expecting default dim_order but got a non default dim_order tensor input");
+
+  PrepareInputsOuputs(args);
+
   auto allocator =
       dynamic_cast<neuron::BufferAllocator*>(context.get_temp_allocator());
-  size_t inputCount = mInputSizes.size(), outputCount = mOutputSizes.size();
-
-  for (int i = 0; i < inputCount; i++) {
-    auto tensor_in = args[i]->toTensor();
-    ET_CHECK_OR_RETURN_ERROR(
-        runtime::is_contiguous_dim_order(
-            tensor_in.dim_order().data(), tensor_in.dim()),
-        Internal,
-        "Expecting default dim_order but got a non default dim_order tensor for external input %u",
-        i);
-
-    auto data_ptr = args[i]->toTensor().data_ptr();
-    auto data_size = args[i]->toTensor().nbytes();
+
+  size_t inputCount = mInputSizes.size() + neuron_shared_weights_.size();
+  size_t outputCount = mOutputSizes.size();
+
+  for (size_t i = 0; i < inputCount; i++) {
+    auto data_ptr = mPreparedInputs[i].data_ptr;
+    auto data_size = mPreparedInputs[i].size;
     if (IsCached</*isInput=*/true>(i, data_ptr)) {
       continue;
     };
@@ -141,22 +192,20 @@ Error NeuronExecuTorchDelegate::execute(
     }
   }

-  for (int o = inputCount; o < inputCount + outputCount; o++) {
-    auto data_ptr = args[o]->toTensor().data_ptr();
-    auto data_size = args[o]->toTensor().nbytes();
-    auto output_index = o - inputCount;
-    if (IsCached</*isInput=*/false>(output_index, data_ptr)) {
+  for (size_t o = 0; o < outputCount; o++) {
+    auto data_ptr = mPreparedOutputs[o].data_ptr;
+    auto data_size = mPreparedOutputs[o].size;
+    if (IsCached</*isInput=*/false>(o, data_ptr)) {
       continue;
     };
     auto unit = allocator != nullptr ? allocator->Find(data_ptr) : nullptr;
     if (unit) {
-      UpdateCache</*isInput=*/false>(output_index, data_ptr);
+      UpdateCache</*isInput=*/false>(o, data_ptr);
       size_t offset = (char*)data_ptr - (char*)unit->GetAddress();
       mExecutor.SetInputOutputFromMemory</*isInput*/ false>(
-          output_index, unit->GetNeuronMemory(), offset, data_size);
+          o, unit->GetNeuronMemory(), offset, data_size);
     } else {
-      mExecutor.SetInputOutput</*isInput=*/false>(
-          output_index, data_ptr, data_size);
+      mExecutor.SetInputOutput</*isInput=*/false>(o, data_ptr, data_size);
     }
   }

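In execute(), the per-argument toTensor() calls and the inline dim_order check are replaced by a single CheckDimOrder(args) pass plus PrepareInputsOuputs(args), after which the registration loops only read precomputed {data_ptr, size} pairs from mPreparedInputs / mPreparedOutputs, with the input count now also covering the shared-weight buffers. A rough sketch of how such a prepared list could be assembled follows; DataUnit and gather_inputs are hypothetical illustrations, not the backend's actual helpers, and the ordering of shared weights after the user inputs is an assumption of the sketch.

// Hypothetical sketch: building a flat list of {pointer, size} pairs ahead of
// the registration loops. DataUnit and gather_inputs are illustrative only.
#include <cstddef>
#include <vector>

struct DataUnit {
  void* data_ptr;
  size_t size;
};

std::vector<DataUnit> gather_inputs(
    const std::vector<DataUnit>& user_inputs,      // derived from the EValue args
    const std::vector<DataUnit>& shared_weights) { // held by neuron_shared_weights_
  std::vector<DataUnit> prepared;
  prepared.reserve(user_inputs.size() + shared_weights.size());
  // Model inputs first, in graph order.
  prepared.insert(prepared.end(), user_inputs.begin(), user_inputs.end());
  // Shared-weight buffers appended as additional inputs, consistent with
  // inputCount = mInputSizes.size() + neuron_shared_weights_.size() above
  // (the append-after ordering is an assumption of this sketch).
  prepared.insert(prepared.end(), shared_weights.begin(), shared_weights.end());
  return prepared;
}

Keeping the loop bodies to plain pointer/size lookups also keeps the IsCached / UpdateCache fast path cheap, since nothing is re-derived from the EValue arguments on every inference.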