
Commit 486dd4e

First working llama shared weights flow
1 parent 84c81a3 commit 486dd4e

10 files changed (+347, -120 lines)

backends/mediatek/runtime/NeuronBackend.cpp

Lines changed: 28 additions & 8 deletions
@@ -73,7 +73,26 @@ Result<DelegateHandle*> NeuronBackend::init(
         "NeuronBackend",
         "SharedWeights Enabled for %s",
         shared_weights_key.c_str());
-
+    std::shared_ptr<NeuronSharedWeights> neuron_shared_weights;
+    if (neuron_shared_weights_cache_.find(shared_weights_key) !=
+        neuron_shared_weights_cache_.end()) {
+      neuron_shared_weights =
+          neuron_shared_weights_cache_.at(shared_weights_key).lock();
+      if (neuron_shared_weights) {
+        LogInfo(
+            "NeuronBackend",
+            "Reusing cached shared weights with key %s",
+            shared_weights_key.c_str());
+        delegate->SetSharedWeights(neuron_shared_weights);
+        continue;
+      } else {
+        LogInfo(
+            "NeuronBackend",
+            "Shared weights cache expired: %s",
+            shared_weights_key.c_str());
+        neuron_shared_weights_cache_.erase(shared_weights_key); // Expired
+      }
+    }
     const NamedDataMap* named_data_map = context.get_named_data_map();
     Result<FreeableBuffer> shared_weights =
         named_data_map->get_data(shared_weights_key.c_str());
@@ -84,7 +103,11 @@ Result<DelegateHandle*> NeuronBackend::init(
           "Loaded shared weights from named_data_map. Size: %zu",
           shared_weights.get().size());
       FreeableBuffer& buffer = shared_weights.get();
-      delegate->SetSharedWeights(buffer);
+      neuron_shared_weights =
+          std::make_shared<NeuronSharedWeights>(std::move(buffer));
+      delegate->SetSharedWeights(neuron_shared_weights);
+      neuron_shared_weights_cache_[shared_weights_key] =
+          neuron_shared_weights;
     } else {
       LogError(
           "NeuronBackend",
@@ -148,13 +171,10 @@ Error NeuronExecuTorchDelegate::execute(
   auto allocator = dynamic_cast<torch::executor::neuron::BufferAllocator*>(
       context.get_temp_allocator());

-  bool has_shared_weights_input = neuron_shared_weights_.size() > 0;
-
-  size_t inputCount =
-      has_shared_weights_input ? mInputSizes.size() + 1 : mInputSizes.size();
+  size_t inputCount = mInputSizes.size() + neuron_shared_weights_.size();
   size_t outputCount = mOutputSizes.size();

-  for (int i = 0; i < inputCount; i++) {
+  for (size_t i = 0; i < inputCount; i++) {
     auto data_ptr = mPreparedInputs[i].data_ptr;
     auto data_size = mPreparedInputs[i].size;
     if (IsCached</*isInput=*/true>(i, data_ptr)) {
@@ -171,7 +191,7 @@ Error NeuronExecuTorchDelegate::execute(
     }
   }

-  for (int o = 0; o < outputCount; o++) {
+  for (size_t o = 0; o < outputCount; o++) {
     auto data_ptr = mPreparedOutputs[o].data_ptr;
     auto data_size = mPreparedOutputs[o].size;
     if (IsCached</*isInput=*/false>(o, data_ptr)) {
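Note: the init() change above keys a std::weak_ptr cache by the shared-weights name, so a second delegate that asks for the same key reuses the already-allocated buffer instead of copying it again, while an expired entry is erased and rebuilt. A minimal standalone sketch of that lookup, lock, or rebuild pattern follows; Resource and load_resource are illustrative stand-ins, not backend APIs.

// Sketch of the weak_ptr caching pattern used in init() above.
// "Resource" and "load_resource" are hypothetical stand-ins.
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

struct Resource {
  std::string payload;
};

std::unordered_map<std::string, std::weak_ptr<Resource>> cache;

std::shared_ptr<Resource> load_resource(const std::string& key) {
  auto it = cache.find(key);
  if (it != cache.end()) {
    if (auto cached = it->second.lock()) {
      return cached;   // Still alive: reuse without reloading.
    }
    cache.erase(it);   // All owners released it: entry expired.
  }
  auto fresh = std::make_shared<Resource>(Resource{"weights for " + key});
  cache[key] = fresh;  // The cache observes but does not extend the lifetime.
  return fresh;
}

int main() {
  auto a = load_resource("llama_chunk_0");
  auto b = load_resource("llama_chunk_0");  // Reuses the same allocation.
  std::cout << (a == b) << "\n";            // Prints 1.
}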

backends/mediatek/runtime/include/NeuronBackend.h

Lines changed: 53 additions & 9 deletions
@@ -32,6 +32,45 @@ using executorch::runtime::EValue;
 using executorch::runtime::FreeableBuffer;
 using executorch::runtime::Result;

+class NeuronSharedWeights {
+ public:
+  explicit NeuronSharedWeights(const FreeableBuffer& shared_weights_buffer) {
+    auto& buffer_allocator = GET_NEURON_ALLOCATOR;
+    nbytes_ = shared_weights_buffer.size();
+    data_ = buffer_allocator.Allocate(nbytes_);
+    ET_CHECK_MSG(
+        data_ != nullptr,
+        "Error: Failed to allocate memory for shared weights of size %zu",
+        nbytes_);
+    std::memcpy(data_, shared_weights_buffer.data(), nbytes_);
+  }
+
+  explicit NeuronSharedWeights(FreeableBuffer&& shared_weights_buffer)
+      : NeuronSharedWeights(shared_weights_buffer) {
+    shared_weights_buffer.Free();
+  }
+
+  ~NeuronSharedWeights() {
+    if (data_ == nullptr || nbytes_ == 0) {
+      return;
+    }
+    auto& buffer_allocator = GET_NEURON_ALLOCATOR;
+    buffer_allocator.RemoveBuffer(data_);
+  }
+
+  void* data() const {
+    return data_;
+  }
+
+  size_t size() const {
+    return nbytes_;
+  }
+
+ private:
+  void* data_ = nullptr;
+  size_t nbytes_ = 0;
+};
+
 class NeuronBackend final : public ::executorch::runtime::BackendInterface {
  public:
   ::executorch::runtime::Result<::executorch::runtime::DelegateHandle*> init(
@@ -48,6 +87,10 @@ class NeuronBackend final : public ::executorch::runtime::BackendInterface {
   void destroy(::executorch::runtime::DelegateHandle* handle) const override;

   bool is_available() const override;
+
+ private:
+  mutable std::unordered_map<std::string, std::weak_ptr<NeuronSharedWeights>>
+      neuron_shared_weights_cache_;
 };

 extern const char kHighAddrKey[];
@@ -79,8 +122,7 @@ class NeuronExecuTorchDelegate {
     void* data_ptr;
     size_t size;

-    InputOutputInfo(void* ptr, size_t sz)
-        : data_ptr(ptr), size(sz) {}
+    InputOutputInfo(void* ptr, size_t sz) : data_ptr(ptr), size(sz) {}
   };

   class MemoryCache {
@@ -129,8 +171,8 @@ class NeuronExecuTorchDelegate {
     return NEURON_NO_ERROR;
   }

-  int SetSharedWeights(FreeableBuffer& buffer) {
-    neuron_shared_weights_.push_back(std::move(buffer));
+  int SetSharedWeights(std::shared_ptr<NeuronSharedWeights> sharedWeights) {
+    neuron_shared_weights_.push_back(sharedWeights);
     return NEURON_NO_ERROR;
   }

@@ -202,11 +244,12 @@ class NeuronExecuTorchDelegate {
       mPreparedInputs.push_back(InputOutputInfo{data_ptr, data_size});
     }

-    // Prepare shared weights if any as the last model input
+    // Prepare shared weights if any as the last model inputs
     if (has_shared_weights_input) {
-      FreeableBuffer& buffer = neuron_shared_weights_.at(0);
-      mPreparedInputs.push_back(
-          InputOutputInfo{const_cast<void*>(buffer.data()), buffer.size()});
+      for (const auto& shared_weights : neuron_shared_weights_) {
+        mPreparedInputs.push_back(
+            InputOutputInfo{shared_weights->data(), shared_weights->size()});
+      }
     }

     // Prepare output data
@@ -242,7 +285,8 @@ class NeuronExecuTorchDelegate {

   mutable std::unordered_set<const void*> mHasImported;

-  mutable std::vector<FreeableBuffer> neuron_shared_weights_;
+  mutable std::vector<std::shared_ptr<NeuronSharedWeights>>
+      neuron_shared_weights_;

  private:
   NeuronExecuTorchDelegate(const NeuronExecuTorchDelegate&);
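Note: NeuronSharedWeights owns one Neuron buffer for its whole lifetime: the constructor copies the FreeableBuffer into memory obtained from GET_NEURON_ALLOCATOR and the destructor returns it with RemoveBuffer. Because every delegate keeps a std::shared_ptr while the backend cache holds only a std::weak_ptr, the buffer is released exactly once, when the last delegate drops its reference. A small sketch of that ownership model; SharedBlob and the malloc/free pair are stand-ins for the real allocator.

// Ownership sketch: delegates share ownership, the cache only observes.
#include <cstdlib>
#include <iostream>
#include <memory>
#include <vector>

class SharedBlob {
 public:
  explicit SharedBlob(size_t nbytes) : nbytes_(nbytes), data_(std::malloc(nbytes)) {}
  ~SharedBlob() { std::free(data_); }  // Freed once, by the last owner.
  void* data() const { return data_; }
  size_t size() const { return nbytes_; }

 private:
  size_t nbytes_;
  void* data_;
};

int main() {
  std::weak_ptr<SharedBlob> cache_entry;  // Observes, never owns.
  {
    auto weights = std::make_shared<SharedBlob>(1024);
    cache_entry = weights;
    std::vector<std::shared_ptr<SharedBlob>> delegates{weights, weights};
    std::cout << weights.use_count() << "\n";  // 3 owners: local + 2 delegates.
  }
  std::cout << cache_entry.expired() << "\n";  // 1: all owners gone, blob freed.
}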

examples/mediatek/executor_runner/llama_runner/LlamaConfig.h

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ struct LlamaModelPaths {
   std::string token_embedding_path;
   std::vector<std::string> prompt_model_paths;
   std::vector<std::string> gen_model_paths;
+  std::vector<std::string> model_package_paths;
 };

 } // namespace example

examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp

Lines changed: 26 additions & 0 deletions
@@ -21,6 +21,7 @@

 #include "LlamaConfig.h"
 #include "LlamaModelChunk.h"
+#include "Utils.h"
 #include "llm_helper/include/llm_types.h"

 #include "llm_helper/include/mask_builder.h"
@@ -42,11 +43,13 @@ inline std::vector<size_t> getIndexRange(
 LlamaModelChunk::LlamaModelChunk(
     const ModelPathMap& modelPathMap,
     const LlamaModelOptions& modelOptions,
+    const bool useSharedWeights,
     const size_t initBatchSize,
     const size_t numCache,
     const size_t numRotEmbInputs,
     const RotaryEmbeddingMasterLut* rotEmbMasterLut)
     : ModelChunk(modelPathMap, initBatchSize),
+      kIsSharedWeightsUsed(useSharedWeights),
       kMaxTokenLength(modelOptions.max_token_length),
       kCacheLength(modelOptions.cache_size),
       kMaskType(modelOptions.mask_type),
@@ -61,6 +64,29 @@ LlamaModelChunk::LlamaModelChunk(

 LlamaModelChunk::~LlamaModelChunk() {}

+std::string LlamaModelChunk::SelectMethod(
+    const std::vector<std::string>& methodNames) const {
+  const size_t curTokenSize = GetModelId();
+  for (const auto& methodName : methodNames) {
+    const auto matches = utils::extract_substr(methodName, "([0-9]+)t[0-9]+c");
+    ET_CHECK_MSG(
+        matches.size() == 2, "Invalid method name: %s", methodName.c_str());
+    // Extract the first match group as the token size
+    const size_t methodTokenSize =
+        static_cast<size_t>(std::atol(matches[1].c_str()));
+    if (curTokenSize == methodTokenSize) {
+      ET_LOG(
+          Debug,
+          "Selected method \"%s\" for token size %zu",
+          methodName.c_str(),
+          curTokenSize);
+      return methodName;
+    }
+  }
+  ET_LOG(Error, "Unable to find a suitable method; falling back to the first method.");
+  return {};
+}
+
 size_t LlamaModelChunk::GetExpectedInputCount() const {
   const size_t rotEmbInputCount = kRotEmbInputIndexes.size();
   const size_t cacheInputCount = kCacheInputIndexes.size();
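Note: SelectMethod assumes the method names inside a model package encode token and cache sizes, e.g. "128t512c", and it picks the method whose token size matches GetModelId(). utils::extract_substr is not shown in this commit, so the sketch below uses std::regex for the same "([0-9]+)t[0-9]+c" matching; the method names and the fixed token size are hypothetical.

// Sketch of token-size matching on method names like "128t512c".
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
  const std::vector<std::string> method_names = {"1t512c", "128t512c"};
  const size_t cur_token_size = 128;  // Would come from GetModelId().
  const std::regex pattern("([0-9]+)t[0-9]+c");

  for (const auto& name : method_names) {
    std::smatch match;
    // Group 1 is the token size; the trailing number is the cache size.
    if (std::regex_search(name, match, pattern) &&
        std::stoul(match[1].str()) == cur_token_size) {
      std::cout << "Selected method: " << name << "\n";  // Prints 128t512c.
      return 0;
    }
  }
  std::cout << "No matching method\n";
  return 1;
}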

examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.h

Lines changed: 12 additions & 0 deletions
@@ -44,6 +44,7 @@ class LlamaModelChunk : public ModelChunk {
   explicit LlamaModelChunk(
       const ModelPathMap& modelPathMap,
       const LlamaModelOptions& modelOptions,
+      const bool useSharedWeights,
       const size_t initBatchSize,
       const size_t numCache,
       const size_t numRotEmbInputs,
@@ -104,6 +105,17 @@ class LlamaModelChunk : public ModelChunk {
   size_t GetExpectedOutputCount() const;

  private:
+  bool AllowModelsCoexist() const override {
+    return kIsSharedWeightsUsed;
+  }
+
+  std::string SelectMethod(
+      const std::vector<std::string>& methodNames) const override;
+
+ private:
+  // Whether shared weights are used
+  bool kIsSharedWeightsUsed = false;
+
   // Input/Output Indexes
   const size_t kMaskInputIndex;
   const std::vector<size_t> kRotEmbInputIndexes;

examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp

Lines changed: 28 additions & 7 deletions
@@ -24,9 +24,6 @@ void LlamaRuntime::Initialize(
     const LlamaModelOptions& modelOptions,
     const LlamaModelPaths& modelPaths) {
   mModelOptions = modelOptions;
-  const size_t numChunk = modelPaths.gen_model_paths.size();
-  const size_t numCache = 2 * modelOptions.num_layer / numChunk;
-  ET_CHECK_MSG(numChunk > 0, "No model to initialize");

   // Initialize rotary embedding master lookup table
   const size_t rotEmbDim = modelOptions.hidden_size / modelOptions.num_head;
@@ -37,25 +34,49 @@ void LlamaRuntime::Initialize(
       modelOptions.rot_emb_base);
   mRotEmbMasterLut->generate();

+  const bool useSharedWeights = !modelPaths.model_package_paths.empty();
+
+  ET_CHECK_MSG(
+      !useSharedWeights ||
+          modelPaths.prompt_model_paths.empty() &&
+              modelPaths.gen_model_paths.empty(),
+      "Both prompt and gen model paths should be empty when shared weights are used.");
+
+  const size_t numChunk = useSharedWeights
+      ? modelPaths.model_package_paths.size()
+      : modelPaths.gen_model_paths.size();
+  ET_CHECK_MSG(numChunk > 0, "No model to initialize");
+  const size_t numCache = 2 * modelOptions.num_layer / numChunk;
+
   constexpr size_t numRotEmbInputs = 1;
-  const bool usePromptModel = !modelPaths.prompt_model_paths.empty();
+  const bool usePromptModel = !modelPaths.prompt_model_paths.empty() ||
+      !modelPaths.model_package_paths.empty();
   const size_t initBatchSize =
       usePromptModel ? modelOptions.prompt_token_batch_size : 1;
   mTokenBatchSize = initBatchSize;

+  // Get effective prompt and gen model paths
+  const auto& [prompt_model_paths, gen_model_paths] = [&] {
+    if (useSharedWeights) {
+      return std::pair{
+          modelPaths.model_package_paths, modelPaths.model_package_paths};
+    }
+    return std::pair{modelPaths.prompt_model_paths, modelPaths.gen_model_paths};
+  }();
+
   for (size_t chunkIdx = 0; chunkIdx < numChunk; chunkIdx++) {
     ModelPathMap modelPathMap;
     auto addModelPath = [&](const auto& modelPaths, const size_t batchSize) {
       if (modelPaths.empty())
         return;
       modelPathMap[batchSize] = modelPaths[chunkIdx];
     };
-    addModelPath(
-        modelPaths.prompt_model_paths, modelOptions.prompt_token_batch_size);
-    addModelPath(modelPaths.gen_model_paths, 1);
+    addModelPath(prompt_model_paths, modelOptions.prompt_token_batch_size);
+    addModelPath(gen_model_paths, 1);
     auto llamaChunk = std::make_unique<LlamaModelChunk>(
         modelPathMap,
         modelOptions,
+        useSharedWeights,
         initBatchSize,
         numCache,
         numRotEmbInputs,
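Note: with shared weights, Initialize() treats each entry of model_package_paths as one chunk that supplies both the prompt and the gen method, so prompt_model_paths and gen_model_paths must stay empty. A hedged sketch of how a runner might fill LlamaModelPaths for this flow; the struct mirrors LlamaConfig.h and the file names are hypothetical, not produced by this commit.

// Sketch: populate model paths for the shared-weights flow.
#include <iostream>
#include <string>
#include <vector>

struct LlamaModelPaths {
  std::string token_embedding_path;
  std::vector<std::string> prompt_model_paths;
  std::vector<std::string> gen_model_paths;
  std::vector<std::string> model_package_paths;  // New in this commit.
};

LlamaModelPaths make_shared_weights_paths() {
  LlamaModelPaths paths;
  paths.token_embedding_path = "embedding.bin";
  // Each package bundles the prompt and gen methods for one chunk; Initialize()
  // then registers the same path for both batch sizes of that chunk.
  paths.model_package_paths = {"llama_chunk_0.pte", "llama_chunk_1.pte"};
  return paths;  // prompt_model_paths and gen_model_paths stay empty.
}

int main() {
  const auto paths = make_shared_weights_paths();
  std::cout << "chunks: " << paths.model_package_paths.size() << "\n";  // 2
}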
