From 0d556523328dd848441ffb140a577ba027ad5b9b Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 4 Sep 2024 14:34:59 -0700 Subject: [PATCH 01/22] Update Android build for MTK 1. MTK requires rtti and exceptions 2. MTK requires Android 26+ --- backends/mediatek/CMakeLists.txt | 1 + build/build_android_llm_demo.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index 4b233d94f04..60c08fe8757 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -25,6 +25,7 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include) # targets add_library(neuron_backend SHARED) +target_compile_options(neuron_backend PRIVATE "-frtti" "-fexceptions") target_link_libraries( neuron_backend PRIVATE executorch_no_prim_ops portable_ops_lib android log ${NEURON_BUFFER_ALLOCATOR_LIB} diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 7b7150de210..389c6d95172 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -23,7 +23,7 @@ build_android_native_library() { cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DANDROID_PLATFORM=android-23 \ + -DANDROID_PLATFORM=android-26 \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ -DEXECUTORCH_BUILD_XNNPACK=ON \ From 4ce04e8685687d847c99b558c18eb39a1a9b78c7 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 4 Sep 2024 14:44:57 -0700 Subject: [PATCH 02/22] Install neuron_backend in executorch-config --- build/executorch-config.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 4376c9e5e77..c40f214133a 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -41,6 +41,7 @@ set(lib_list ${FLATCCRT_LIB} coremldelegate mpsdelegate + neuron_backend qnn_executorch_backend portable_ops_lib extension_module From 902569c21b83322efe100ee4dde061d78ab26edd Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Thu, 29 Aug 2024 14:35:49 -0700 Subject: [PATCH 03/22] Create a MTK Runner to be able to run with a mobile app --- .../llm_helper/include/llama_runner_values.h | 40 +++ .../executor_runner/mtk_llama_runner.cpp | 326 ++++++++++++++++++ 2 files changed, 366 insertions(+) create mode 100644 examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h create mode 100644 examples/mediatek/executor_runner/mtk_llama_runner.cpp diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h new file mode 100644 index 00000000000..b117d44e1f2 --- /dev/null +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h @@ -0,0 +1,40 @@ +#pragma once + +namespace torch::executor { + using llm_helper::LLMType; + + // Sizes + const size_t PROMPT_TOKEN_BATCH_SIZE = 128; + const size_t CACHE_SIZE = 512; + const size_t HIDDEN_SIZE = 4096; + const size_t NUM_HEAD = 32; + const size_t NUM_LAYER = 32; + const size_t MAX_TOKEN_LENGTH = 8192; + const double ROT_EMB_BASE = 500000; + + // Types + const LLMType MODEL_INPUT_TYPE = LLMType::FP32; + const LLMType MODEL_OUTPUT_TYPE = LLMType::FP32; + const LLMType CACHE_TYPE = LLMType::FP32; + const LLMType 
MASK_TYPE = LLMType::FP32; + const LLMType ROT_EMB_TYPE = LLMType::FP32; + + // Paths + const std::string TOKENIZER_PATH="/data/local/tmp/llama3/tokenizer.model" + const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/llama3/embedding_llama3_8b_instruct_fp32.bin" + + // Comma-Separated Paths + const std::string PROMPT_MODEL_PATHS="\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_0.pte,\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_1.pte,\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_2.pte,\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_3.pte," + + // Comma-Separated Paths + const std::string GEN_MODEL_PATHS="\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_0.pte,\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_1.pte,\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_2.pte,\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_3.pte," + +} // namespace torch::executor diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp new file mode 100644 index 00000000000..c74ac65ce1c --- /dev/null +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -0,0 +1,326 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 MediaTek Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* Copyright Statement: + * + * This software/firmware and related documentation ("MediaTek Software") are + * protected under relevant copyright laws. The information contained herein + * is confidential and proprietary to MediaTek Inc. and/or its licensors. + * Without the prior written permission of MediaTek inc. and/or its licensors, + * any reproduction, modification, use or disclosure of MediaTek Software, + * and information contained herein, in whole or in part, shall be strictly + * prohibited. + */ +/* MediaTek Inc. (C) 2024. All rights reserved. + * + * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES + * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE") + * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON + * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. + * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE + * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR + * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH + * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY + * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY + * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK + * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO + * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN + * FORUM. 
RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND + * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER + * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT + * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER + * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE. + * + * The following software/firmware and/or related documentation ("MediaTek + * Software") have been modified by MediaTek Inc. All revisions are subject to + * any receiver's applicable license agreements with MediaTek Inc. + */ + +#include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h" +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llama_runner/LlamaConfig.h" +#include "llama_runner/LlamaRuntime.h" +#include "llama_runner/ModelChunk.h" +#include "llama_runner/Utils.h" +#include "llama_runner/llm_helper/include/llm_types.h" +#include "llama_runner/llm_helper/include/llama_runner_values.h" + +#include +#include + +// Global BOS and EOS option for tokenization (encoding) +static constexpr int8_t kAddBos = 1; +static constexpr int8_t kAddEos = 0; + +using namespace torch::executor; +using namespace torch::executor::llm_helper; +using torch::executor::utils::Timer; + +Runner::MTKLlamaRunner( + const std::string& model_path, + const std::string& tokenizer_path, + const float temperature) + : modeloptions_(get_model_options()), + modelpaths_(get_model_paths()) { + ET_LOG( + Info, + "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files."); +} + +Error Runner::load() { + if (is_loaded()) { + return Error::Ok; + } + + // Load tokenizer + ET_LOG(Info, "Loading tokenizer."); + tokenizer_ = load_tokenizer(); + + // Load prompt model + ET_LOG(Info, "Loading prompt model."); + runtime_->Initialize(modeloptions_, modelpaths_); +} + +bool Runner::is_loaded() const { + return tokenizer_ && runtime_; +} + +Error Runner::generate( + const std::string& prompt, + int32_t seq_len, + std::function token_callback, + std::function stats_callback) { + + if (!is_loaded()) { + ET_CHECK_OK_OR_RETURN_ERROR(load()); + } + + // Wrap the token_callback with print function + std::function wrapped_callback = + [token_callback](const std::string& piece) { + util::safe_printf(piece.c_str()); + fflush(stdout); + if (token_callback) { + token_callback(piece); + } + }; + + ET_LOG(Info, "Starting inference."); + inference(runtime_, tokenizer_, prompt, wrapped_callback); +} + +void Runner::stop() { + if (is_loaded()) { + runtime_->Release(); + } else { + ET_LOG(Error, "Llama Runtime is not loaded, cannot stop"); + } +} + +LlamaModelOptions get_model_options() { + LlamaModelOptions options = { + // Sizes + .prompt_token_batch_size = PROMPT_TOKEN_BATCH_SIZE, + .cache_size = CACHE_SIZE, + .hidden_size = HIDDEN_SIZE, + .num_head = NUM_HEAD, + .num_layer = NUM_LAYER, + .max_token_length = MAX_TOKEN_LENGTH, + .rot_emb_base = ROT_EMB_BASE, + + // Types + .model_input_type = MODEL_INPUT_TYPE, + .model_output_type = MODEL_OUTPUT_TYPE, + .cache_type = CACHE_TYPE, + .mask_type = MASK_TYPE, + .rot_emb_type = ROT_EMB_TYPE}; + return options; +} + +LlamaModelPaths get_model_paths() { + LlamaModelPaths model_paths = { + .tokenizer_path = TOKENIZER_PATH, + .token_embedding_path = TOKEN_EMBEDDING_PATH, + .prompt_model_paths = utils::split(PROMPT_MODEL_PATHS, ','), + .gen_model_paths = utils::split(GEN_MODEL_PATHS, ',')}; + 
return model_paths; +} + +Result digest_prompt( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const std::vector input_tokens) { + const auto input_token_count = input_tokens.size(); + const auto prompt_token_batch_size = llama_runtime.GetTokenBatchSize(); + size_t cur_token_index = 0; + + Timer timer_digest_prompt([=](const auto elapsed_sec) { + // Ideal prompt size is a multiple of prompt batch size + const size_t ideal_prompt_size = + std::ceil(float(input_token_count) / prompt_token_batch_size) * + prompt_token_batch_size; + ET_LOG( + Info, + "Done analyzing prompt in %f sec (%f tok/s)", + elapsed_sec, + (float)ideal_prompt_size / elapsed_sec); + }); + + auto getNextTokens = [&]() { + const size_t num_tok_remain = input_token_count - cur_token_index; + const size_t remainder = num_tok_remain % prompt_token_batch_size; + const size_t num_new_tokens = + remainder ? remainder : prompt_token_batch_size; + const auto start = cur_token_index; + const auto end = start + num_new_tokens; + return std::vector( + input_tokens.begin() + start, input_tokens.begin() + end); + }; + + void* logits; + timer_digest_prompt.Start(); + while (cur_token_index < input_token_count) { + const auto next_tokens = getNextTokens(); + ET_LOG( + Debug, + "Digest next tokens (size=%zu), 1st tok=%lu", + next_tokens.size(), + next_tokens[0]); + logits = llama_runtime.Run(next_tokens); + cur_token_index += next_tokens.size(); + } + timer_digest_prompt.End(); + + const auto vocab_size = tokenizer->vocab_size(); + const auto logits_type = llama_runtime.GetModelOptions().model_output_type; + const auto first_output_token = + utils::argmax(logits_type, logits, vocab_size); + return first_output_token; +} + +Error gen_response( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const uint64_t input_token, + std::function token_callback) { + Timer timer_model_swap( + [](const auto elapsed_sec) { ET_LOG(Info, "Model swapped."); }); + + // Swap to gen mode + timer_model_swap.Start(); + llama_runtime.SwapModel(1); + timer_model_swap.End(); + + size_t gen_tok_count = 0; + uint64_t prev_token = input_token; + uint64_t output_token = input_token; + + auto decode_res = tokenizer->decode(prev_token, output_token); + ET_CHECK_OR_RETURN_ERROR( + decode_res.ok(), + InvalidState, + "Tokenizer failed to decode first generated token: %lu", + output_token); + std::string full_response = std::move(decode_res.get()); + std::vector full_response_tokens = {input_token}; + + const auto vocab_size = tokenizer->vocab_size(); + const auto logits_type = llama_runtime.GetModelOptions().model_output_type; + + double gen_total_time_sec = 0; + Timer timer_gen_token( + [&](const auto elapsed_sec) { gen_total_time_sec += elapsed_sec; }); + + // Print first output token + token_callback(full_response); + + while (gen_tok_count++ < model_options_.max_response && + llama_runtime.GetTokenIndex() < model_options_.max_token_length) { + timer_gen_token.Start(); + void* logits = llama_runtime.Run({output_token}); + timer_gen_token.End(); + + prev_token = output_token; + output_token = utils::argmax(logits_type, logits, vocab_size); + full_response_tokens.push_back(output_token); + + // Stop when output is EOS + if (output_token == tokenizer->eos_tok()) { + token_callback(""); + break; + } + auto decode_res = tokenizer->decode(prev_token, output_token); + ET_CHECK_OR_RETURN_ERROR( + decode_res.ok(), + InvalidState, + "Tokenizer failed to decode generated token %lu", + output_token); + const std::string tok_str = 
std::move(decode_res.get()); + full_response += tok_str; + token_callback(tok_str); + } + + std::cout << "\n\n[Generated Tokens]\n" + << utils::to_string(full_response_tokens) << std::endl; + + ET_LOG( + Info, + "Token generation speed: %f tok/s", + gen_tok_count / gen_total_time_sec); + + return Error::Ok; +} + +Error inference( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const std::string& prompt + std::function token_callback) { + // Tokenize input prompt + auto encode_res = tokenizer->encode(prompt, kAddBos, kAddEos); + ET_CHECK_OR_RETURN_ERROR( + encode_res.ok(), InvalidState, "Tokenizer failed to encode prompt"); + const auto input_tokens = std::move(encode_res.get()); + + // Run prompt mode (pre-fill) + auto prefill_res = digest_prompt(llama_runtime, tokenizer, input_tokens); + ET_CHECK_OR_RETURN_ERROR( + prefill_res.ok(), InvalidState, "Failed to digest prompt"); + const auto first_output_token = prefill_res.get(); + + // run generation mode (decoding) + return gen_response(llama_runtime, tokenizer, first_output_token, token_callback); +} + +std::unique_ptr load_tokenizer() { + std::unique_ptr tokenizer; + // Assumes that tokenizer type is Tiktoken + tokenizer = std::make_unique(); + tokenizer->load(modelpaths_.tokenizer_path); + return tokenizer; +} From b21f1f605072d116aeee902b4bd318f2192d46b4 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 4 Sep 2024 14:53:32 -0700 Subject: [PATCH 04/22] TEST ONLY try to build mtk stuff --- build/build_android_llm_demo.sh | 6 ++++-- examples/mediatek/executor_runner/mtk_llama_runner.cpp | 4 ++-- extension/android/CMakeLists.txt | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 389c6d95172..66752092625 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -34,6 +34,8 @@ build_android_native_library() { -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_NEURON=ON \ + -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" @@ -47,7 +49,7 @@ build_android_native_library() { cmake examples/models/llama2 \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="$ANDROID_ABI" \ - -DANDROID_PLATFORM=android-23 \ + -DANDROID_PLATFORM=android-26 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ @@ -61,7 +63,7 @@ build_android_native_library() { cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DANDROID_PLATFORM=android-23 \ + -DANDROID_PLATFORM=android-26 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index c74ac65ce1c..81de2bb8415 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -45,7 +45,7 @@ */ #include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h" -#include +// #include #include #include @@ -59,7 +59,7 @@ #include #include #include -#include +// #include #include #include diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 
6827ae79040..23f5ac631bc 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -41,6 +41,7 @@ list( extension_runner_util extension_threadpool fbjni + neuron_backend ) if(TARGET optimized_native_cpu_ops_lib) @@ -100,7 +101,7 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) target_link_options_shared_lib(quantized_ops_lib) - set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp) + set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/mtk_llama_runner.cpp) add_library(executorch_llama_jni SHARED ${LLAMA_JNI_SRCS}) if(TARGET pthreadpool) target_compile_definitions(executorch_llama_jni PRIVATE ET_USE_THREADPOOL=1) From 72428f583d4b13acc65d223bc0b9b4f36a25e84a Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Thu, 5 Sep 2024 01:16:32 -0700 Subject: [PATCH 05/22] Fix Runner compilation errors --- backends/mediatek/runtime/include/NeuronLog.h | 2 +- .../llm_helper/include/llama_runner_values.h | 8 +-- .../executor_runner/mtk_llama_runner.cpp | 41 +++++------ .../executor_runner/mtk_llama_runner.h | 69 +++++++++++++++++++ 4 files changed, 93 insertions(+), 27 deletions(-) create mode 100644 examples/mediatek/executor_runner/mtk_llama_runner.h diff --git a/backends/mediatek/runtime/include/NeuronLog.h b/backends/mediatek/runtime/include/NeuronLog.h index ccf8b24870d..5367a91ac4e 100644 --- a/backends/mediatek/runtime/include/NeuronLog.h +++ b/backends/mediatek/runtime/include/NeuronLog.h @@ -8,7 +8,7 @@ #pragma once -#include +#include "api/NeuronAdapter.h" #include #include diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h index b117d44e1f2..aeff7254de1 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h @@ -20,21 +20,21 @@ namespace torch::executor { const LLMType ROT_EMB_TYPE = LLMType::FP32; // Paths - const std::string TOKENIZER_PATH="/data/local/tmp/llama3/tokenizer.model" - const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/llama3/embedding_llama3_8b_instruct_fp32.bin" + const std::string TOKENIZER_PATH="/data/local/tmp/llama3/tokenizer.model"; + const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/llama3/embedding_llama3_8b_instruct_fp32.bin"; // Comma-Separated Paths const std::string PROMPT_MODEL_PATHS="\ /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_0.pte,\ /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_1.pte,\ /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_2.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_3.pte," + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_3.pte,"; // Comma-Separated Paths const std::string GEN_MODEL_PATHS="\ /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_0.pte,\ /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_1.pte,\ /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_2.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_3.pte," + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_3.pte,"; } // namespace torch::executor diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp 
b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index 81de2bb8415..c1c94642ab9 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -45,7 +45,7 @@ */ #include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h" -// #include +#include #include #include @@ -63,16 +63,12 @@ #include #include -#include "llama_runner/LlamaConfig.h" -#include "llama_runner/LlamaRuntime.h" #include "llama_runner/ModelChunk.h" #include "llama_runner/Utils.h" #include "llama_runner/llm_helper/include/llm_types.h" #include "llama_runner/llm_helper/include/llama_runner_values.h" -#include -#include - +static uint64_t MAX_RESPONSE = 50; // Maximum number of tokens to generate. // Global BOS and EOS option for tokenization (encoding) static constexpr int8_t kAddBos = 1; static constexpr int8_t kAddEos = 0; @@ -81,7 +77,7 @@ using namespace torch::executor; using namespace torch::executor::llm_helper; using torch::executor::utils::Timer; -Runner::MTKLlamaRunner( +MTKLlamaRunner::MTKLlamaRunner( const std::string& model_path, const std::string& tokenizer_path, const float temperature) @@ -92,7 +88,7 @@ Runner::MTKLlamaRunner( "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files."); } -Error Runner::load() { +Error MTKLlamaRunner::load() { if (is_loaded()) { return Error::Ok; } @@ -102,15 +98,16 @@ Error Runner::load() { tokenizer_ = load_tokenizer(); // Load prompt model + runtime_ = std::make_unique(); ET_LOG(Info, "Loading prompt model."); runtime_->Initialize(modeloptions_, modelpaths_); } -bool Runner::is_loaded() const { +bool MTKLlamaRunner::is_loaded() const { return tokenizer_ && runtime_; } -Error Runner::generate( +Error MTKLlamaRunner::generate( const std::string& prompt, int32_t seq_len, std::function token_callback, @@ -131,10 +128,10 @@ Error Runner::generate( }; ET_LOG(Info, "Starting inference."); - inference(runtime_, tokenizer_, prompt, wrapped_callback); + inference(*runtime_.get(), tokenizer_, prompt, wrapped_callback); } -void Runner::stop() { +void MTKLlamaRunner::stop() { if (is_loaded()) { runtime_->Release(); } else { @@ -142,7 +139,7 @@ void Runner::stop() { } } -LlamaModelOptions get_model_options() { +LlamaModelOptions MTKLlamaRunner::get_model_options() { LlamaModelOptions options = { // Sizes .prompt_token_batch_size = PROMPT_TOKEN_BATCH_SIZE, @@ -162,7 +159,7 @@ LlamaModelOptions get_model_options() { return options; } -LlamaModelPaths get_model_paths() { +LlamaModelPaths MTKLlamaRunner::get_model_paths() { LlamaModelPaths model_paths = { .tokenizer_path = TOKENIZER_PATH, .token_embedding_path = TOKEN_EMBEDDING_PATH, @@ -171,7 +168,7 @@ LlamaModelPaths get_model_paths() { return model_paths; } -Result digest_prompt( +Result MTKLlamaRunner::digest_prompt( LlamaRuntime& llama_runtime, const std::unique_ptr& tokenizer, const std::vector input_tokens) { @@ -223,7 +220,7 @@ Result digest_prompt( return first_output_token; } -Error gen_response( +Error MTKLlamaRunner::gen_response( LlamaRuntime& llama_runtime, const std::unique_ptr& tokenizer, const uint64_t input_token, @@ -259,8 +256,8 @@ Error gen_response( // Print first output token token_callback(full_response); - while (gen_tok_count++ < model_options_.max_response && - llama_runtime.GetTokenIndex() < model_options_.max_token_length) { + while (gen_tok_count++ < MAX_RESPONSE && + llama_runtime.GetTokenIndex() < modeloptions_.max_token_length) { timer_gen_token.Start(); void* logits = 
llama_runtime.Run({output_token}); timer_gen_token.End(); @@ -296,10 +293,10 @@ Error gen_response( return Error::Ok; } -Error inference( +Error MTKLlamaRunner::inference( LlamaRuntime& llama_runtime, const std::unique_ptr& tokenizer, - const std::string& prompt + const std::string& prompt, std::function token_callback) { // Tokenize input prompt auto encode_res = tokenizer->encode(prompt, kAddBos, kAddEos); @@ -317,10 +314,10 @@ Error inference( return gen_response(llama_runtime, tokenizer, first_output_token, token_callback); } -std::unique_ptr load_tokenizer() { +std::unique_ptr MTKLlamaRunner::load_tokenizer() { std::unique_ptr tokenizer; // Assumes that tokenizer type is Tiktoken - tokenizer = std::make_unique(); + tokenizer = torch::executor::get_tiktoken_for_llama(); tokenizer->load(modelpaths_.tokenizer_path); return tokenizer; } diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h new file mode 100644 index 00000000000..d9f85c20257 --- /dev/null +++ b/examples/mediatek/executor_runner/mtk_llama_runner.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// A simple llama2 runner that includes preprocessing and post processing logic. +// The module takes in a string as input and emits a string as output. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llama_runner/LlamaConfig.h" +#include "llama_runner/LlamaRuntime.h" +using namespace torch::executor; +using Stats = ::executorch::llm::Stats; + +class MTKLlamaRunner { + public: + explicit MTKLlamaRunner( + const std::string& model_path, + const std::string& tokenizer_path, + const float temperature = 0.8f); + + bool is_loaded() const; + Error load(); + Error generate( + const std::string& prompt, + int32_t seq_len = 128, + std::function token_callback = {}, + std::function stats_callback = {}); + void stop(); + + LlamaModelOptions get_model_options(); + LlamaModelPaths get_model_paths(); + Result digest_prompt( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const std::vector input_tokens); + Error gen_response( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const uint64_t input_token, + std::function token_callback); + Error inference( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const std::string& prompt, + std::function token_callback); + std::unique_ptr load_tokenizer(); + + + private: + // model + const torch::executor::LlamaModelOptions modeloptions_; + const torch::executor::LlamaModelPaths modelpaths_; + std::unique_ptr tokenizer_; + std::unique_ptr runtime_; +}; From 626858ee25fb678e34ae161a43f20032e4b49a45 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Thu, 5 Sep 2024 10:56:17 -0700 Subject: [PATCH 06/22] Debug in progress --- build/build_android_llm_demo.sh | 2 ++ extension/android/CMakeLists.txt | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 66752092625..8ca9680d4dd 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -67,6 +67,8 @@ build_android_native_library() { -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ + 
-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ + -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 23f5ac631bc..128c17cf71a 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -101,7 +101,17 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) target_link_options_shared_lib(quantized_ops_lib) - set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/mtk_llama_runner.cpp) + set( + LLAMA_JNI_SRCS jni/jni_layer_llama.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/mtk_llama_runner.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/mask_builder.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/rotary_embedding.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/token_embedding.cpp + ) add_library(executorch_llama_jni SHARED ${LLAMA_JNI_SRCS}) if(TARGET pthreadpool) target_compile_definitions(executorch_llama_jni PRIVATE ET_USE_THREADPOOL=1) @@ -118,6 +128,8 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) endif() target_include_directories( executorch_llama_jni PRIVATE ${_common_include_directories} + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/ + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner ) target_link_libraries( executorch_llama_jni @@ -129,6 +141,8 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) eigen_blas quantized_kernels quantized_ops_lib + ${NEURON_BUFFER_ALLOCATOR_LIB} + ) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) # link re2 From 7242dece1947ed8ae2da6d34ccd4d1acc502a24e Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Thu, 5 Sep 2024 11:02:55 -0700 Subject: [PATCH 07/22] find lib --- extension/android/CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 128c17cf71a..891f051d72e 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -141,9 +141,13 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) eigen_blas quantized_kernels quantized_ops_lib - ${NEURON_BUFFER_ALLOCATOR_LIB} - ) + ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED) + FIND_LIBRARY(libneuron_buffer_allocator_LIBRARY libneuron_buffer_allocator PATHS /home/hsz/e3/executorch/libneuron_buffer_allocator.so) + SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION "${libneuron_buffer_allocator_LIBRARY}") + SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_IMPLIB "${libneuron_buffer_allocator_LIBRARY}") + + target_link_libraries(executorch_llama_jni libneuron_buffer_allocator) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) # link re2 set(ABSL_ENABLE_INSTALL ON) From 605c0dde2a67450974d69bfad308ceed33fea403 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Tue, 10 Sep 2024 23:01:43 -0700 Subject: [PATCH 08/22] aar generates but with libneuron_backend.so error 
--- build/build_android_llm_demo.sh | 2 +- extension/android/CMakeLists.txt | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 8ca9680d4dd..8a9be879e88 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -124,7 +124,7 @@ collect_artifacts_to_be_uploaded() { BUILD_AAR_DIR="$(mktemp -d)" export BUILD_AAR_DIR -ANDROID_ABIS=("arm64-v8a" "x86_64") +ANDROID_ABIS=("arm64-v8a") export ANDROID_ABIS ARTIFACTS_DIR_NAME="$1" diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 891f051d72e..cfd747a7228 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -143,9 +143,7 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) quantized_ops_lib ) ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED) - FIND_LIBRARY(libneuron_buffer_allocator_LIBRARY libneuron_buffer_allocator PATHS /home/hsz/e3/executorch/libneuron_buffer_allocator.so) - SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION "${libneuron_buffer_allocator_LIBRARY}") - SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_IMPLIB "${libneuron_buffer_allocator_LIBRARY}") + SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so) target_link_libraries(executorch_llama_jni libneuron_buffer_allocator) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) From 0c2d0418ee68aa809eea36a471ce3a3daa096467 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Tue, 10 Sep 2024 23:03:58 -0700 Subject: [PATCH 09/22] trying to fix libneuron_backend.so error --- extension/android/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index cfd747a7228..0aec0774ecf 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -145,7 +145,10 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED) SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so) - target_link_libraries(executorch_llama_jni libneuron_buffer_allocator) + ADD_LIBRARY(libneuron_backend SHARED IMPORTED) + SET_PROPERTY(TARGET libneuron_backend PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/cmake-out-android-arm64-v8a/backends/mediatek/libneuron_backend.so) + + target_link_libraries(executorch_llama_jni libneuron_buffer_allocator libneuron_backend) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) # link re2 set(ABSL_ENABLE_INSTALL ON) From fda0f6287e08b22ad971fba40514b552da246c42 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Wed, 11 Sep 2024 17:03:28 -0700 Subject: [PATCH 10/22] includes libneuron_backend.so error but issue with neuron_backend --- build/build_android_llm_demo.sh | 5 ++++- extension/android/CMakeLists.txt | 5 +---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 8a9be879e88..7b367366dac 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -78,6 +78,9 @@ build_android_native_library() { # Copy artifacts to ABI specific directory 
mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + + cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ + cp /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ } build_aar() { @@ -91,7 +94,7 @@ build_aar() { find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file zip -r executorch.aar libs jni/*/libexecutorch.so AndroidManifest.xml - zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so AndroidManifest.xml + zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so AndroidManifest.xml popd } diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 0aec0774ecf..cfd747a7228 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -145,10 +145,7 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED) SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so) - ADD_LIBRARY(libneuron_backend SHARED IMPORTED) - SET_PROPERTY(TARGET libneuron_backend PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/cmake-out-android-arm64-v8a/backends/mediatek/libneuron_backend.so) - - target_link_libraries(executorch_llama_jni libneuron_buffer_allocator libneuron_backend) + target_link_libraries(executorch_llama_jni libneuron_buffer_allocator) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) # link re2 set(ABSL_ENABLE_INSTALL ON) From 8cab38e059f4942ea823337b2d9152a5016d7578 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Wed, 11 Sep 2024 18:27:14 -0700 Subject: [PATCH 11/22] resolves neuron_backend issue but now issue on .so or properly linking mtk runner --- backends/mediatek/CMakeLists.txt | 2 +- build/build_android_llm_demo.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index 60c08fe8757..289944fbeba 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -27,7 +27,7 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include) add_library(neuron_backend SHARED) target_compile_options(neuron_backend PRIVATE "-frtti" "-fexceptions") target_link_libraries( - neuron_backend PRIVATE executorch_no_prim_ops portable_ops_lib android log + neuron_backend PRIVATE executorch_no_prim_ops android log ${NEURON_BUFFER_ALLOCATOR_LIB} ) target_sources( diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 7b367366dac..b3fe6d4cb1c 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -81,6 +81,7 @@ build_android_native_library() { cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ cp /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ + cp /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuronusdk_adapter.mtk.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ } build_aar() { @@ -94,7 +95,7 @@ build_aar() { find jni -type f -name "libexecutorch_jni.so" 
-exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file zip -r executorch.aar libs jni/*/libexecutorch.so AndroidManifest.xml - zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so AndroidManifest.xml + zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so jni/*/libneuronusdk_adapter.mtk.so AndroidManifest.xml popd } From e5166ac1635cddd952315eb15a75eac5ab1bbc7f Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 11 Sep 2024 22:30:47 -0700 Subject: [PATCH 12/22] hack route to mtk runner --- extension/android/jni/jni_layer_llama.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index e6a9b5de58c..4b1284a7842 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -68,6 +69,7 @@ class ExecuTorchLlamaJni int model_type_category_; std::unique_ptr runner_; std::unique_ptr multi_modal_runner_; + std::unique_ptr mtk_llama_runner_; public: constexpr static auto kJavaDescriptor = @@ -75,6 +77,7 @@ class ExecuTorchLlamaJni constexpr static int MODEL_TYPE_CATEGORY_LLM = 1; constexpr static int MODEL_TYPE_CATEGORY_MULTIMODAL = 2; + constexpr static int MODEL_TYPE_MEDIATEK_LLAMA = 3; static facebook::jni::local_ref initHybrid( facebook::jni::alias_ref, @@ -113,7 +116,11 @@ class ExecuTorchLlamaJni model_path->toStdString().c_str(), tokenizer_path->toStdString().c_str(), temperature); - } + } else if (model_type_category == MODEL_TYPE_MEDIATEK_LLAMA) { + mtk_llama_runner_ = std::make_unique( + model_path->toStdString().c_str(), + tokenizer_path->toStdString().c_str(), + temperature); } jint generate( @@ -152,6 +159,13 @@ class ExecuTorchLlamaJni [callback](std::string result) { callback->onResult(result); }, [callback](const Stats& result) { callback->onStats(result); }, echo); + } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) { + mtk_llama_runner_->generate( + prompt->toStdString(), + seq_len, + [callback](std::string result) { callback->onResult(result); }, + [callback](const Stats& result) { callback->onStats(result); }, + echo); } return 0; } @@ -243,6 +257,8 @@ class ExecuTorchLlamaJni multi_modal_runner_->stop(); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { runner_->stop(); + } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) { + mtk_llama_runner_->stop(); } } @@ -251,6 +267,8 @@ class ExecuTorchLlamaJni return static_cast(multi_modal_runner_->load()); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { return static_cast(runner_->load()); + } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) { + return static_cast(mtk_llama_runner_->load()); } return static_cast(Error::InvalidArgument); } From 22d9b110943765e7e025980476947accae48ec0a Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 11 Sep 2024 22:31:55 -0700 Subject: [PATCH 13/22] Make 1 until cmodi changes java side lol --- extension/android/jni/jni_layer_llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 4b1284a7842..6204eca3ceb 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp 
@@ -77,7 +77,7 @@ class ExecuTorchLlamaJni constexpr static int MODEL_TYPE_CATEGORY_LLM = 1; constexpr static int MODEL_TYPE_CATEGORY_MULTIMODAL = 2; - constexpr static int MODEL_TYPE_MEDIATEK_LLAMA = 3; + constexpr static int MODEL_TYPE_MEDIATEK_LLAMA = 1 /* 3 */; static facebook::jni::local_ref initHybrid( facebook::jni::alias_ref, From 6107dc745336b0a07082d90ec9b57d779bcad8ba Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Wed, 11 Sep 2024 23:08:06 -0700 Subject: [PATCH 14/22] compilation fixes --- extension/android/jni/jni_layer_llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 6204eca3ceb..b33139a2743 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -121,6 +121,7 @@ class ExecuTorchLlamaJni model_path->toStdString().c_str(), tokenizer_path->toStdString().c_str(), temperature); + } } jint generate( @@ -164,8 +165,7 @@ class ExecuTorchLlamaJni prompt->toStdString(), seq_len, [callback](std::string result) { callback->onResult(result); }, - [callback](const Stats& result) { callback->onStats(result); }, - echo); + [callback](const Stats& result) { callback->onStats(result); }); } return 0; } From 1aa2ccdd44038d81d854344f732522a4f21d6c3e Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Thu, 12 Sep 2024 12:07:43 -0700 Subject: [PATCH 15/22] resolved no op issue and bug in modelType. Issue with loading models --- backends/mediatek/CMakeLists.txt | 1 - .../llm_helper/include/llama_runner_values.h | 16 ++++------------ .../executor_runner/mtk_llama_runner.cpp | 4 ++++ extension/android/jni/jni_layer_llama.cpp | 2 +- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index a444a6ff8ab..9a9c82d90a9 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -29,7 +29,6 @@ target_compile_options(neuron_backend PRIVATE "-frtti" "-fexceptions") target_link_libraries(neuron_backend PRIVATE executorch_no_prim_ops - portable_ops_lib android log ${NEURON_BUFFER_ALLOCATOR_LIB} diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h index aeff7254de1..1f388b2acdc 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h @@ -20,21 +20,13 @@ namespace torch::executor { const LLMType ROT_EMB_TYPE = LLMType::FP32; // Paths - const std::string TOKENIZER_PATH="/data/local/tmp/llama3/tokenizer.model"; - const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/llama3/embedding_llama3_8b_instruct_fp32.bin"; + const std::string TOKENIZER_PATH="/data/local/tmp/et-mtk/llama3/tokenizer.model"; + const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/et-mtk/llama3/embedding_llama3_8b_instruct_fp32.bin"; // Comma-Separated Paths - const std::string PROMPT_MODEL_PATHS="\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_0.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_1.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_2.pte,\ - 
/data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_3.pte,"; + const std::string PROMPT_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_3.pte,"; // Comma-Separated Paths - const std::string GEN_MODEL_PATHS="\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_0.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_1.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_2.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_3.pte,"; + const std::string GEN_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,"; } // namespace torch::executor diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index c1c94642ab9..242dc18101d 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -96,11 +96,13 @@ Error MTKLlamaRunner::load() { // Load tokenizer ET_LOG(Info, "Loading tokenizer."); tokenizer_ = load_tokenizer(); + ET_LOG(Info, "Complete loading tokenizer."); // Load prompt model runtime_ = std::make_unique(); ET_LOG(Info, "Loading prompt model."); runtime_->Initialize(modeloptions_, modelpaths_); + ET_LOG(Info, "Complete loading prompt model."); } bool MTKLlamaRunner::is_loaded() const { @@ -156,6 +158,7 @@ LlamaModelOptions MTKLlamaRunner::get_model_options() { .cache_type = CACHE_TYPE, .mask_type = MASK_TYPE, .rot_emb_type = ROT_EMB_TYPE}; + ET_LOG(Info, "Completed get_model_options"); return options; } @@ -165,6 +168,7 @@ LlamaModelPaths MTKLlamaRunner::get_model_paths() { .token_embedding_path = TOKEN_EMBEDDING_PATH, .prompt_model_paths = utils::split(PROMPT_MODEL_PATHS, ','), .gen_model_paths = utils::split(GEN_MODEL_PATHS, ',')}; + ET_LOG(Info, "Completed get_model_paths"); return model_paths; } diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index b33139a2743..50476df5690 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -75,7 +75,7 @@ class ExecuTorchLlamaJni constexpr static auto kJavaDescriptor = "Lorg/pytorch/executorch/LlamaModule;"; - constexpr static int MODEL_TYPE_CATEGORY_LLM = 1; + constexpr static int MODEL_TYPE_CATEGORY_LLM = 3 /* should be put back to 1*/; constexpr static int MODEL_TYPE_CATEGORY_MULTIMODAL = 2; constexpr static int MODEL_TYPE_MEDIATEK_LLAMA = 1 /* 3 */; From 6fed4cbe47eeef0e0e0f0d7d5ab68fba55ddaff3 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Thu, 12 Sep 2024 12:34:14 -0700 Subject: [PATCH 16/22] *MILESTONE* debugging logs for model loading in aar --- .../llama_runner/LlamaRuntime.cpp | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp 
index 0d2d5ccd59c..88ac44fc41d 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp +++ b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp @@ -28,14 +28,21 @@ void LlamaRuntime::Initialize( const size_t numCache = 2 * modelOptions.num_layer / numChunk; ET_CHECK_MSG(numChunk > 0, "No model to initialize"); + ET_LOG(Info, "cmodiii 1"); + ET_LOG(Info, "cmodiii numChunk = %zu", numChunk); + ET_LOG(Info, "cmodiii numCache = %zu", numCache); + // Initialize rotary embedding master lookup table const size_t rotEmbDim = modelOptions.hidden_size / modelOptions.num_head; + ET_LOG(Info, "cmodiii 2"); mRotEmbMasterLut = std::make_unique( modelOptions.rot_emb_type, modelOptions.max_token_length, rotEmbDim, modelOptions.rot_emb_base); + ET_LOG(Info, "cmodiii 3"); mRotEmbMasterLut->generate(); + ET_LOG(Info, "cmodiii 4"); constexpr size_t numRotEmbInputs = 1; const bool usePromptModel = !modelPaths.prompt_model_paths.empty(); @@ -50,8 +57,10 @@ void LlamaRuntime::Initialize( return; modelPathMap[batchSize] = modelPaths[chunkIdx]; }; + ET_LOG(Info, "cmodiii 5"); addModelPath( modelPaths.prompt_model_paths, modelOptions.prompt_token_batch_size); + ET_LOG(Info, "cmodiii 6"); addModelPath(modelPaths.gen_model_paths, 1); auto llamaChunk = std::make_unique( modelPathMap, @@ -60,18 +69,25 @@ void LlamaRuntime::Initialize( numCache, numRotEmbInputs, mRotEmbMasterLut.get()); + ET_LOG(Info, "cmodiii 7"); mLlamaModelChunks.push_back(std::move(llamaChunk)); + ET_LOG(Info, "cmodiii 8"); } for (size_t i = 0; i < numChunk; i++) { auto& modelChunk = mLlamaModelChunks[i]; + ET_LOG(Info, "cmodiii 9"); if (i > 0) { const auto& prevModelChunk = mLlamaModelChunks[i - 1]; + ET_LOG(Info, "cmodiii 9A"); modelChunk->SetInputBuffer(prevModelChunk->GetOutputBuffer()); + ET_LOG(Info, "cmodiii 10"); } modelChunk->Initialize(); + ET_LOG(Info, "cmodiii 11"); // modelChunk->LogIoSummary(); } + ET_LOG(Info, "cmodiii 12"); // NOTE: Token embedding type here is assumed to follow the model input // embedding type. 
@@ -80,9 +96,13 @@ void LlamaRuntime::Initialize( modelOptions.model_input_type, modelOptions.hidden_size); + ET_LOG(Info, "cmodiii 13"); + // Link first chunk emb input to token emb lut output const auto& tokenEmbInput = mLlamaModelChunks.front()->GetInputBuffer(); + ET_LOG(Info, "cmodiii 14"); mTokenEmbLut->setOutput(tokenEmbInput.data, tokenEmbInput.nbytes); + ET_LOG(Info, "cmodiii 15"); } void LlamaRuntime::Release() { @@ -201,4 +221,4 @@ const LlamaModelOptions& LlamaRuntime::GetModelOptions() const { return mModelOptions; } -} // namespace torch::executor \ No newline at end of file +} // namespace torch::executor From 8a2d9c14f8209670796e666d3bad57af8836b88c Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Fri, 13 Sep 2024 10:08:34 -0700 Subject: [PATCH 17/22] debugging with moving some to shell --- .../llama_runner/LlamaRuntime.cpp | 18 +++++++++++++ .../llama_runner/ModelChunk.cpp | 6 +++++ .../mtk_llama_executor_runner.cpp | 26 +++++++++++-------- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp index 88ac44fc41d..8a12ce90ecb 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp +++ b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp @@ -70,12 +70,30 @@ void LlamaRuntime::Initialize( numRotEmbInputs, mRotEmbMasterLut.get()); ET_LOG(Info, "cmodiii 7"); + if(llamaChunk.get() == nullptr) { + ET_LOG(Info, "cmodiii llamaChunk is null"); + } else { + ET_LOG(Info, "cmodiii llamaChunk is not null"); + } + mLlamaModelChunks.push_back(std::move(llamaChunk)); + + if(mLlamaModelChunks.empty()) { + ET_LOG(Info, "cmodiii mLlamaModelChunks is empty"); + } else { + ET_LOG(Info, "cmodiii mLlamaModelChunks is not empty"); + } + ET_LOG(Info, "cmodiii 8"); } for (size_t i = 0; i < numChunk; i++) { auto& modelChunk = mLlamaModelChunks[i]; + if(modelChunk.get() == nullptr) { + ET_LOG(Info, "cmodiii modelChunk is null"); + } else { + ET_LOG(Info, "cmodiii modelChunk is not null"); + } ET_LOG(Info, "cmodiii 9"); if (i > 0) { const auto& prevModelChunk = mLlamaModelChunks[i - 1]; diff --git a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp index b09e2c58767..a7e858dfa93 100644 --- a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp +++ b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp @@ -46,11 +46,17 @@ struct ModelInstance { }; void ModelChunk::Initialize() { + ET_LOG(Info, "cmodiii in ModuleChunk::Initialize"); LoadModels(); + ET_LOG(Info, "cmodiii after LoadModels"); GetModelIoInfo(); + ET_LOG(Info, "cmodiii after GetModelIoInfo"); AllocateIoBuffers(); + ET_LOG(Info, "cmodiii after AllocateIoBuffers"); SetBackendInputs(); + ET_LOG(Info, "cmodiii after SetBackendInputs"); SetBackendOutputs(); + ET_LOG(Info, "cmodiii after SetBackendOutputs"); mIsInitialized = true; } diff --git a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp index 370695cb773..1193e2b1830 100644 --- a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp @@ -147,11 +147,11 @@ LlamaModelOptions get_model_options() { .rot_emb_base = FLAGS_rot_emb_base, // Types - .model_input_type = getLLMTypeFromName(FLAGS_input_type.c_str()), - .model_output_type = 
getLLMTypeFromName(FLAGS_output_type.c_str()), - .cache_type = getLLMTypeFromName(FLAGS_cache_type.c_str()), - .mask_type = getLLMTypeFromName(FLAGS_mask_type.c_str()), - .rot_emb_type = getLLMTypeFromName(FLAGS_rot_emb_type.c_str())}; + .model_input_type = LLMType::FP32, + .model_output_type = LLMType::FP32, + .cache_type = LLMType::FP32, + .mask_type = LLMType::FP32, + .rot_emb_type = LLMType::FP32}; return options; } @@ -159,8 +159,8 @@ LlamaModelPaths get_model_paths() { LlamaModelPaths model_paths = { .tokenizer_path = FLAGS_tokenizer_path, .token_embedding_path = FLAGS_token_embedding_path, - .prompt_model_paths = utils::split(FLAGS_prompt_model_paths, ','), - .gen_model_paths = utils::split(FLAGS_gen_model_paths, ',')}; + .prompt_model_paths = utils::split("/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_3.pte,", ','), + .gen_model_paths = utils::split("/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,", ',')}; return model_paths; } @@ -353,23 +353,27 @@ int main(int argc, char** argv) { Timer timer_release( [](const auto elapsed_sec) { ET_LOG(Info, "Model released."); }); - LlamaRuntime llama_runtime; + //LlamaRuntime llama_runtime; + std::unique_ptr llama_runtime = std::make_unique(); // Initialize model ET_LOG(Info, "Begin model loading."); timer_init.Start(); const auto tokenizer = load_tokenizer(); - llama_runtime.Initialize(model_options, model_paths); + //llama_runtime.Initialize(model_options, model_paths); + llama_runtime->Initialize(model_options, model_paths); timer_init.End(); // Run model ET_CHECK_MSG(!FLAGS_prompt_file.empty(), "No prompt file provided."); std::string prompt = utils::read_file(FLAGS_prompt_file); - inference(llama_runtime, tokenizer, prompt); + //inference(llama_runtime, tokenizer, prompt); + inference(*llama_runtime.get(), tokenizer, prompt); // Release model timer_release.Start(); - llama_runtime.Release(); + //llama_runtime.Release(); + llama_runtime->Release(); timer_release.End(); return 0; From 72aa142cc2ed6fc69228f58ad1e390e331735952 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:45:36 -0700 Subject: [PATCH 18/22] many debug prints --- .../llama_runner/LlamaModelChunk.cpp | 10 ++++++++++ .../llama_runner/MultiModelLoader.cpp | 16 +++++++++++++++- .../executor_runner/mtk_llama_runner.cpp | 3 ++- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp index c2d75fd30ec..1757c63fe21 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp +++ b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp @@ -73,16 +73,26 @@ size_t LlamaModelChunk::GetExpectedOutputCount() const { } void LlamaModelChunk::Initialize() { + ET_LOG(Info, "cmodiii in LlamaModelChunk::Initialize"); LoadModels(); + ET_LOG(Info, "cmodiii after LoadModels"); GetModelIoInfo(); + ET_LOG(Info, "cmodiii after GetModelIoInfo"); 
CheckIoCount(); + ET_LOG(Info, "cmodiii after CheckIoCount"); PrepareCacheIOs(); + ET_LOG(Info, "cmodiii after PrepareCacheIOs"); AllocateIoBuffers(); + ET_LOG(Info, "cmodiii after AllocateIoBuffers"); InitMaskBuilder(); + ET_LOG(Info, "cmodiii after InitMaskBuilder"); InitCache(); + ET_LOG(Info, "cmodiii after InitCache"); SetBackendInputs(); + ET_LOG(Info, "cmodiii after SetBackendInputs"); SetBackendOutputs(); + ET_LOG(Info, "cmodiii after SetBackendOutputs"); mIsInitialized = true; } diff --git a/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp b/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp index e0479110a7c..e20eac3b248 100644 --- a/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp +++ b/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp @@ -21,20 +21,28 @@ namespace torch::executor { template void MultiModelLoader::LoadModels() { // Init empty model instance map + ET_LOG(Info, "cmodi LoadModels() 1"); for (const auto& [id, _] : mModelPathMap) { ET_CHECK_MSG( !HasModel(id), "Model is already initialized before calling LoadModels."); + ET_LOG(Info, "cmodi LoadModels() 2"); mModelInstanceMap[id] = nullptr; } const size_t numModels = mModelPathMap.size(); + ET_LOG(Info, "cmodi LoadModels() 3"); if (!AllowModelsCoexist()) { + ET_LOG(Info, "cmodi LoadModels() 4"); SelectModel(mDefaultModelId); + ET_LOG(Info, "cmodi LoadModels() 5"); ET_CHECK_MSG( GetModelInstance() == nullptr, "Model is already initialized before calling LoadModels."); + ET_LOG(Info, "cmodi LoadModels() 6"); void* instance = CreateModelInstance(mModelPathMap[mDefaultModelId]); + ET_LOG(Info, "cmodi LoadModels() 7"); SetModelInstance(instance); + ET_LOG(Info, "cmodi LoadModels() 8"); ET_LOG( Debug, "LoadModels(): Loaded single exclusive model (Total=%zu)", @@ -42,14 +50,20 @@ void MultiModelLoader::LoadModels() { return; } for (const auto& [id, modelPath] : mModelPathMap) { + ET_LOG(Info, "cmodi LoadModels() 9"); SelectModel(id); + ET_LOG(Info, "cmodi LoadModels() 10"); ET_CHECK_MSG( GetModelInstance() == nullptr, "Model is already initialized before calling LoadModels."); + ET_LOG(Info, "cmodi LoadModels() 11"); void* instance = CreateModelInstance(modelPath); + ET_LOG(Info, "cmodi LoadModels() 12"); SetModelInstance(instance); + ET_LOG(Info, "cmodi LoadModels() 13"); } SelectModel(mDefaultModelId); // Select the default instance + ET_LOG(Info, "cmodi LoadModels() 14"); ET_LOG(Debug, "LoadModels(): Loaded multiple models (Total=%zu)", numModels); } @@ -174,4 +188,4 @@ std::string MultiModelLoader::GetIdString(const IdType& id) { template class MultiModelLoader; template class MultiModelLoader; -} // namespace torch::executor \ No newline at end of file +} // namespace torch::executor diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index 242dc18101d..72296de45ac 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -83,9 +83,10 @@ MTKLlamaRunner::MTKLlamaRunner( const float temperature) : modeloptions_(get_model_options()), modelpaths_(get_model_paths()) { + runtime_init(); ET_LOG( Info, - "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files."); + "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files. 
Initiated runtime_init()."); } Error MTKLlamaRunner::load() { From 91e4d6a843d0f52f874df66e6cda049c96e113a4 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Mon, 16 Sep 2024 20:40:04 -0700 Subject: [PATCH 19/22] logs and app changes for debug --- build/build_android_llm_demo.sh | 8 ++++---- .../executorchllamademo/MainActivity.java | 20 ++++++++++--------- .../executorchllamademo/SettingsActivity.java | 6 ++++-- .../llama_runner/ModelChunk.cpp | 17 ++++++++++++++++ 4 files changed, 36 insertions(+), 15 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 8047c947067..d8de4cfd94e 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -46,7 +46,7 @@ build_android_native_library() { -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \ -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -B"${CMAKE_OUT}" if [ "$(uname)" == "Darwin" ]; then @@ -54,7 +54,7 @@ build_android_native_library() { else CMAKE_JOBS=$(( $(nproc) - 1 )) fi - cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release + cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config RelWithDebInfo cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ @@ -67,10 +67,10 @@ build_android_native_library() { -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -B"${CMAKE_OUT}"/extension/android - cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Release + cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config RelWithDebInfo # Copy artifacts to ABI specific directory mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index f5e50845eca..a08bf87a079 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -156,11 +156,11 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera + " sec." 
+ " You can send text or image for inference"; - if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + /*if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { ETLogging.getInstance().log("Llava start prefill prompt"); startPos = mModule.prefillPrompt(PromptFormat.getLlavaPresetPrompt(), 0, 1, 0); ETLogging.getInstance().log("Llava completes prefill prompt"); - } + }*/ } Message modelLoadedMessage = new Message(modelInfo, false, MessageType.SYSTEM, 0); @@ -226,6 +226,9 @@ protected void onCreate(Bundle savedInstanceState) { try { Os.setenv("ADSP_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); + Os.setenv("LD_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); + ETLogging.getInstance().log("cmodiiiii ADSP_LIBRARY_PATH is: " + Os.getenv("ADSP_LIBRARY_PATH")); + ETLogging.getInstance().log("cmodiiiii LD_LIBRARY_PATH is: " + Os.getenv("LD_LIBRARY_PATH")); } catch (ErrnoException e) { finish(); } @@ -566,7 +569,7 @@ private void showMediaPreview(List uris) { // For LLava, we want to call prefill_image as soon as an image is selected // Llava only support 1 image for now - if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { +/* if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { List processedImageList = getProcessedImagesForModel(mSelectedImageUri); if (!processedImageList.isEmpty()) { mMessageAdapter.add( @@ -588,7 +591,7 @@ private void showMediaPreview(List uris) { }; executor.execute(runnable); } - } + }*/ } private void addSelectedImagesToChatThread(List selectedImageUri) { @@ -689,7 +692,7 @@ public void run() { } }); long generateStartTime = System.currentTimeMillis(); - if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) + /* if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) == ModelUtils.VISION_MODEL) { mModule.generateFromPos( mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt), @@ -697,16 +700,15 @@ public void run() { startPos, MainActivity.this, false); - } else { + } else {*/ String finalPrompt = getTotalFormattedPrompt(getConversationHistory(), rawPrompt); ETLogging.getInstance().log("Running inference.. 
prompt=" + finalPrompt); mModule.generate( finalPrompt, (int) (finalPrompt.length() * 0.75) + 64, - MainActivity.this, - false); - } + MainActivity.this); + //} long generateDuration = System.currentTimeMillis() - generateStartTime; mResultMessage.setTotalGenerationTime(generateDuration); diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java index 773fef19dd7..9d7d2f4ec2a 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java @@ -46,6 +46,8 @@ public class SettingsActivity extends AppCompatActivity { private DemoSharedPreferences mDemoSharedPreferences; public static double TEMPERATURE_MIN_VALUE = 0.0; + public static String MODEL_PATH="/data/local/tmp/et-mtk/llama3"; + //public static String MODEL_PATH="/data/local/tmp/llama"; @Override protected void onCreate(Bundle savedInstanceState) { @@ -286,7 +288,7 @@ private void showInvalidPromptDialog() { } private void setupModelSelectorDialog() { - String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte"); + String[] pteFiles = listLocalFile(MODEL_PATH, ".pte"); AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); modelPathBuilder.setTitle("Select model path"); @@ -342,7 +344,7 @@ private void setupModelTypeSelectorDialog() { } private void setupTokenizerSelectorDialog() { - String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin"); + String[] binFiles = listLocalFile(MODEL_PATH, ".bin"); String[] tokenizerFiles = new String[binFiles.length]; System.arraycopy(binFiles, 0, tokenizerFiles, 0, binFiles.length); AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); diff --git a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp index a7e858dfa93..2c7e236968d 100644 --- a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp +++ b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp @@ -486,18 +486,22 @@ Method& ModelChunk::GetModelMethod() { // Override the virtual functions void* ModelChunk::CreateModelInstance(const std::string& modelPath) { + ET_LOG(Info, "cmodi in CreateModelInstance"); auto modelInstance = new ModelInstance; + ET_LOG(Info, "cmodi 100"); // Create a loader to get the data of the program file. There are other // DataLoaders that use mmap() or point to data that's already in memory, and // users can create their own DataLoaders to load from arbitrary sources. Result loader = FileDataLoader::from(modelPath.c_str()); + ET_LOG(Info, "cmodi 101"); ET_CHECK_MSG( loader.ok(), "FileDataLoader::from() failed: 0x%" PRIx32, loader.error()); // Parse the program file. This is immutable, and can also be reused between // multiple execution invocations across multiple threads. Result program_loaded = Program::load(&loader.get()); + ET_LOG(Info, "cmodi 102"); if (!program_loaded.ok()) { ET_LOG(Error, "Failed to parse model file %s", modelPath.c_str()); return nullptr; @@ -508,12 +512,15 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) { // methods. 
modelInstance->program = std::make_unique(std::move(program_loaded.get())); + ET_LOG(Info, "cmodi 103"); auto& program = modelInstance->program; + ET_LOG(Info, "cmodi 104"); // Use the first method in the program. const char* method_name = nullptr; { const auto method_name_result = program->get_method_name(0); + ET_LOG(Info, "cmodi 105"); ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); method_name = *method_name_result; } @@ -530,12 +537,15 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) { modelInstance->method_allocator_pool.resize(kMethodAllocatorPoolSize); modelInstance->method_allocator = std::make_unique( kMethodAllocatorPoolSize, modelInstance->method_allocator_pool.data()); + ET_LOG(Info, "cmodi 106"); auto& method_allocator = modelInstance->method_allocator; method_allocator->enable_profiling("method allocator"); auto& planned_buffers = modelInstance->planned_buffers; // Owns the memory auto& planned_spans = modelInstance->planned_spans; // Passed to the allocator + ET_LOG(Info, "cmodi 107"); + size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); for (size_t id = 0; id < num_memory_planned_buffers; ++id) { // .get() will always succeed because id < num_memory_planned_buffers. @@ -545,22 +555,28 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) { planned_buffers.push_back(std::make_unique(buffer_size)); planned_spans.push_back({planned_buffers.back().get(), buffer_size}); } + ET_LOG(Info, "cmodi 108"); modelInstance->planned_memory = std::make_unique( Span>{planned_spans.data(), planned_spans.size()}); auto& planned_memory = modelInstance->planned_memory; + ET_LOG(Info, "cmodi 109"); // Assemble all of the allocators into the MemoryManager that the Executor // will use. 
auto& neuron_allocator = GET_NEURON_ALLOCATOR; + ET_LOG(Info, "cmodi 110"); modelInstance->memory_manager = std::make_unique( method_allocator.get(), planned_memory.get(), dynamic_cast(&neuron_allocator)); + ET_LOG(Info, "cmodi 111"); auto& memory_manager = modelInstance->memory_manager; + ET_LOG(Info, "cmodi 112"); ET_LOG(Debug, "Begin loading method %s", method_name); Result method = program->load_method(method_name, memory_manager.get()); + ET_LOG(Info, "cmodi 113"); ET_CHECK_MSG( method.ok(), "Loading of method %s failed with status 0x%" PRIx32, @@ -569,6 +585,7 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) { ET_LOG(Debug, "Method loaded."); modelInstance->method = std::make_unique(std::move(method.get())); + ET_LOG(Info, "cmodi 114"); return modelInstance; } From aa98fb12343cd4592c5f5ed2a91b0d927d66e2f3 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Mon, 16 Sep 2024 22:23:21 -0700 Subject: [PATCH 20/22] so adds in androidmanifest and embedding file name correction --- .../android/LlamaDemo/app/src/main/AndroidManifest.xml | 8 ++++++++ .../llama_runner/llm_helper/include/llama_runner_values.h | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml index 02d8503a4df..e6ff8e95b87 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml @@ -36,6 +36,14 @@ android:name="libcdsprpc.so" android:required="false" /> + + + + Date: Fri, 27 Sep 2024 14:09:23 -0700 Subject: [PATCH 21/22] .so adds in AndroidManifest and fix type for embedding file name --- .../android/LlamaDemo/app/build.gradle.kts | 2 +- .../app/src/main/AndroidManifest.xml | 20 +++++++++++++++++-- .../llm_helper/include/llama_runner_values.h | 2 +- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts index 37c8cbf0ba2..039ea9b047f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts @@ -57,7 +57,7 @@ dependencies { implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") implementation("com.facebook.fbjni:fbjni:0.5.1") implementation("com.google.code.gson:gson:2.8.6") - implementation(files("libs/executorch-llama.aar")) + implementation(files("libs/executorch-llama-mtk29.aar")) implementation("com.google.android.material:material:1.12.0") implementation("androidx.activity:activity:1.9.0") testImplementation("junit:junit:4.13.2") diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml index e6ff8e95b87..799ce50992f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml @@ -37,11 +37,27 @@ android:required="false" /> + + + + + + + + Date: Fri, 27 Sep 2024 15:59:38 -0700 Subject: [PATCH 22/22] add Error returns to runner. Baseline working flow. 
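
load() and generate() are declared to return Error but previously reached the
end of the function without a return statement, which is undefined behavior
for a non-void function, so callers could observe an indeterminate status.
Both now return Error::Ok once model loading and inference complete.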
--- examples/demo-apps/android/LlamaDemo/app/build.gradle.kts | 2 +- .../java/com/example/executorchllamademo/MainActivity.java | 1 + examples/mediatek/executor_runner/mtk_llama_runner.cpp | 7 ++++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts index 039ea9b047f..db4ea8f74c6 100644 --- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts @@ -57,7 +57,7 @@ dependencies { implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") implementation("com.facebook.fbjni:fbjni:0.5.1") implementation("com.google.code.gson:gson:2.8.6") - implementation(files("libs/executorch-llama-mtk29.aar")) + implementation(files("libs/executorch-llama-mtk31.aar")) implementation("com.google.android.material:material:1.12.0") implementation("androidx.activity:activity:1.9.0") testImplementation("junit:junit:4.13.2") diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index a08bf87a079..fbd6948880f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -126,6 +126,7 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera modelPath, tokenizerPath, temperature); + ETLogging.getInstance().log("ModelType is: " + mCurrentSettingsFields.getModelType()); int loadResult = mModule.load(); long loadDuration = System.currentTimeMillis() - runStartTime; String modelLoadError = ""; diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index 72296de45ac..dbb5b79b42c 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -104,6 +104,8 @@ Error MTKLlamaRunner::load() { ET_LOG(Info, "Loading prompt model."); runtime_->Initialize(modeloptions_, modelpaths_); ET_LOG(Info, "Complete loading prompt model."); + + return Error::Ok; } bool MTKLlamaRunner::is_loaded() const { @@ -130,8 +132,11 @@ Error MTKLlamaRunner::generate( } }; - ET_LOG(Info, "Starting inference."); + ET_LOG(Info, "Starting inference from MTKLlamaRunner"); inference(*runtime_.get(), tokenizer_, prompt, wrapped_callback); + ET_LOG(Info, "Completed inference from MTKLlamaRunner"); + + return Error::Ok; } void MTKLlamaRunner::stop() {
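
With load() and generate() now returning Error::Ok, a caller can check the
result of each step. The sketch below is a minimal, illustrative harness
only: the constructor arguments and the generate() parameter list are
assumptions modeled on the common ExecuTorch llama runner interface (the MTK
runner in this series actually self-loads the paths baked into
llama_runner_values.h), so treat the names and paths as placeholders rather
than the final API.

    // Illustrative sketch only: the constructor and generate() signatures are
    // assumed; the real runner self-loads its paths from llama_runner_values.h.
    #include <cstdio>
    #include <string>

    #include <executorch/runtime/core/error.h>
    #include "mtk_llama_runner.h"  // assumed header name for the runner

    using ::torch::executor::Error;

    int main() {
      // Placeholder arguments; the temperature mirrors the float parameter
      // seen in the runner constructor earlier in the series.
      MTKLlamaRunner runner(
          "/data/local/tmp/llama3/model.pte",
          "/data/local/tmp/llama3/tokenizer.model",
          /*temperature=*/0.0f);

      // Since the last patch, load() reports success explicitly.
      if (runner.load() != Error::Ok) {
        std::fprintf(stderr, "MTK runner failed to load\n");
        return 1;
      }

      // Assumed generate(prompt, seq_len, token_callback) shape; the runner
      // wraps the callback before invoking inference() internally.
      const Error err = runner.generate(
          "What is the capital of France?",
          /*seq_len=*/128,
          [](const std::string& piece) {
            std::fprintf(stdout, "%s", piece.c_str());
          });

      return err == Error::Ok ? 0 : 1;
    }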