diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt
index 744b1193d5a..9a9c82d90a9 100644
--- a/backends/mediatek/CMakeLists.txt
+++ b/backends/mediatek/CMakeLists.txt
@@ -25,10 +25,10 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include)
 # targets
 add_library(neuron_backend SHARED)
+target_compile_options(neuron_backend PRIVATE "-frtti" "-fexceptions")
 target_link_libraries(neuron_backend
   PRIVATE
     executorch_no_prim_ops
-    portable_ops_lib
     android
     log
     ${NEURON_BUFFER_ALLOCATOR_LIB}
diff --git a/backends/mediatek/runtime/include/NeuronLog.h b/backends/mediatek/runtime/include/NeuronLog.h
index ccf8b24870d..5367a91ac4e 100644
--- a/backends/mediatek/runtime/include/NeuronLog.h
+++ b/backends/mediatek/runtime/include/NeuronLog.h
@@ -8,7 +8,7 @@
 
 #pragma once
 
-#include <api/NeuronAdapter.h>
+#include "api/NeuronAdapter.h"
 
 #include
 #include
diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh
index c5d0118afda..d8de4cfd94e 100644
--- a/build/build_android_llm_demo.sh
+++ b/build/build_android_llm_demo.sh
@@ -30,7 +30,7 @@ build_android_native_library() {
   cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
     -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
     -DANDROID_ABI="${ANDROID_ABI}" \
-    -DANDROID_PLATFORM=android-23 \
+    -DANDROID_PLATFORM=android-26 \
     -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DEXECUTORCH_LOG_LEVEL=Info \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
@@ -42,9 +42,11 @@ build_android_native_library() {
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+    -DEXECUTORCH_BUILD_NEURON=ON \
+    -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \
     -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \
     -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \
-    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -B"${CMAKE_OUT}"
 
   if [ "$(uname)" == "Darwin" ]; then
@@ -52,26 +54,32 @@ build_android_native_library() {
   else
     CMAKE_JOBS=$(( $(nproc) - 1 ))
   fi
-  cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release
+  cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config RelWithDebInfo
 
   cmake extension/android \
     -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
     -DANDROID_ABI="${ANDROID_ABI}" \
-    -DANDROID_PLATFORM=android-23 \
+    -DANDROID_PLATFORM=android-26 \
     -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
     -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DEXECUTORCH_LOG_LEVEL=Info \
+    -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
+    -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DEXECUTORCH_BUILD_LLAMA_JNI=ON \
-    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -B"${CMAKE_OUT}"/extension/android
 
-  cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Release
+  cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config RelWithDebInfo
 
   # Copy artifacts to ABI specific directory
   mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}"
   cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/"
+  cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/
+  cp /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/
+  cp /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuronusdk_adapter.mtk.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/
+
   # Copy QNN related so library
   if [ -n "$QNN_SDK_ROOT" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then
     cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/"
@@ -97,7 +105,7 @@ build_aar() {
   find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \;
   # Zip all necessary files into the AAR file
   zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml
-  zip -r executorch-llama.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml
+  zip -r executorch-llama.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so jni/*/libneuronusdk_adapter.mtk.so AndroidManifest.xml
   popd
 }
@@ -143,6 +151,7 @@ BUILD_AAR_DIR="$(mktemp -d)"
 export BUILD_AAR_DIR
 if [ -z "$ANDROID_ABIS" ]; then
   ANDROID_ABIS=("arm64-v8a" "x86_64")
+  ANDROID_ABIS=("arm64-v8a")
 fi
 export ANDROID_ABIS
diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake
index 4376c9e5e77..c40f214133a 100644
--- a/build/executorch-config.cmake
+++ b/build/executorch-config.cmake
@@ -41,6 +41,7 @@ set(lib_list
   ${FLATCCRT_LIB}
   coremldelegate
   mpsdelegate
+  neuron_backend
   qnn_executorch_backend
   portable_ops_lib
   extension_module
diff --git a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts
index 37c8cbf0ba2..db4ea8f74c6 100644
--- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts
+++ b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts
@@ -57,7 +57,7 @@ dependencies {
   implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12")
   implementation("com.facebook.fbjni:fbjni:0.5.1")
   implementation("com.google.code.gson:gson:2.8.6")
-  implementation(files("libs/executorch-llama.aar"))
+  implementation(files("libs/executorch-llama-mtk31.aar"))
   implementation("com.google.android.material:material:1.12.0")
   implementation("androidx.activity:activity:1.9.0")
   testImplementation("junit:junit:4.13.2")
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml
index 02d8503a4df..799ce50992f 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml
@@ -36,6 +36,30 @@
         android:name="libcdsprpc.so"
         android:required="false" />
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
@@ ... @@ private void showMediaPreview(List<Uri> uris) {
     // For LLava, we want to call prefill_image as soon as an image is selected
     // Llava only support 1 image for now
-    if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) {
+/*  if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) {
       List processedImageList = getProcessedImagesForModel(mSelectedImageUri);
       if (!processedImageList.isEmpty()) {
         mMessageAdapter.add(
@@ -588,7 +592,7 @@ private void showMediaPreview(List<Uri> uris) {
         };
         executor.execute(runnable);
       }
-    }
+    }*/
   }
 
   private void addSelectedImagesToChatThread(List<Uri> selectedImageUri) {
@@ -689,7 +693,7 @@ public void run() {
             }
           });
       long generateStartTime = System.currentTimeMillis();
-      if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType())
+      /* if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType())
           == ModelUtils.VISION_MODEL) {
         mModule.generateFromPos(
             mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt),
@@ -697,16 +701,15 @@ public void run() {
             startPos,
             MainActivity.this,
             false);
-      } else {
+      } else {*/
         String finalPrompt =
             getTotalFormattedPrompt(getConversationHistory(), rawPrompt);
         ETLogging.getInstance().log("Running inference.. prompt=" + finalPrompt);
         mModule.generate(
             finalPrompt,
             (int) (finalPrompt.length() * 0.75) + 64,
-            MainActivity.this,
-            false);
-      }
+            MainActivity.this);
+      //}
 
       long generateDuration = System.currentTimeMillis() - generateStartTime;
       mResultMessage.setTotalGenerationTime(generateDuration);
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
index 773fef19dd7..9d7d2f4ec2a 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
@@ -46,6 +46,8 @@ public class SettingsActivity extends AppCompatActivity {
 
   private DemoSharedPreferences mDemoSharedPreferences;
   public static double TEMPERATURE_MIN_VALUE = 0.0;
+  public static String MODEL_PATH="/data/local/tmp/et-mtk/llama3";
+  //public static String MODEL_PATH="/data/local/tmp/llama";
 
   @Override
   protected void onCreate(Bundle savedInstanceState) {
@@ -286,7 +288,7 @@ private void showInvalidPromptDialog() {
   }
 
   private void setupModelSelectorDialog() {
-    String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte");
+    String[] pteFiles = listLocalFile(MODEL_PATH, ".pte");
     AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this);
     modelPathBuilder.setTitle("Select model path");
 
@@ -342,7 +344,7 @@ private void setupModelTypeSelectorDialog() {
   }
 
   private void setupTokenizerSelectorDialog() {
-    String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin");
+    String[] binFiles = listLocalFile(MODEL_PATH, ".bin");
     String[] tokenizerFiles = new String[binFiles.length];
     System.arraycopy(binFiles, 0, tokenizerFiles, 0, binFiles.length);
     AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this);
diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp
index c2d75fd30ec..1757c63fe21 100644
--- a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp
+++ b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp
@@ -73,16 +73,26 @@ size_t LlamaModelChunk::GetExpectedOutputCount() const {
 }
 
 void LlamaModelChunk::Initialize() {
+  ET_LOG(Info, "cmodiii in LlamaModelChunk::Initialize");
   LoadModels();
+  ET_LOG(Info, "cmodiii after LoadModels");
   GetModelIoInfo();
+  ET_LOG(Info, "cmodiii after GetModelIoInfo");
   CheckIoCount();
+  ET_LOG(Info, "cmodiii after CheckIoCount");
   PrepareCacheIOs();
+  ET_LOG(Info, "cmodiii after PrepareCacheIOs");
   AllocateIoBuffers();
+  ET_LOG(Info, "cmodiii after AllocateIoBuffers");
   InitMaskBuilder();
+  ET_LOG(Info, "cmodiii after InitMaskBuilder");
   InitCache();
+  ET_LOG(Info, "cmodiii after InitCache");
   SetBackendInputs();
+  ET_LOG(Info, "cmodiii after SetBackendInputs");
   SetBackendOutputs();
+  ET_LOG(Info, "cmodiii after SetBackendOutputs");
   mIsInitialized = true;
 }
diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp
index 0d2d5ccd59c..8a12ce90ecb 100644
--- a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp
+++ b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp
@@ -28,14 +28,21 @@ void LlamaRuntime::Initialize(
   const size_t numCache = 2 * modelOptions.num_layer / numChunk;
   ET_CHECK_MSG(numChunk > 0, "No model to initialize");
 
+  ET_LOG(Info, "cmodiii 1");
+  ET_LOG(Info, "cmodiii numChunk = %zu", numChunk);
+  ET_LOG(Info, "cmodiii numCache = %zu", numCache);
+
   // Initialize rotary embedding master lookup table
   const size_t rotEmbDim = modelOptions.hidden_size / modelOptions.num_head;
+  ET_LOG(Info, "cmodiii 2");
   mRotEmbMasterLut = std::make_unique<RotaryEmbeddingMasterLut>(
       modelOptions.rot_emb_type,
       modelOptions.max_token_length,
       rotEmbDim,
       modelOptions.rot_emb_base);
+  ET_LOG(Info, "cmodiii 3");
   mRotEmbMasterLut->generate();
+  ET_LOG(Info, "cmodiii 4");
 
   constexpr size_t numRotEmbInputs = 1;
   const bool usePromptModel = !modelPaths.prompt_model_paths.empty();
@@ -50,8 +57,10 @@ void LlamaRuntime::Initialize(
         return;
       modelPathMap[batchSize] = modelPaths[chunkIdx];
     };
+    ET_LOG(Info, "cmodiii 5");
     addModelPath(
         modelPaths.prompt_model_paths, modelOptions.prompt_token_batch_size);
+    ET_LOG(Info, "cmodiii 6");
     addModelPath(modelPaths.gen_model_paths, 1);
     auto llamaChunk = std::make_unique<LlamaModelChunk>(
         modelPathMap,
@@ -60,18 +69,43 @@ void LlamaRuntime::Initialize(
         numCache,
         numRotEmbInputs,
         mRotEmbMasterLut.get());
+    ET_LOG(Info, "cmodiii 7");
+    if(llamaChunk.get() == nullptr) {
+      ET_LOG(Info, "cmodiii llamaChunk is null");
+    } else {
+      ET_LOG(Info, "cmodiii llamaChunk is not null");
+    }
+
     mLlamaModelChunks.push_back(std::move(llamaChunk));
+
+    if(mLlamaModelChunks.empty()) {
+      ET_LOG(Info, "cmodiii mLlamaModelChunks is empty");
+    } else {
+      ET_LOG(Info, "cmodiii mLlamaModelChunks is not empty");
+    }
+
+    ET_LOG(Info, "cmodiii 8");
   }
 
   for (size_t i = 0; i < numChunk; i++) {
     auto& modelChunk = mLlamaModelChunks[i];
+    if(modelChunk.get() == nullptr) {
+      ET_LOG(Info, "cmodiii modelChunk is null");
+    } else {
+      ET_LOG(Info, "cmodiii modelChunk is not null");
+    }
+    ET_LOG(Info, "cmodiii 9");
     if (i > 0) {
       const auto& prevModelChunk = mLlamaModelChunks[i - 1];
+      ET_LOG(Info, "cmodiii 9A");
       modelChunk->SetInputBuffer(prevModelChunk->GetOutputBuffer());
+      ET_LOG(Info, "cmodiii 10");
     }
     modelChunk->Initialize();
+    ET_LOG(Info, "cmodiii 11");
     // modelChunk->LogIoSummary();
   }
+  ET_LOG(Info, "cmodiii 12");
 
   // NOTE: Token embedding type here is assumed to follow the model input
   // embedding type.
@@ -80,9 +114,13 @@ void LlamaRuntime::Initialize(
       modelOptions.model_input_type,
       modelOptions.hidden_size);
 
+  ET_LOG(Info, "cmodiii 13");
+
   // Link first chunk emb input to token emb lut output
   const auto& tokenEmbInput = mLlamaModelChunks.front()->GetInputBuffer();
+  ET_LOG(Info, "cmodiii 14");
   mTokenEmbLut->setOutput(tokenEmbInput.data, tokenEmbInput.nbytes);
+  ET_LOG(Info, "cmodiii 15");
 }
 
 void LlamaRuntime::Release() {
@@ -201,4 +239,4 @@ const LlamaModelOptions& LlamaRuntime::GetModelOptions() const {
   return mModelOptions;
 }
 
-} // namespace torch::executor
\ No newline at end of file
+} // namespace torch::executor
diff --git a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp
index b09e2c58767..2c7e236968d 100644
--- a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp
+++ b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp
@@ -46,11 +46,17 @@ struct ModelInstance {
 };
 
 void ModelChunk::Initialize() {
+  ET_LOG(Info, "cmodiii in ModuleChunk::Initialize");
   LoadModels();
+  ET_LOG(Info, "cmodiii after LoadModels");
   GetModelIoInfo();
+  ET_LOG(Info, "cmodiii after GetModelIoInfo");
   AllocateIoBuffers();
+  ET_LOG(Info, "cmodiii after AllocateIoBuffers");
   SetBackendInputs();
+  ET_LOG(Info, "cmodiii after SetBackendInputs");
   SetBackendOutputs();
+  ET_LOG(Info, "cmodiii after SetBackendOutputs");
   mIsInitialized = true;
 }
@@ -480,18 +486,22 @@ Method& ModelChunk::GetModelMethod() {
 
 // Override the virtual functions
 void* ModelChunk::CreateModelInstance(const std::string& modelPath) {
+  ET_LOG(Info, "cmodi in CreateModelInstance");
   auto modelInstance = new ModelInstance;
+  ET_LOG(Info, "cmodi 100");
 
   // Create a loader to get the data of the program file. There are other
   // DataLoaders that use mmap() or point to data that's already in memory, and
   // users can create their own DataLoaders to load from arbitrary sources.
   Result<FileDataLoader> loader = FileDataLoader::from(modelPath.c_str());
+  ET_LOG(Info, "cmodi 101");
   ET_CHECK_MSG(
       loader.ok(), "FileDataLoader::from() failed: 0x%" PRIx32, loader.error());
 
   // Parse the program file. This is immutable, and can also be reused between
   // multiple execution invocations across multiple threads.
   Result<Program> program_loaded = Program::load(&loader.get());
+  ET_LOG(Info, "cmodi 102");
   if (!program_loaded.ok()) {
     ET_LOG(Error, "Failed to parse model file %s", modelPath.c_str());
     return nullptr;
@@ -502,12 +512,15 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) {
   // methods.
   modelInstance->program =
       std::make_unique<Program>(std::move(program_loaded.get()));
+  ET_LOG(Info, "cmodi 103");
   auto& program = modelInstance->program;
+  ET_LOG(Info, "cmodi 104");
 
   // Use the first method in the program.
   const char* method_name = nullptr;
   {
     const auto method_name_result = program->get_method_name(0);
+    ET_LOG(Info, "cmodi 105");
     ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
     method_name = *method_name_result;
   }
@@ -524,12 +537,15 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) {
   modelInstance->method_allocator_pool.resize(kMethodAllocatorPoolSize);
   modelInstance->method_allocator = std::make_unique<MemoryAllocator>(
       kMethodAllocatorPoolSize, modelInstance->method_allocator_pool.data());
+  ET_LOG(Info, "cmodi 106");
   auto& method_allocator = modelInstance->method_allocator;
   method_allocator->enable_profiling("method allocator");
 
   auto& planned_buffers = modelInstance->planned_buffers; // Owns the memory
   auto& planned_spans =
       modelInstance->planned_spans; // Passed to the allocator
+  ET_LOG(Info, "cmodi 107");
+
   size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
   for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
     // .get() will always succeed because id < num_memory_planned_buffers.
@@ -539,22 +555,28 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) {
     planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
     planned_spans.push_back({planned_buffers.back().get(), buffer_size});
   }
+  ET_LOG(Info, "cmodi 108");
   modelInstance->planned_memory = std::make_unique<HierarchicalAllocator>(
       Span<Span<uint8_t>>{planned_spans.data(), planned_spans.size()});
   auto& planned_memory = modelInstance->planned_memory;
+  ET_LOG(Info, "cmodi 109");
 
   // Assemble all of the allocators into the MemoryManager that the Executor
   // will use.
   auto& neuron_allocator = GET_NEURON_ALLOCATOR;
+  ET_LOG(Info, "cmodi 110");
   modelInstance->memory_manager = std::make_unique<MemoryManager>(
       method_allocator.get(),
       planned_memory.get(),
      dynamic_cast<MemoryAllocator*>(&neuron_allocator));
+  ET_LOG(Info, "cmodi 111");
   auto& memory_manager = modelInstance->memory_manager;
+  ET_LOG(Info, "cmodi 112");
 
   ET_LOG(Debug, "Begin loading method %s", method_name);
   Result<Method> method = program->load_method(method_name, memory_manager.get());
+  ET_LOG(Info, "cmodi 113");
   ET_CHECK_MSG(
       method.ok(),
       "Loading of method %s failed with status 0x%" PRIx32,
@@ -563,6 +585,7 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) {
   ET_LOG(Debug, "Method loaded.");
 
   modelInstance->method = std::make_unique<Method>(std::move(method.get()));
+  ET_LOG(Info, "cmodi 114");
   return modelInstance;
 }
diff --git a/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp b/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp
index e0479110a7c..e20eac3b248 100644
--- a/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp
+++ b/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp
@@ -21,20 +21,28 @@ namespace torch::executor {
 template <typename IdType>
 void MultiModelLoader<IdType>::LoadModels() {
   // Init empty model instance map
+  ET_LOG(Info, "cmodi LoadModels() 1");
   for (const auto& [id, _] : mModelPathMap) {
     ET_CHECK_MSG(
         !HasModel(id),
         "Model is already initialized before calling LoadModels.");
+    ET_LOG(Info, "cmodi LoadModels() 2");
     mModelInstanceMap[id] = nullptr;
   }
   const size_t numModels = mModelPathMap.size();
+  ET_LOG(Info, "cmodi LoadModels() 3");
   if (!AllowModelsCoexist()) {
+    ET_LOG(Info, "cmodi LoadModels() 4");
    SelectModel(mDefaultModelId);
+    ET_LOG(Info, "cmodi LoadModels() 5");
     ET_CHECK_MSG(
         GetModelInstance() == nullptr,
         "Model is already initialized before calling LoadModels.");
+    ET_LOG(Info, "cmodi LoadModels() 6");
     void* instance = CreateModelInstance(mModelPathMap[mDefaultModelId]);
+    ET_LOG(Info, "cmodi LoadModels() 7");
     SetModelInstance(instance);
+    ET_LOG(Info, "cmodi LoadModels() 8");
     ET_LOG(
         Debug,
         "LoadModels(): Loaded single exclusive model (Total=%zu)",
@@ -42,14 +50,20 @@ void MultiModelLoader<IdType>::LoadModels() {
     return;
   }
   for (const auto& [id, modelPath] : mModelPathMap) {
+    ET_LOG(Info, "cmodi LoadModels() 9");
     SelectModel(id);
+    ET_LOG(Info, "cmodi LoadModels() 10");
     ET_CHECK_MSG(
         GetModelInstance() == nullptr,
         "Model is already initialized before calling LoadModels.");
+    ET_LOG(Info, "cmodi LoadModels() 11");
     void* instance = CreateModelInstance(modelPath);
+    ET_LOG(Info, "cmodi LoadModels() 12");
     SetModelInstance(instance);
+    ET_LOG(Info, "cmodi LoadModels() 13");
   }
   SelectModel(mDefaultModelId); // Select the default instance
+  ET_LOG(Info, "cmodi LoadModels() 14");
   ET_LOG(Debug, "LoadModels(): Loaded multiple models (Total=%zu)", numModels);
 }
@@ -174,4 +188,4 @@ std::string MultiModelLoader<IdType>::GetIdString(const IdType& id) {
 template class MultiModelLoader;
 template class MultiModelLoader;
 
-} // namespace torch::executor
\ No newline at end of file
+} // namespace torch::executor
diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h
new file mode 100644
index 00000000000..98cd8ab394e
--- /dev/null
+++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h
@@ -0,0 +1,32 @@
+#pragma once
+
+namespace torch::executor {
+  using llm_helper::LLMType;
+
+  // Sizes
+  const size_t PROMPT_TOKEN_BATCH_SIZE = 128;
+  const size_t CACHE_SIZE = 512;
+  const size_t HIDDEN_SIZE = 4096;
+  const size_t NUM_HEAD = 32;
+  const size_t NUM_LAYER = 32;
+  const size_t MAX_TOKEN_LENGTH = 8192;
+  const double ROT_EMB_BASE = 500000;
+
+  // Types
+  const LLMType MODEL_INPUT_TYPE = LLMType::FP32;
+  const LLMType MODEL_OUTPUT_TYPE = LLMType::FP32;
+  const LLMType CACHE_TYPE = LLMType::FP32;
+  const LLMType MASK_TYPE = LLMType::FP32;
+  const LLMType ROT_EMB_TYPE = LLMType::FP32;
+
+  // Paths
+  const std::string TOKENIZER_PATH="/data/local/tmp/et-mtk/llama3/tokenizer.model";
+  const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/et-mtk/llama3/embedding_llama3-8B-instruct_fp32.bin";
+
+  // Comma-Separated Paths
+  const std::string PROMPT_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_3.pte,";
+
+  // Comma-Separated Paths
+  const std::string GEN_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,";
+
+} // namespace torch::executor
diff --git a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp
index 370695cb773..1193e2b1830 100644
--- a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp
+++ b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp
@@ -147,11 +147,11 @@ LlamaModelOptions get_model_options() {
       .rot_emb_base = FLAGS_rot_emb_base,
 
       // Types
-      .model_input_type = getLLMTypeFromName(FLAGS_input_type.c_str()),
-      .model_output_type = getLLMTypeFromName(FLAGS_output_type.c_str()),
-      .cache_type = getLLMTypeFromName(FLAGS_cache_type.c_str()),
-      .mask_type = getLLMTypeFromName(FLAGS_mask_type.c_str()),
-      .rot_emb_type = getLLMTypeFromName(FLAGS_rot_emb_type.c_str())};
+      .model_input_type = LLMType::FP32,
+      .model_output_type = LLMType::FP32,
+      .cache_type = LLMType::FP32,
+      .mask_type = LLMType::FP32,
+      .rot_emb_type = LLMType::FP32};
   return options;
 }
@@ -159,8 +159,8 @@ LlamaModelPaths get_model_paths() {
   LlamaModelPaths model_paths = {
       .tokenizer_path = FLAGS_tokenizer_path,
       .token_embedding_path = FLAGS_token_embedding_path,
-      .prompt_model_paths = utils::split(FLAGS_prompt_model_paths, ','),
-      .gen_model_paths = utils::split(FLAGS_gen_model_paths, ',')};
+      .prompt_model_paths = utils::split("/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_3.pte,", ','),
+      .gen_model_paths = utils::split("/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,", ',')};
   return model_paths;
 }
@@ -353,23 +353,27 @@ int main(int argc, char** argv) {
   Timer timer_release(
       [](const auto elapsed_sec) { ET_LOG(Info, "Model released."); });
 
-  LlamaRuntime llama_runtime;
+  //LlamaRuntime llama_runtime;
+  std::unique_ptr<LlamaRuntime> llama_runtime = std::make_unique<LlamaRuntime>();
 
   // Initialize model
   ET_LOG(Info, "Begin model loading.");
   timer_init.Start();
   const auto tokenizer = load_tokenizer();
-  llama_runtime.Initialize(model_options, model_paths);
+  //llama_runtime.Initialize(model_options, model_paths);
+  llama_runtime->Initialize(model_options, model_paths);
   timer_init.End();
 
   // Run model
   ET_CHECK_MSG(!FLAGS_prompt_file.empty(), "No prompt file provided.");
   std::string prompt = utils::read_file(FLAGS_prompt_file);
-  inference(llama_runtime, tokenizer, prompt);
+  //inference(llama_runtime, tokenizer, prompt);
+  inference(*llama_runtime.get(), tokenizer, prompt);
 
   // Release model
   timer_release.Start();
-  llama_runtime.Release();
+  //llama_runtime.Release();
+  llama_runtime->Release();
   timer_release.End();
 
   return 0;
diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp
new file mode 100644
index 00000000000..dbb5b79b42c
--- /dev/null
+++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 MediaTek Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/* Copyright Statement:
+ *
+ * This software/firmware and related documentation ("MediaTek Software") are
+ * protected under relevant copyright laws. The information contained herein
+ * is confidential and proprietary to MediaTek Inc. and/or its licensors.
+ * Without the prior written permission of MediaTek inc. and/or its licensors,
+ * any reproduction, modification, use or disclosure of MediaTek Software,
+ * and information contained herein, in whole or in part, shall be strictly
+ * prohibited.
+ */
+/* MediaTek Inc. (C) 2024. All rights reserved.
+ *
+ * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES
+ * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE")
+ * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON
+ * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT.
+ * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE
+ * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR
+ * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH
+ * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY
+ * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY
+ * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK
+ * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO
+ * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN
+ * FORUM. RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND
+ * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER
+ * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT
+ * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER
+ * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE.
+ *
+ * The following software/firmware and/or related documentation ("MediaTek
+ * Software") have been modified by MediaTek Inc. All revisions are subject to
+ * any receiver's applicable license agreements with MediaTek Inc.
+ */
+
+#include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h"
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+// #include
+#include
+#include
+
+#include "llama_runner/ModelChunk.h"
+#include "llama_runner/Utils.h"
+#include "llama_runner/llm_helper/include/llm_types.h"
+#include "llama_runner/llm_helper/include/llama_runner_values.h"
+
+static uint64_t MAX_RESPONSE = 50; // Maximum number of tokens to generate.
+// Global BOS and EOS option for tokenization (encoding)
+static constexpr int8_t kAddBos = 1;
+static constexpr int8_t kAddEos = 0;
+
+using namespace torch::executor;
+using namespace torch::executor::llm_helper;
+using torch::executor::utils::Timer;
+
+MTKLlamaRunner::MTKLlamaRunner(
+    const std::string& model_path,
+    const std::string& tokenizer_path,
+    const float temperature)
+    : modeloptions_(get_model_options()),
+      modelpaths_(get_model_paths()) {
+  runtime_init();
+  ET_LOG(
+      Info,
+      "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files. Initiated runtime_init().");
+}
+
+Error MTKLlamaRunner::load() {
+  if (is_loaded()) {
+    return Error::Ok;
+  }
+
+  // Load tokenizer
+  ET_LOG(Info, "Loading tokenizer.");
+  tokenizer_ = load_tokenizer();
+  ET_LOG(Info, "Complete loading tokenizer.");
+
+  // Load prompt model
+  runtime_ = std::make_unique<LlamaRuntime>();
+  ET_LOG(Info, "Loading prompt model.");
+  runtime_->Initialize(modeloptions_, modelpaths_);
+  ET_LOG(Info, "Complete loading prompt model.");
+
+  return Error::Ok;
+}
+
+bool MTKLlamaRunner::is_loaded() const {
+  return tokenizer_ && runtime_;
+}
+
+Error MTKLlamaRunner::generate(
+    const std::string& prompt,
+    int32_t seq_len,
+    std::function<void(const std::string&)> token_callback,
+    std::function<void(const Stats&)> stats_callback) {
+
+  if (!is_loaded()) {
+    ET_CHECK_OK_OR_RETURN_ERROR(load());
+  }
+
+  // Wrap the token_callback with print function
+  std::function<void(const std::string&)> wrapped_callback =
+      [token_callback](const std::string& piece) {
+        util::safe_printf(piece.c_str());
+        fflush(stdout);
+        if (token_callback) {
+          token_callback(piece);
+        }
+      };
+
+  ET_LOG(Info, "Starting inference from MTKLlamaRunner");
+  inference(*runtime_.get(), tokenizer_, prompt, wrapped_callback);
+  ET_LOG(Info, "Completed inference from MTKLlamaRunner");
+
+  return Error::Ok;
+}
+
+void MTKLlamaRunner::stop() {
+  if (is_loaded()) {
+    runtime_->Release();
+  } else {
+    ET_LOG(Error, "Llama Runtime is not loaded, cannot stop");
+  }
+}
+
+LlamaModelOptions MTKLlamaRunner::get_model_options() {
+  LlamaModelOptions options = {
+      // Sizes
+      .prompt_token_batch_size = PROMPT_TOKEN_BATCH_SIZE,
+      .cache_size = CACHE_SIZE,
+      .hidden_size = HIDDEN_SIZE,
+      .num_head = NUM_HEAD,
+      .num_layer = NUM_LAYER,
+      .max_token_length = MAX_TOKEN_LENGTH,
+      .rot_emb_base = ROT_EMB_BASE,
+
+      // Types
+      .model_input_type = MODEL_INPUT_TYPE,
+      .model_output_type = MODEL_OUTPUT_TYPE,
+      .cache_type = CACHE_TYPE,
+      .mask_type = MASK_TYPE,
+      .rot_emb_type = ROT_EMB_TYPE};
+  ET_LOG(Info, "Completed get_model_options");
+  return options;
+}
+
+LlamaModelPaths MTKLlamaRunner::get_model_paths() {
+  LlamaModelPaths model_paths = {
+      .tokenizer_path = TOKENIZER_PATH,
+      .token_embedding_path = TOKEN_EMBEDDING_PATH,
+      .prompt_model_paths = utils::split(PROMPT_MODEL_PATHS, ','),
+      .gen_model_paths = utils::split(GEN_MODEL_PATHS, ',')};
+  ET_LOG(Info, "Completed get_model_paths");
+  return model_paths;
+}
+
+Result<uint64_t> MTKLlamaRunner::digest_prompt(
+    LlamaRuntime& llama_runtime,
+    const std::unique_ptr<Tokenizer>& tokenizer,
+    const std::vector<uint64_t> input_tokens) {
+  const auto input_token_count = input_tokens.size();
+  const auto prompt_token_batch_size = llama_runtime.GetTokenBatchSize();
+  size_t cur_token_index = 0;
+
+  Timer timer_digest_prompt([=](const auto elapsed_sec) {
+    // Ideal prompt size is a multiple of prompt batch size
+    const size_t ideal_prompt_size =
+        std::ceil(float(input_token_count) / prompt_token_batch_size) *
+        prompt_token_batch_size;
+    ET_LOG(
+        Info,
+        "Done analyzing prompt in %f sec (%f tok/s)",
+        elapsed_sec,
+        (float)ideal_prompt_size / elapsed_sec);
+  });
+
+  auto getNextTokens = [&]() {
+    const size_t num_tok_remain = input_token_count - cur_token_index;
+    const size_t remainder = num_tok_remain % prompt_token_batch_size;
+    const size_t num_new_tokens =
+        remainder ? remainder : prompt_token_batch_size;
+    const auto start = cur_token_index;
+    const auto end = start + num_new_tokens;
+    return std::vector<uint64_t>(
+        input_tokens.begin() + start, input_tokens.begin() + end);
+  };
+
+  void* logits;
+  timer_digest_prompt.Start();
+  while (cur_token_index < input_token_count) {
+    const auto next_tokens = getNextTokens();
+    ET_LOG(
+        Debug,
+        "Digest next tokens (size=%zu), 1st tok=%lu",
+        next_tokens.size(),
+        next_tokens[0]);
+    logits = llama_runtime.Run(next_tokens);
+    cur_token_index += next_tokens.size();
+  }
+  timer_digest_prompt.End();
+
+  const auto vocab_size = tokenizer->vocab_size();
+  const auto logits_type = llama_runtime.GetModelOptions().model_output_type;
+  const auto first_output_token =
+      utils::argmax(logits_type, logits, vocab_size);
+  return first_output_token;
+}
+
+Error MTKLlamaRunner::gen_response(
+    LlamaRuntime& llama_runtime,
+    const std::unique_ptr<Tokenizer>& tokenizer,
+    const uint64_t input_token,
+    std::function<void(const std::string&)> token_callback) {
+  Timer timer_model_swap(
+      [](const auto elapsed_sec) { ET_LOG(Info, "Model swapped."); });
+
+  // Swap to gen mode
+  timer_model_swap.Start();
+  llama_runtime.SwapModel(1);
+  timer_model_swap.End();
+
+  size_t gen_tok_count = 0;
+  uint64_t prev_token = input_token;
+  uint64_t output_token = input_token;
+
+  auto decode_res = tokenizer->decode(prev_token, output_token);
+  ET_CHECK_OR_RETURN_ERROR(
+      decode_res.ok(),
+      InvalidState,
+      "Tokenizer failed to decode first generated token: %lu",
+      output_token);
+  std::string full_response = std::move(decode_res.get());
+  std::vector<uint64_t> full_response_tokens = {input_token};
+
+  const auto vocab_size = tokenizer->vocab_size();
+  const auto logits_type = llama_runtime.GetModelOptions().model_output_type;
+
+  double gen_total_time_sec = 0;
+  Timer timer_gen_token(
+      [&](const auto elapsed_sec) { gen_total_time_sec += elapsed_sec; });
+
+  // Print first output token
+  token_callback(full_response);
+
+  while (gen_tok_count++ < MAX_RESPONSE &&
+         llama_runtime.GetTokenIndex() < modeloptions_.max_token_length) {
+    timer_gen_token.Start();
+    void* logits = llama_runtime.Run({output_token});
+    timer_gen_token.End();
+
+    prev_token = output_token;
+    output_token = utils::argmax(logits_type, logits, vocab_size);
+    full_response_tokens.push_back(output_token);
+
+    // Stop when output is EOS
+    if (output_token == tokenizer->eos_tok()) {
+      token_callback("");
+      break;
+    }
+    auto decode_res = tokenizer->decode(prev_token, output_token);
+    ET_CHECK_OR_RETURN_ERROR(
+        decode_res.ok(),
+        InvalidState,
+        "Tokenizer failed to decode generated token %lu",
+        output_token);
+    const std::string tok_str = std::move(decode_res.get());
+    full_response += tok_str;
+    token_callback(tok_str);
+  }
+
+  std::cout << "\n\n[Generated Tokens]\n"
+            << utils::to_string(full_response_tokens) << std::endl;
+
+  ET_LOG(
+      Info,
+      "Token generation speed: %f tok/s",
+      gen_tok_count / gen_total_time_sec);
+
+  return Error::Ok;
+}
+
+Error MTKLlamaRunner::inference(
+    LlamaRuntime& llama_runtime,
+    const std::unique_ptr<Tokenizer>& tokenizer,
+    const std::string& prompt,
+    std::function<void(const std::string&)> token_callback) {
+  // Tokenize input prompt
+  auto encode_res = tokenizer->encode(prompt, kAddBos, kAddEos);
+  ET_CHECK_OR_RETURN_ERROR(
+      encode_res.ok(), InvalidState, "Tokenizer failed to encode prompt");
+  const auto input_tokens = std::move(encode_res.get());
+
+  // Run prompt mode (pre-fill)
+  auto prefill_res = digest_prompt(llama_runtime, tokenizer, input_tokens);
+  ET_CHECK_OR_RETURN_ERROR(
+      prefill_res.ok(), InvalidState, "Failed to digest prompt");
+  const auto first_output_token = prefill_res.get();
+
+  // run generation mode (decoding)
+  return gen_response(llama_runtime, tokenizer, first_output_token, token_callback);
+}
+
+std::unique_ptr<Tokenizer> MTKLlamaRunner::load_tokenizer() {
+  std::unique_ptr<Tokenizer> tokenizer;
+  // Assumes that tokenizer type is Tiktoken
+  tokenizer = torch::executor::get_tiktoken_for_llama();
+  tokenizer->load(modelpaths_.tokenizer_path);
+  return tokenizer;
+}
diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h
new file mode 100644
index 00000000000..d9f85c20257
--- /dev/null
+++ b/examples/mediatek/executor_runner/mtk_llama_runner.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// A simple llama2 runner that includes preprocessing and post processing logic.
+// The module takes in a string as input and emits a string as output.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "llama_runner/LlamaConfig.h"
+#include "llama_runner/LlamaRuntime.h"
+using namespace torch::executor;
+using Stats = ::executorch::llm::Stats;
+
+class MTKLlamaRunner {
+ public:
+  explicit MTKLlamaRunner(
+      const std::string& model_path,
+      const std::string& tokenizer_path,
+      const float temperature = 0.8f);
+
+  bool is_loaded() const;
+  Error load();
+  Error generate(
+      const std::string& prompt,
+      int32_t seq_len = 128,
+      std::function<void(const std::string&)> token_callback = {},
+      std::function<void(const Stats&)> stats_callback = {});
+  void stop();
+
+  LlamaModelOptions get_model_options();
+  LlamaModelPaths get_model_paths();
+  Result<uint64_t> digest_prompt(
+      LlamaRuntime& llama_runtime,
+      const std::unique_ptr<Tokenizer>& tokenizer,
+      const std::vector<uint64_t> input_tokens);
+  Error gen_response(
+      LlamaRuntime& llama_runtime,
+      const std::unique_ptr<Tokenizer>& tokenizer,
+      const uint64_t input_token,
+      std::function<void(const std::string&)> token_callback);
+  Error inference(
+      LlamaRuntime& llama_runtime,
+      const std::unique_ptr<Tokenizer>& tokenizer,
+      const std::string& prompt,
+      std::function<void(const std::string&)> token_callback);
+  std::unique_ptr<Tokenizer> load_tokenizer();
+
+
+ private:
+  // model
+  const torch::executor::LlamaModelOptions modeloptions_;
+  const torch::executor::LlamaModelPaths modelpaths_;
+  std::unique_ptr<Tokenizer> tokenizer_;
+  std::unique_ptr<LlamaRuntime> runtime_;
+};
diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt
index ab1f3650102..9a1a14b113a 100644
--- a/extension/android/CMakeLists.txt
+++ b/extension/android/CMakeLists.txt
@@ -94,6 +94,26 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
     ${EXECUTORCH_ROOT}/examples/models/llama2/runner
     ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/runner
   )
+
+  target_sources(
+    executorch_jni PRIVATE
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/mtk_llama_runner.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/mask_builder.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/rotary_embedding.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/token_embedding.cpp
+  )
+  target_include_directories(
+    executorch_jni PRIVATE
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner
+  )
+  ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED)
+  SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so)
+  list(APPEND link_libraries neuron_backend libneuron_buffer_allocator)
 endif()
 
 if(TARGET quantized_kernels)
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index e6a9b5de58c..50476df5690 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -17,6 +17,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -68,13 +69,15 @@ class ExecuTorchLlamaJni
   int model_type_category_;
   std::unique_ptr runner_;
   std::unique_ptr multi_modal_runner_;
+  std::unique_ptr<MTKLlamaRunner> mtk_llama_runner_;
 
  public:
   constexpr static auto kJavaDescriptor =
       "Lorg/pytorch/executorch/LlamaModule;";
 
-  constexpr static int MODEL_TYPE_CATEGORY_LLM = 1;
+  constexpr static int MODEL_TYPE_CATEGORY_LLM = 3 /* should be put back to 1*/;
   constexpr static int MODEL_TYPE_CATEGORY_MULTIMODAL = 2;
+  constexpr static int MODEL_TYPE_MEDIATEK_LLAMA = 1 /* 3 */;
 
   static facebook::jni::local_ref initHybrid(
       facebook::jni::alias_ref,
@@ -113,6 +116,11 @@ class ExecuTorchLlamaJni
           model_path->toStdString().c_str(),
           tokenizer_path->toStdString().c_str(),
           temperature);
+    } else if (model_type_category == MODEL_TYPE_MEDIATEK_LLAMA) {
+      mtk_llama_runner_ = std::make_unique<MTKLlamaRunner>(
+          model_path->toStdString().c_str(),
+          tokenizer_path->toStdString().c_str(),
+          temperature);
     }
   }
@@ -152,6 +160,12 @@ class ExecuTorchLlamaJni
           [callback](std::string result) { callback->onResult(result); },
           [callback](const Stats& result) { callback->onStats(result); },
           echo);
+    } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) {
+      mtk_llama_runner_->generate(
+          prompt->toStdString(),
+          seq_len,
+          [callback](std::string result) { callback->onResult(result); },
+          [callback](const Stats& result) { callback->onStats(result); });
     }
     return 0;
   }
@@ -243,6 +257,8 @@ class ExecuTorchLlamaJni
       multi_modal_runner_->stop();
     } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
       runner_->stop();
+    } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) {
+      mtk_llama_runner_->stop();
     }
   }
@@ -251,6 +267,8 @@ class ExecuTorchLlamaJni
       return static_cast<jint>(multi_modal_runner_->load());
     } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
       return static_cast<jint>(runner_->load());
+    } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) {
+      return static_cast<jint>(mtk_llama_runner_->load());
     }
     return static_cast<jint>(Error::InvalidArgument);
   }