diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp
index 12968fa28c9..99dee7c9a7b 100644
--- a/backends/qualcomm/runtime/SharedBuffer.cpp
+++ b/backends/qualcomm/runtime/SharedBuffer.cpp
@@ -22,7 +22,7 @@ std::size_t std::hash::operator()(
   hash_val ^= std::hash()(info.pos);
   hash_val ^= std::hash()(info.tensor_bytes);
   for (int i = 0; i < info.rank; ++i) {
-    hash_val ^= info.shape[i];
+    hash_val ^= std::hash()(info.shape[i]);
   }
   hash_val ^= std::hash()(info.rank);
   hash_val ^= std::hash()(info.dtype);
diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
index 29e6686740b..f488800441b 100644
--- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
+++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
@@ -78,7 +78,9 @@ std::unique_ptr QnnBackendFactory::Create(
           options->soc_info(),
           htp_options);
       backend_params->qnn_mem_manager_ptr_ = std::make_unique(
-          implementation, backend_params->qnn_context_ptr_.get());
+          implementation,
+          backend_params->qnn_context_ptr_.get(),
+          options->log_level());
       backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED;
     } break;
     case QnnExecuTorchBackendType::kGpuBackend:
diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.cpp b/backends/qualcomm/runtime/backends/QnnMemManager.cpp
index 5f13e9b3ba6..e09d071075b 100644
--- a/backends/qualcomm/runtime/backends/QnnMemManager.cpp
+++ b/backends/qualcomm/runtime/backends/QnnMemManager.cpp
@@ -47,9 +47,12 @@ Error QnnMemManager::RegisterIonMem(
   }
   tensor_wrapper->SetMemHandle(handle);
   registered_map_.insert({handle, mem_ptr});
-  QNN_EXECUTORCH_LOG_INFO(
-      "Tensor %s is successfully registered to ION shared memory.",
-      tensor_wrapper->GetName().c_str());
+  if (log_level_ >= QnnExecuTorchLogLevel::kLogLevelInfo) {
+    QNN_EXECUTORCH_LOG_INFO(
+        "Tensor %s is successfully registered to ION shared memory.",
+        tensor_wrapper->GetName().c_str());
+  }
+
   return Error::Ok;
 }
@@ -92,9 +95,11 @@ Error QnnMemManager::RegisterCustomMem(
   }
   tensor_wrapper->SetMemHandle(handle);
   registered_map_.insert({handle, mem_ptr});
-  QNN_EXECUTORCH_LOG_INFO(
-      "Tensor %s is successfully registered to custom shared memory.",
-      tensor_wrapper->GetName().c_str());
+  if (log_level_ >= QnnExecuTorchLogLevel::kLogLevelInfo) {
+    QNN_EXECUTORCH_LOG_INFO(
+        "Tensor %s is successfully registered to custom shared memory.",
+        tensor_wrapper->GetName().c_str());
+  }
   return Error::Ok;
 }
diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.h b/backends/qualcomm/runtime/backends/QnnMemManager.h
index a0bdafab7b5..30bb64d78ad 100644
--- a/backends/qualcomm/runtime/backends/QnnMemManager.h
+++ b/backends/qualcomm/runtime/backends/QnnMemManager.h
@@ -21,8 +21,11 @@ class QnnMemManager {
  public:
   explicit QnnMemManager(
       const QnnImplementation& implementation,
-      QnnContext* context)
-      : implementation_(implementation), context_(context) {}
+      QnnContext* context,
+      QnnExecuTorchLogLevel log_level)
+      : implementation_(implementation),
+        context_(context),
+        log_level_(log_level) {}
   ~QnnMemManager() {
     DeRegisterMem();
   }
@@ -63,6 +66,7 @@ class QnnMemManager {
   const QnnImplementation& implementation_;
   QnnContext* context_;
+  QnnExecuTorchLogLevel log_level_;
   std::unordered_map registered_map_;
   std::unordered_map pre_registered_handles_;
   std::unordered_map
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 7d097fd45bf..20a9479897b 100644
---
a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -3463,7 +3463,7 @@ def test_llama3_2_1b(self): if self.pre_gen_pte: cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) - golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>" + golden_start_with = "<|start_header_id|>user<|end_header_id|>" p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: conn = listener.accept() diff --git a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt index 4d4f1c2e39d..a691cda44d3 100644 --- a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt @@ -28,8 +28,18 @@ list( ${CMAKE_CURRENT_LIST_DIR}/qnn_llama_runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h - ${CMAKE_CURRENT_LIST_DIR}/runner/io_manager.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/io_manager.h + ${CMAKE_CURRENT_LIST_DIR}/runner/decoder_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/decoder_runner.h + ${CMAKE_CURRENT_LIST_DIR}/runner/prompt_processor.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/prompt_processor.h + ${CMAKE_CURRENT_LIST_DIR}/runner/token_generator.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/token_generator.h + ${CMAKE_CURRENT_LIST_DIR}/runner/imem_alloc.h + ${CMAKE_CURRENT_LIST_DIR}/runner/client_mem.h + ${CMAKE_CURRENT_LIST_DIR}/runner/rpc_mem.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/rpc_mem.h + ${CMAKE_CURRENT_LIST_DIR}/runner/kv_manager.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/kv_manager.h ) list( @@ -42,7 +52,7 @@ list( # build qnn llama runner add_executable(qnn_llama_runner ${_llama_runner__srcs}) target_include_directories( - qnn_llama_runner PUBLIC ${_common_include_directories} + qnn_llama_runner PUBLIC ${_common_include_directories} ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include ) target_link_options_shared_lib(quantized_ops_lib) diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 375edf9fb6c..86de35a4c99 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -403,7 +403,7 @@ def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()): logging.info("Quantizing the model...") calibrate( self.get_example_inputs(self.llama_meta["get_use_kv_cache"]), - args.prompt, + args.prompt[0], fx_graph_module, tokenizer=tokenizer, ar_len=self.llama_meta["get_ar_len"], @@ -828,7 +828,7 @@ def permute(w, heads): return quant_attrs -def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_pte=""): +def inference(args, pte_filename, runtime_tokenizer_path, pre_gen_pte=""): workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama" if args.model_mode == "kv": @@ -854,14 +854,13 @@ def post_process(): outputs.append(f.read()) seq_len = args.max_seq_len + multi_prompts = " ".join([f'--prompt "{prompt}"' for prompt in args.prompt]) runner_args = " ".join( [ - f'--prompt "{args.prompt}"', + multi_prompts, f"--eval_mode {eval_mode}", f"--temperature {args.temperature}", f"--system_prompt '{args.system_prompt}'", - f"--logits_scale {quant_attrs['scale']}", - f"--logits_offset {quant_attrs['zero_point']}", ] ) @@ -1004,9 +1003,10 @@ def _build_parser(): parser.add_argument( "--prompt", - help="User prompts for llama.", + help="User prompts for Llama. 
When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only.", required=True, type=str, + nargs="+", ) parser.add_argument( @@ -1090,7 +1090,7 @@ def _build_parser(): def export_llama(args) -> None: if args.compile_only and args.pre_gen_pte: - exit("Cannot set both compile_only and pre_gen_pte as true") + raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true") if args.model_mode == "kv": pte_filename = "kv_llama_qnn" @@ -1126,29 +1126,15 @@ def export_llama(args) -> None: elif args.kv_updater == "shift_pointer": args.kv_updater = shift_pointer_updater else: - exit(f"Using an unkown kv update {args.kv_updater}") + raise RuntimeError(f"Using an unknown kv update {args.kv_updater}") if args.pre_gen_pte: - quant_attrs = json.load( - open(f"{args.pre_gen_pte}/{pte_filename}_quant_attrs.txt") - ) - inference( - args, quant_attrs, pte_filename, runtime_tokenizer_path, args.pre_gen_pte - ) - exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") + inference(args, pte_filename, runtime_tokenizer_path, args.pre_gen_pte) + print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") + return if args.compile_only: - quant_attrs = compile(args, pte_filename, tokenizer) - if quant_attrs: - json.dump( - { - "scale": quant_attrs["scale"], - "zero_point": quant_attrs["zero_point"], - }, - open(f"{args.artifact}/{pte_filename}_quant_attrs.txt", "w"), - ) - else: - logging.warning("Quant attributes of the logit is None.") + compile(args, pte_filename, tokenizer) if args.ip and args.port != -1: pte_path = f"{args.artifact}/{pte_filename}.pte" @@ -1161,24 +1147,18 @@ def export_llama(args) -> None: } ) ) - exit(f"Finish compile_only and save to {args.artifact}") + print(f"Finish compile_only and save to {args.artifact}") + return + + compile(args, pte_filename, tokenizer) + inference(args, pte_filename, runtime_tokenizer_path) + +def main(): + parser = _build_parser() + args = parser.parse_args() try: - quant_attrs = compile(args, pte_filename, tokenizer) - if quant_attrs: - logging.info( - f"Logit scale: {quant_attrs['scale']}; Logit offset: {quant_attrs['zero_point']}" - ) - json.dump( - { - "scale": quant_attrs["scale"], - "zero_point": quant_attrs["zero_point"], - }, - open(f"{args.artifact}/{pte_filename}_quant_attrs.txt", "w"), - ) - else: - logging.warning("Quant attributes of the logit is None.") - inference(args, quant_attrs, pte_filename, runtime_tokenizer_path) + export_llama(args) except Exception as e: if args.ip and args.port != -1: with Client((args.ip, args.port)) as conn: @@ -1187,12 +1167,6 @@ def export_llama(args) -> None: raise Exception(e) -def main(): - parser = _build_parser() - args = parser.parse_args() - export_llama(args) - - # flake8: noqa: C901 if __name__ == "__main__": main() diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp index f23cf2ec44a..938d298d077 100644 --- a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp @@ -34,7 +34,10 @@ DEFINE_string( "inference_speed.txt", "Records inference speed. For CI purpose."); DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff."); -DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt."); +DEFINE_string( + prompt, + "The answer to the ultimate question is", + "User prompts for Llama. 
When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only."); DEFINE_string( system_prompt, "", @@ -49,10 +52,8 @@ DEFINE_int32( "Total number of tokens to generate (prompt + output)."); DEFINE_int32( eval_mode, - 1, + 0, "0: TokenGenerator(kv) / 1: HybridMode (prefill+kv)"); -DEFINE_double(logits_scale, 0.0, "Logits scale"); -DEFINE_int32(logits_offset, 0, "Logits offset"); DEFINE_string( kv_updater, "How to update kv cache. Choose between SmartMask and ShiftPointer", @@ -72,20 +73,46 @@ std::vector CollectPrompts(int argc, char** argv) { return prompts; } +std::string get_formatted_prompt( + const std::string& prompt, + const std::string& system_prompt, + example::LlamaVersion llama_version) { + std::string formatted_prompt; + switch (llama_version) { + case example::LlamaVersion::kLlama2: + formatted_prompt.append(prompt); + break; + case example::LlamaVersion::kLlama3: + if (!system_prompt.empty()) { + formatted_prompt.append( + "<|start_header_id|>system<|end_header_id|>\n\n"); + formatted_prompt.append(system_prompt); + formatted_prompt.append("<|eot_id|>"); + } + formatted_prompt.append("<|start_header_id|>user<|end_header_id|>\n\n"); + formatted_prompt.append(prompt); + formatted_prompt.append( + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); + break; + default: + ET_CHECK_MSG(false, "unsupported llama version"); + break; + } + return formatted_prompt; +} + int main(int argc, char** argv) { std::vector prompts = CollectPrompts(argc, argv); gflags::ParseCommandLineFlags(&argc, &argv, true); // create llama runner example::Runner runner( - {FLAGS_model_path}, + FLAGS_model_path.c_str(), FLAGS_tokenizer_path.c_str(), FLAGS_performance_output_path.c_str(), - FLAGS_logits_scale, - FLAGS_logits_offset, FLAGS_temperature, FLAGS_eval_mode, - FLAGS_kv_updater, - FLAGS_num_iters); + FLAGS_kv_updater); + auto llama_version = runner.get_llama_version(); std::vector buf; buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char std::ofstream fout(FLAGS_output_path.c_str()); @@ -97,8 +124,10 @@ int main(int argc, char** argv) { // generate tokens & store inference output for (int i = 0; i < FLAGS_num_iters; i++) { for (const auto& prompt : prompts) { - runner.generate( - FLAGS_seq_len, prompt.c_str(), FLAGS_system_prompt.c_str(), callback); + std::string formatted_prompt; + formatted_prompt = get_formatted_prompt( + prompt, FLAGS_system_prompt, llama_version.get()); + runner.generate(formatted_prompt.c_str(), FLAGS_seq_len, callback); } } fout.write(buf.data(), buf.size()); diff --git a/examples/qualcomm/oss_scripts/llama/runner/client_mem.h b/examples/qualcomm/oss_scripts/llama/runner/client_mem.h new file mode 100644 index 00000000000..0fd535796de --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/client_mem.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace example { +/** + * @class ClientMem + * @brief Final class for client buffer allocation, implementing IBufferAlloc + * interface. Used for SHIFT_POINTER mode. 
+ */ +class ClientMem final : public IMemAlloc { + public: + ClientMem(){}; + // Disable copy constructors, r-value referencing, etc + ClientMem(const ClientMem&) = delete; + ClientMem& operator=(const ClientMem&) = delete; + ClientMem(ClientMem&&) = delete; + ClientMem& operator=(ClientMem&&) = delete; + virtual ~ClientMem(){}; + /** + * @brief Allocate buffer of specified size with vector. + * @param data_size Size of the data to allocate. + * @return Pointer to the allocated buffer. + */ + std::byte* allocate(size_t data_size) override { + allocated_buffers_.push_back(std::vector(data_size)); + return allocated_buffers_.back().data(); + }; + // Only used for SMART_MASK mode + void add_memory_info( + void* data_ptr, + size_t data_size, + executorch::runtime::TensorInfo tensor_info) override {}; + + private: + std::vector> allocated_buffers_; +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.cpp new file mode 100644 index 00000000000..ec5d9746daa --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Given inputs, run a text decoder and return logits. + +#include + +#include +using executorch::aten::Tensor; +using executorch::extension::Module; +using executorch::extension::llm::Sampler; +using executorch::llm::kTopp; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::Result; + +namespace example { + +DecoderRunner::DecoderRunner( + Module* module, + int32_t vocab_size, + float temperature) + : module_(module), + sampler_(std::make_unique( + vocab_size, + temperature, + kTopp, + static_cast(std::time(nullptr)))) {} + +Error DecoderRunner::set_outputs( + const std::string& method_name, + std::vector output_values) { + for (size_t i = 0; i < output_values.size(); ++i) { + ET_CHECK_OK_OR_RETURN_ERROR( + module_->set_output(method_name, output_values[i], i)); + } + return Error::Ok; +} + +Error DecoderRunner::load(const std::vector& method_names) { + if (is_method_loaded(method_names)) { + return Error::Ok; + } + for (const std::string& method_name : method_names) { + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(method_name)); + } + return Error::Ok; +} + +bool DecoderRunner::is_method_loaded( + const std::vector& method_names) { + bool method_loaded = true; + for (const std::string& method_name : method_names) { + method_loaded &= module_->is_method_loaded(method_name); + } + return method_loaded; +} + +// This function is functional, meaning it shouldn't modify any state of the +// input. It should be safe to call multiple times with the same inputs. The +// outer loop (call site) is responsible for managing state. 
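[Editorial note, not part of the patch] The comment above describes a "functional step, stateful caller" split. Below is a minimal, self-contained sketch of that calling pattern under assumed names (step, the greedy argmax loop); it is not the ExecuTorch API, only an illustration of why keeping step() pure lets the outer loop own every piece of mutable state.

// Standalone sketch: the step function is pure; the caller owns the state.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

using Logits = std::vector<float>;

// Stateless "decode step": same inputs always yield the same logits.
Logits step(const std::vector<int32_t>& tokens) {
  Logits logits(8, 0.0f);
  logits[tokens.back() % 8] = 1.0f; // stand-in for real model inference
  return logits;
}

int main() {
  std::vector<int32_t> tokens = {1}; // all mutable state lives at the call site
  for (int pos = 0; pos < 4; ++pos) {
    Logits logits = step(tokens);
    int32_t next = static_cast<int32_t>(
        std::max_element(logits.begin(), logits.end()) - logits.begin());
    tokens.push_back(next); // state update happens outside step()
  }
  for (int32_t t : tokens) std::cout << t << ' ';
  std::cout << '\n';
  return 0;
}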
+Result DecoderRunner::step( + const std::string& method_name, + std::vector& inputs) { + Result> outputs_res = + module_->execute(method_name, inputs); + ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); + ET_CHECK_MSG( + outputs_res.get()[0].isTensor(), + "Non Tensor Output returned from executing LLM"); + + // Return the logits tensor + return outputs_res.get()[0].toTensor(); +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h new file mode 100644 index 00000000000..888e9acd421 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace example { +class DecoderRunner { + public: + DecoderRunner( + executorch::extension::Module* module, + int32_t vocab_size, + float temperature); + /** + * Run LLM text decoder with inputs to generate next token. + * @param inputs The inputs to the LLM Module. + * @return The output of the LLM Module. This will be a tensor of logits. + */ + executorch::runtime::Result step( + const std::string& method_name, + std::vector& inputs); + + /** + * Once KV Cache output data pointer change, need to set + * the output for specify method name in the module. + * @return The error code. + */ + executorch::runtime::Error set_outputs( + const std::string& method_name, + std::vector output_values); + + /** + * Load the Module for text decode purpose. + * @return The error code. + */ + executorch::runtime::Error load(const std::vector& method_names); + /** + * Check if the required methods in the Module is loaded. + * @return True if the Module is loaded, false otherwise. + */ + bool is_method_loaded(const std::vector& method_names); + + /** + * Sample the next token from the logits tensor. + * @param logits_tensor The logits tensor. + * @return The next token. + */ + inline int32_t logits_to_token( + const executorch::aten::Tensor& logits_tensor, + int64_t pos) { + auto* logits = logits_tensor.mutable_data_ptr(); + auto num_tokens = logits_tensor.size(1); + auto vocab_size = logits_tensor.size(2); + static std::vector logits_f(vocab_size); + auto* logits_last = logits; + // offset to the meaningful logit we want for prefill model. + if (num_tokens > 1) { + logits_last += pos * vocab_size; + } + // Discard dequantization (converting uint16_t to float) because the + // relative order of elements remains the same without conversion + for (int i = 0; i < vocab_size; i++) { + logits_f[i] = logits_last[i]; + } + return sampler_->sample(logits_f.data()); + } + + protected: + executorch::extension::Module* module_; + std::unique_ptr sampler_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/imem_alloc.h b/examples/qualcomm/oss_scripts/llama/runner/imem_alloc.h new file mode 100644 index 00000000000..59680256a29 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/imem_alloc.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once +#include +#include + +namespace example { +/** + * @class IMemAlloc + * @brief Interface for buffer allocation. + */ +class IMemAlloc { + public: + IMemAlloc(){}; + virtual ~IMemAlloc(){}; + virtual std::byte* allocate(size_t data_size) = 0; + virtual void add_memory_info( + void* data_ptr, + size_t data_size, + executorch::runtime::TensorInfo tensor_info) = 0; +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp deleted file mode 100644 index c2bf7b04fbb..00000000000 --- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp +++ /dev/null @@ -1,1435 +0,0 @@ -/* - * Copyright (c) Qualcomm Innovation Center, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -using executorch::aten::Tensor; -using executorch::aten::TensorImpl; -using executorch::extension::Module; -using executorch::runtime::Error; -using executorch::runtime::MemoryAllocator; -using executorch::runtime::MethodMeta; -using executorch::runtime::Result; -using executorch::runtime::TensorInfo; - -namespace example { - -IoMgrBase::IoMgrBase(std::vector>& modules) - : data_ptr_(nullptr, [](void*) {}), modules_(modules) {} - -IoMgrBase::~IoMgrBase() {} - -void* IoMgrBase::get_mutable_ptr() { - return data_ptr_.get(); -} - -std::vector IoMgrBase::get_input_tensors( - int shard_index, - const std::string& method_name) { - std::vector ret; - ret.reserve(input_tensors_.size()); - for (TensorImpl* impl : input_tensors_[method_name][shard_index]) { - ret.emplace_back(Tensor(impl)); - } - return ret; -} - -std::vector IoMgrBase::get_output_tensors( - int shard_index, - const std::string& method_name) { - std::vector ret; - ret.reserve(output_tensors_[method_name][shard_index].size()); - for (TensorImpl* impl : output_tensors_[method_name][shard_index]) { - ret.emplace_back(Tensor(impl)); - } - return ret; -} - -ShiftPointerIoMgr::ShiftPointerIoMgr( - std::vector>& modules, - int32_t context_len, - int32_t prefill_ar_len, - int32_t prefill_cache_len, - int32_t kv_ar_len, - int32_t kv_cache_len, - int32_t vocab_size, - int32_t num_layers, - int32_t head_dim, - int32_t num_heads, - EvalMode eval_mode, - const std::string& prefill_forward_name, - const std::string& kv_forward_name, - const bool use_int64_token) - : IoMgrBase(modules), - shard_layers_({num_layers}), - context_len_(context_len), - kv_ar_len_(kv_ar_len), - kv_cache_len_(kv_cache_len), - prefill_ar_len_(prefill_ar_len), - prefill_cache_len_(prefill_cache_len), - vocab_size_(vocab_size), - num_layers_(num_layers), - head_dim_(head_dim), - num_heads_(num_heads), - eval_mode_(eval_mode), - prefill_forward_name_(prefill_forward_name), - kv_forward_name_(kv_forward_name), - use_int64_token_(use_int64_token) { - if (!prefill_forward_name_.empty()) { - input_tensors_[prefill_forward_name_] = - std::vector>(modules.size()); - output_tensors_[prefill_forward_name_] = - std::vector>(modules.size()); - k_cache_in_[prefill_forward_name_] = - std::vector>(); - v_cache_in_[prefill_forward_name_] = - std::vector>(); - k_cache_out_[prefill_forward_name_] = - std::vector>(); - v_cache_out_[prefill_forward_name_] = - std::vector>(); - } - if (!kv_forward_name_.empty()) { - input_tensors_[kv_forward_name_] = - std::vector>(modules.size()); - output_tensors_[kv_forward_name_] = - std::vector>(modules.size()); - 
k_cache_in_[kv_forward_name_] = - std::vector>(); - v_cache_in_[kv_forward_name_] = - std::vector>(); - k_cache_out_[kv_forward_name_] = - std::vector>(); - v_cache_out_[kv_forward_name_] = - std::vector>(); - } - - data_ptr_ = std::unique_ptr( - new IO, [](void* ptr) { delete static_cast(ptr); }); -} - -void ShiftPointerIoMgr::init_io() { - IO* ptr = static_cast(data_ptr_.get()); - std::memset(ptr, 0, sizeof(IO)); - - int32_t max_ar_len = std::max(kv_ar_len_, prefill_ar_len_); - int32_t k_in_size = (head_dim_ + 1) * kv_cache_len_; - // Use context length to prevent exceeding the range when the AR-N model - // updates the last block in hybrid mode. - int32_t v_cache_size = (num_heads_ + 1) * context_len_ * head_dim_; - int32_t k_cache_out_size = num_heads_ * max_ar_len * head_dim_; - - // Init kv vector shape, general enough to be shared across all modes. - ptr->k_cache_out.reserve(num_layers_); - ptr->v_cache.reserve(num_layers_); - for (int layer = 0; layer < num_layers_; layer++) { - ptr->k_cache_out.emplace_back(std::vector(k_cache_out_size)); - ptr->v_cache.emplace_back(std::vector(v_cache_size)); - } - - auto init_prefill = [&]() { - ptr->prefill_input_toks.resize(prefill_ar_len_, 0); - ptr->prefill_input_pos.resize(prefill_ar_len_, 0); - ptr->prefill_attention_mask.resize((prefill_ar_len_ * context_len_), 0); - ptr->prefill_logits.resize(prefill_ar_len_ * vocab_size_); - }; - - auto init_kv = [&]() { - ptr->kv_logits.resize(kv_ar_len_ * vocab_size_); - ptr->kv_attention_mask.resize((kv_ar_len_ * context_len_), 0); - ptr->k_cache.reserve(num_layers_); - for (int layer = 0; layer < num_layers_; layer++) { - ptr->k_cache.emplace_back(); - ptr->k_cache[layer].reserve(num_heads_); - for (int head = 0; head < num_heads_; head++) { - ptr->k_cache[layer].emplace_back(std::vector(k_in_size)); - } - } - }; - - switch (eval_mode_) { - case EvalMode::kKVCached: - init_kv(); - break; - case EvalMode::kHybrid: - init_prefill(); - init_kv(); - break; - default: - break; - } -} - -void ShiftPointerIoMgr::reset_io( - const std::vector>& prefill_methods_meta, - const std::vector< - executorch::runtime::Result>& - kv_methods_meta) { - IO* ptr = static_cast(data_ptr_.get()); - std::fill(ptr->prefill_input_pos.begin(), ptr->prefill_input_pos.end(), 0); - ptr->kv_input_pos = 0; - std::fill( - ptr->prefill_attention_mask.begin(), - ptr->prefill_attention_mask.end(), - 0); - std::fill(ptr->kv_attention_mask.begin(), ptr->kv_attention_mask.end(), 0); - - input_tensors_[kv_forward_name_].clear(); - input_tensors_[kv_forward_name_].resize(modules_.size()); - output_tensors_[kv_forward_name_].clear(); - output_tensors_[kv_forward_name_].resize(modules_.size()); - - k_cache_in_[kv_forward_name_].clear(); - v_cache_in_[kv_forward_name_].clear(); - k_cache_out_[kv_forward_name_].clear(); - v_cache_out_[kv_forward_name_].clear(); - - input_tensors_[prefill_forward_name_].clear(); - input_tensors_[prefill_forward_name_].resize(modules_.size()); - output_tensors_[prefill_forward_name_].clear(); - output_tensors_[prefill_forward_name_].resize(modules_.size()); - - k_cache_in_[prefill_forward_name_].clear(); - v_cache_in_[prefill_forward_name_].clear(); - k_cache_out_[prefill_forward_name_].clear(); - v_cache_out_[prefill_forward_name_].clear(); - - switch (eval_mode_) { - case EvalMode::kKVCached: - prepare_kv_io(kv_methods_meta); - break; - case EvalMode::kHybrid: - prepare_prefill_io(prefill_methods_meta); - prepare_kv_io(kv_methods_meta); - break; - default: - ET_CHECK_MSG(false, "unsupported mode"); - break; - } 
-} -void ShiftPointerIoMgr::prepare_kv_io( - const std::vector>& methods_meta) { - for (int i = 0; i < modules_.size(); ++i) { - ET_CHECK_MSG( - methods_meta[i].ok(), - "Failed to get method_meta 0x%x", - static_cast(methods_meta[i].error())); - } - - ET_CHECK_MSG(!(kv_forward_name_.empty()), "kv forward name is empty"); - IO* ptr = static_cast(data_ptr_.get()); - - // [I]: input_tokens - Result kv_input_toks = methods_meta[0]->input_tensor_meta(0); - kv_input_toks_ = std::make_unique( - kv_input_toks->scalar_type(), - kv_input_toks->sizes().size(), - const_cast(kv_input_toks->sizes().data()), - &ptr->kv_input_toks, - const_cast(kv_input_toks->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(kv_input_toks_.get()); - - // [I]: atten_mask - Result kv_attention_mask = methods_meta[0]->input_tensor_meta(1); - kv_attention_mask_ = std::make_unique( - kv_attention_mask->scalar_type(), - kv_attention_mask->sizes().size(), - const_cast(kv_attention_mask->sizes().data()), - ptr->kv_attention_mask.data(), - const_cast( - kv_attention_mask->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(kv_attention_mask_.get()); - - // [I]: input_pos - Result kv_input_pos = methods_meta[0]->input_tensor_meta(2); - kv_input_pos_ = std::make_unique( - kv_input_pos->scalar_type(), - kv_input_pos->sizes().size(), - const_cast(kv_input_pos->sizes().data()), - &ptr->kv_input_pos, - const_cast(kv_input_pos->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(kv_input_pos_.get()); - - // [I] kv_cache - int index = 3; // bypass input_tokens, atten_mask, input_pos - for (int offset = 0, shard_index = 0, v_stride = kv_cache_len_ * head_dim_; - shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->input_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_in_[kv_forward_name_] - : v_cache_in_[kv_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? static_cast(ptr->k_cache[layer + offset][head].data()) - : static_cast( - ptr->v_cache[layer + offset].data() + head * v_stride); - - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - input_tensors_[kv_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } - - // [O]: logits - int logit_index = 0; - Result logits = - methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); - kv_logits_ = std::make_unique( - logits->scalar_type(), - logits->sizes().size(), - const_cast(logits->sizes().data()), - ptr->kv_logits.data(), - const_cast(logits->dim_order().data())); - output_tensors_[kv_forward_name_][modules_.size() - 1].push_back( - kv_logits_.get()); - - // [O] kv_cache - index = 1; - // Iterate through all kv cache outputs. - // For k, we store it in k_cache_out and update to k_cache later. - // For v, we append the output to the end of v_cache, - // which serves as both input and output. 
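[Editorial note, not part of the patch] A minimal sketch of the shift-pointer V-cache layout the comment above relies on, using hypothetical sizes: each head's write window sits one stride past its read window in the same flat buffer, so "appending" a step's output means advancing both pointers rather than copying the cache.

// Standalone sketch of the shift-pointer V-cache layout (hypothetical sizes).
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  constexpr int num_heads = 2, head_dim = 4, cache_len = 3;
  constexpr int stride = cache_len * head_dim;
  // One extra stride so every head's output region follows its input region.
  std::vector<uint8_t> v_cache((num_heads + 1) * stride, 0);

  for (int head = 0; head < num_heads; ++head) {
    uint8_t* in = v_cache.data() + head * stride;       // read window
    uint8_t* out = v_cache.data() + (head + 1) * stride; // write window
    // After one decode step the freshly written row already sits right past
    // the read window, so the "update" is just shifting both pointers.
    in += head_dim;
    out += head_dim;
    std::printf("head %d: in offset %ld, out offset %ld\n", head,
                static_cast<long>(in - v_cache.data()),
                static_cast<long>(out - v_cache.data()));
  }
  return 0;
}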
- for (int offset = 0, shard_index = 0, v_stride = kv_cache_len_ * head_dim_; - shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->output_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_out_[kv_forward_name_] - : v_cache_out_[kv_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? static_cast( - ptr->k_cache_out[layer + offset].data() + - (head * head_dim_)) - : static_cast( - ptr->v_cache[layer + offset].data() + - (head + 1) * v_stride); - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - output_tensors_[kv_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } -} - -void ShiftPointerIoMgr::prepare_prefill_io( - const std::vector>& methods_meta) { - for (int i = 0; i < modules_.size(); ++i) { - ET_CHECK_MSG( - methods_meta[i].ok(), - "Failed to get method_meta 0x%x", - static_cast(methods_meta[i].error())); - } - - ET_CHECK_MSG( - !(prefill_forward_name_.empty()), "prefill forward name is empty"); - - IO* ptr = static_cast(data_ptr_.get()); - - // [I]: prefill_input_tokens - Result prefill_input_toks = methods_meta[0]->input_tensor_meta(0); - prefill_input_toks_ = std::make_unique( - prefill_input_toks->scalar_type(), - prefill_input_toks->sizes().size(), - const_cast(prefill_input_toks->sizes().data()), - ptr->prefill_input_toks.data(), - const_cast( - prefill_input_toks->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back(prefill_input_toks_.get()); - // [I]: prefill_attention_mask - for (int i = 0; i < prefill_ar_len_; ++i) { - for (int j = 0, - offset = i * context_len_ + (context_len_ - prefill_ar_len_); - j < prefill_ar_len_; - ++j) { - if (i >= j) { - ptr->prefill_attention_mask[j + offset] = 65535; - } - } - } - Result prefill_attention_mask = - methods_meta[0]->input_tensor_meta(1); - prefill_attention_mask_ = std::make_unique( - prefill_attention_mask->scalar_type(), - prefill_attention_mask->sizes().size(), - const_cast( - prefill_attention_mask->sizes().data()), - ptr->prefill_attention_mask.data(), - const_cast( - prefill_attention_mask->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back( - prefill_attention_mask_.get()); - - if (!is_bert()) { - // [I]: prefill_input_pos - Result prefill_input_pos = - methods_meta[0]->input_tensor_meta(2); - prefill_input_pos_ = std::make_unique( - prefill_input_pos->scalar_type(), - prefill_input_pos->sizes().size(), - const_cast(prefill_input_pos->sizes().data()), - ptr->prefill_input_pos.data(), - const_cast( - prefill_input_pos->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back( - prefill_input_pos_.get()); - - // [I] kv_cache - int index = 3; // bypass input_tokens, atten_mask, input_pos - // Add prefill offset to align the v_out pointer with the decode model. 
- for (int offset = 0, - shard_index = 0, - v_stride = kv_cache_len_ * head_dim_, - prefill_offset = (kv_cache_len_ - prefill_cache_len_) * head_dim_; - shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->input_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_in_[prefill_forward_name_] - : v_cache_in_[prefill_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? static_cast(ptr->k_cache[layer + offset][head].data()) - : static_cast( - ptr->v_cache[layer + offset].data() + head * v_stride + - prefill_offset); - - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - input_tensors_[prefill_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } - } - // [O]: logits - int logit_index = 0; - Result logits = - methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); - prefill_logits_ = std::make_unique( - logits->scalar_type(), - logits->sizes().size(), - const_cast(logits->sizes().data()), - ptr->prefill_logits.data(), - const_cast(logits->dim_order().data())); - output_tensors_[prefill_forward_name_][modules_.size() - 1].push_back( - prefill_logits_.get()); - - // [O] kv_cache - int index = 1; - // In hybrid mode, we use kv mode cache len for v stride since we want to - // update prefill's result onto kv modes input. - int32_t prefill_k_stride = prefill_ar_len_ * head_dim_; - int32_t prefill_v_stride = kv_cache_len_ * head_dim_; - - for (int offset = 0, shard_index = 0; shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->output_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_out_[prefill_forward_name_] - : v_cache_out_[prefill_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? static_cast( - ptr->k_cache_out[layer + offset].data() + - head * prefill_k_stride) - : static_cast( - ptr->v_cache[layer + offset].data() + - (head + 1) * prefill_v_stride); - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - output_tensors_[prefill_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } -} - -void ShiftPointerIoMgr::update_prefill_to_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - ET_CHECK_MSG(kv_cache_len_ != 0, "k_cache_len_ should not equal to 0"); - IO* ptr = static_cast(data_ptr_.get()); - - ptr->kv_input_toks = - use_int64_token_ ? cur_token : static_cast(cur_token); - ptr->kv_input_pos = static_cast(pos); - // If prompt len is 30, prefill will handle to pos = 30. - // At this point, pos should be 31. 
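[Editorial note, not part of the patch] A small sketch of how the mask update in the loop below behaves, with hypothetical sizes: 65535 marks a position as attendable, and because the shift-pointer cache keeps valid history right-aligned, entries are switched on from the end of the row backwards.

// Standalone sketch of the right-aligned attention-mask update.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  constexpr int kv_cache_len = 8;
  std::vector<uint16_t> mask(kv_cache_len + 1, 0); // one row of the mask
  int64_t pos = 3; // number of tokens already consumed by prefill

  for (int i = 0; i < pos + 1; i++) {
    mask[kv_cache_len - i] = 65535; // enable the history plus the current token
  }
  for (uint16_t m : mask) std::printf("%d ", m ? 1 : 0);
  std::printf("\n"); // prints: 0 0 0 0 0 1 1 1 1
  return 0;
}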
- for (int i = 0; i < pos + 1; i++) { - ptr->kv_attention_mask[kv_cache_len_ - i] = 65535; - } - - // update v_cache - std::vector>& v_cache_in = - v_cache_in_[kv_forward_name_]; - std::vector>& v_cache_out = - v_cache_out_[kv_forward_name_]; - for (int i = 0, v_cache_stride = head_dim_ * pos; i < v_cache_in.size(); - i++) { - v_cache_in[i]->set_data( - v_cache_in[i]->mutable_data() + v_cache_stride); - v_cache_out[i]->set_data( - v_cache_out[i]->mutable_data() + v_cache_stride); - } - for (int shard = 0; shard < output_tensors.size(); shard++) { - for (int index = 0; index < output_tensors[shard].size(); index++) { - ET_CHECK_MSG( - modules_[shard]->set_output( - kv_forward_name_, output_tensors[shard][index], index) == - Error::Ok, - "Failed to set output tensor for module %d's %d'th output " - "while updating kv_cache output tensors", - shard, - index); - } - } - - // Update k_cache - std::vector>& k_cache_in = - k_cache_in_[kv_forward_name_]; - std::vector>& k_cache_out = - k_cache_out_[prefill_forward_name_]; - // copy from last to prevent from overwriting values - size_t copied_size = pos * sizeof(uint8_t); - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - if (is_bert()) { - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = kv_cache_len_; j < head_dim_; - ++j, offset += kv_cache_len_) { - for (int k = 0, k_stride = j * prefill_ar_len_; k < pos; k++) { - ptr_in[offset + k] = ptr_out[k_stride + k]; - } - } - } else { - for (int j = head_dim_; j > -1; --j) { - memcpy( - ptr_in + j * kv_cache_len_, - ptr_in + j * prefill_cache_len_, - copied_size); - } - } - k_cache_in[i]->set_data(ptr_in + pos); - } -} - -void ShiftPointerIoMgr::update_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - IO* ptr = static_cast(data_ptr_.get()); - // update input_tok - ptr->kv_input_toks = - use_int64_token_ ? 
cur_token : static_cast(cur_token); - // update position_ids - ptr->kv_input_pos = static_cast(pos); - // update causal mask for next token - ptr->kv_attention_mask[kv_cache_len_ - pos] = 65535; - - // update v_cache - auto& v_cache_in = v_cache_in_[kv_forward_name_]; - auto& v_cache_out = v_cache_out_[kv_forward_name_]; - for (int i = 0; i < v_cache_in.size(); i++) { - v_cache_in[i]->set_data(v_cache_in[i]->mutable_data() + head_dim_); - v_cache_out[i]->set_data( - v_cache_out[i]->mutable_data() + head_dim_); - } - - for (int shard = 0; shard < output_tensors.size(); shard++) { - for (int index = 0; index < output_tensors[shard].size(); index++) { - ET_CHECK_MSG( - modules_[shard]->set_output( - kv_forward_name_, output_tensors[shard][index], index) == - Error::Ok, - "failed to set output tensor for module %d's %d'th output " - "while updating kv_cache output tensors", - shard, - index); - } - } - - auto& k_cache_in = k_cache_in_[kv_forward_name_]; - auto& k_cache_out = k_cache_out_[kv_forward_name_]; - // update k_cache by single thread, this part is cpu cache sensitive - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = kv_cache_len_; j < head_dim_; - ++j, offset += kv_cache_len_) { - ptr_in[offset] = ptr_out[j]; - } - k_cache_in[i]->set_data(ptr_in + 1); - } -} - -void ShiftPointerIoMgr::update_prefill_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - (void)cur_token; - (void)output_tensors; - - if (!is_bert()) { - // update v_cache - auto& v_cache_in = v_cache_in_[prefill_forward_name_]; - auto& v_cache_out = v_cache_out_[prefill_forward_name_]; - for (int i = 0; i < v_cache_in.size(); i++) { - v_cache_in[i]->set_data( - v_cache_in[i]->mutable_data() + prefill_ar_len_ * head_dim_); - v_cache_out[i]->set_data( - v_cache_out[i]->mutable_data() + - prefill_ar_len_ * head_dim_); - } - - for (int shard = 0; shard < output_tensors.size(); shard++) { - for (int index = 0; index < output_tensors[shard].size(); index++) { - ET_CHECK_MSG( - modules_[shard]->set_output( - prefill_forward_name_, output_tensors[shard][index], index) == - Error::Ok, - "failed to set output tensor for module %d's %d'th output " - "while updating kv_cache output tensors", - shard, - index); - } - } - - auto& k_cache_in = k_cache_in_[prefill_forward_name_]; - auto& k_cache_out = k_cache_out_[prefill_forward_name_]; - // update k_cache by single thread, this part is cpu cache sensitive - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = prefill_cache_len_; j < head_dim_; - ++j, offset += prefill_cache_len_) { - for (int k = 0, k_stride = j * prefill_ar_len_; k < prefill_ar_len_; - k++) { - ptr_in[offset + k] = ptr_out[k_stride + k]; - } - } - k_cache_in[i]->set_data(ptr_in + prefill_ar_len_); - } - } -} - -void ShiftPointerIoMgr::fill_prefill_toks( - int64_t start_pos, - std::vector& prompt_tokens) { - IO* ptr = static_cast(get_mutable_ptr()); - for (int i = 0; i < prefill_ar_len_; i++) { - if (!is_bert()) { - ptr->prefill_input_pos[i] = start_pos + i; - } - - if (start_pos + i < prompt_tokens.size()) { - // Support CPU 4-bit embedding, which requires int64 input. - // However, for QNN embedding, only int32 input is needed. - // Therefore, we need to cast to the correct type to write the data. 
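[Editorial note, not part of the patch] A minimal sketch of the dual-dtype token write the comment above describes, with hypothetical sizes and flag: the same backing buffer is filled as int64 for the CPU 4-bit embedding path, or reinterpreted and packed as int32 for the QNN embedding path.

// Standalone sketch of writing tokens as either int64 or packed int32.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> input_toks(4, 0);            // backing store
  std::vector<uint64_t> prompt_tokens = {11, 22, 33, 44};
  bool use_int64_token = false;                     // hypothetical flag

  for (size_t i = 0; i < prompt_tokens.size(); ++i) {
    if (use_int64_token) {
      input_toks[i] = static_cast<int64_t>(prompt_tokens[i]);
    } else {
      // Pack int32 values into the front of the same buffer.
      int32_t* toks32 = reinterpret_cast<int32_t*>(input_toks.data());
      toks32[i] = static_cast<int32_t>(prompt_tokens[i]);
    }
  }
  std::printf("first word as int32: %d\n",
              reinterpret_cast<int32_t*>(input_toks.data())[0]);
  return 0;
}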
- if (use_int64_token_) { - ptr->prefill_input_toks[i] = prompt_tokens[start_pos + i]; - } else { - int32_t* prefill_input_toks_ptr = - reinterpret_cast(ptr->prefill_input_toks.data()); - prefill_input_toks_ptr[i] = - static_cast(prompt_tokens[start_pos + i]); - } - } - if (start_pos >= prefill_ar_len_) { - for (int j = 0, - offset = i * context_len_ + - (context_len_ - prefill_ar_len_ - start_pos); - j < prefill_ar_len_; - ++j) { - ptr->prefill_attention_mask[offset + j] = 65535; - } - } - } -} - -void ShiftPointerIoMgr::fill_kv_tok_mask(int64_t pos, int64_t cur_token) { - IO* ptr = static_cast(get_mutable_ptr()); - ptr->kv_input_toks = - use_int64_token_ ? cur_token : static_cast(cur_token); - ptr->kv_input_pos = static_cast(pos); - ; - ptr->kv_attention_mask[kv_cache_len_] = 65535; -} - -SmartMaskIoMgr::SmartMaskIoMgr( - std::vector>& modules, - int32_t context_len, - int32_t prefill_ar_len, - int32_t prefill_cache_len, - int32_t kv_ar_len, - int32_t kv_cache_len, - int32_t vocab_size, - int32_t num_layers, - int32_t head_dim, - int32_t num_heads, - EvalMode eval_mode, - const std::string& prefill_forward_name, - const std::string& kv_forward_name, - const bool use_int64_token) - : IoMgrBase(modules), - shard_layers_({num_layers}), - context_len_(context_len), - kv_ar_len_(kv_ar_len), - kv_cache_len_(kv_cache_len), - prefill_ar_len_(prefill_ar_len), - prefill_cache_len_(prefill_cache_len), - vocab_size_(vocab_size), - num_layers_(num_layers), - head_dim_(head_dim), - num_heads_(num_heads), - eval_mode_(eval_mode), - prefill_forward_name_(prefill_forward_name), - kv_forward_name_(kv_forward_name), - use_int64_token_(use_int64_token) { - if (!prefill_forward_name_.empty()) { - input_tensors_[prefill_forward_name_] = - std::vector>(modules.size()); - output_tensors_[prefill_forward_name_] = - std::vector>(modules.size()); - k_cache_in_[prefill_forward_name_] = - std::vector>(); - v_cache_in_[prefill_forward_name_] = - std::vector>(); - k_cache_out_[prefill_forward_name_] = - std::vector>(); - v_cache_out_[prefill_forward_name_] = - std::vector>(); - } - if (!kv_forward_name_.empty()) { - input_tensors_[kv_forward_name_] = - std::vector>(modules.size()); - output_tensors_[kv_forward_name_] = - std::vector>(modules.size()); - k_cache_in_[kv_forward_name_] = - std::vector>(); - v_cache_in_[kv_forward_name_] = - std::vector>(); - k_cache_out_[kv_forward_name_] = - std::vector>(); - v_cache_out_[kv_forward_name_] = - std::vector>(); - } - - data_ptr_ = std::unique_ptr( - new IO, [](void* ptr) { delete static_cast(ptr); }); -} - -std::unordered_map SmartMaskIoMgr::get_io_elements() { - int32_t max_ar_len = std::max(kv_ar_len_, prefill_ar_len_); - size_t cache_in_ele = num_layers_ * num_heads_ * head_dim_ * kv_cache_len_; - size_t cache_out_ele = num_layers_ * num_heads_ * head_dim_ * max_ar_len; - return std::unordered_map{ - {"kv_input_toks_ele", kv_ar_len_}, - {"kv_input_pos_ele", kv_ar_len_}, - {"cache_in_ele", cache_in_ele}, - {"cache_out_ele", cache_out_ele}, - {"kv_attention_mask_ele", kv_ar_len_ * context_len_}, - {"kv_logits_ele", kv_ar_len_ * vocab_size_}, - {"prefill_input_toks_ele", prefill_ar_len_}, - {"prefill_input_pos_ele", prefill_ar_len_}, - {"prefill_attention_mask_ele", prefill_ar_len_ * context_len_}, - {"prefill_logits_ele", prefill_ar_len_ * vocab_size_}}; -} - -std::unordered_map SmartMaskIoMgr::get_io_bytes() { - std::unordered_map element_map = get_io_elements(); - auto align = [](size_t byte) { - size_t alignment = MemoryAllocator::kDefaultAlignment; - return byte % 
alignment == 0 ? byte - : byte + - (static_cast(alignment) - - byte % static_cast(alignment)); - }; - return std::unordered_map{ - {"kv_input_toks_bytes", - align(element_map["kv_input_toks_ele"] * sizeof(int32_t))}, - {"kv_input_pos_bytes", - align(element_map["kv_input_pos_ele"] * sizeof(int32_t))}, - {"cache_in_bytes", align(element_map["cache_in_ele"] * sizeof(uint8_t))}, - {"cache_out_bytes", - align(element_map["cache_out_ele"] * sizeof(uint8_t))}, - {"kv_attention_mask_bytes", - align(element_map["kv_attention_mask_ele"] * sizeof(uint16_t))}, - {"kv_logits_bytes", - align(element_map["kv_logits_ele"] * sizeof(uint16_t))}, - {"prefill_input_toks_bytes", - align(element_map["prefill_input_toks_ele"] * sizeof(int32_t))}, - {"prefill_input_pos_bytes", - align(element_map["prefill_input_pos_ele"] * sizeof(int32_t))}, - {"prefill_attention_mask_bytes", - align(element_map["prefill_attention_mask_ele"] * sizeof(uint16_t))}, - {"prefill_logits_bytes", - align(element_map["prefill_logits_ele"] * sizeof(uint16_t))}}; -} - -void SmartMaskIoMgr::IO::init_io_ptrs( - void* shared_buffer_ptr, - std::unordered_map& io_bytes_map) { - shared_buffer_base = shared_buffer_ptr; - std::byte* cur_ptr = reinterpret_cast(shared_buffer_base); - std::size_t cur_pos = 0; - size_t layered_head_count = num_layers_ * num_heads_; - - // Iterate map so that we don't need to care about which mode is used. - for (const auto& iter : io_bytes_map) { - std::string key = iter.first; - size_t size = iter.second; - if (key == "kv_input_toks_bytes") { - kv_input_toks = reinterpret_cast(cur_ptr); - } else if (key == "kv_input_pos_bytes") { - kv_input_pos = reinterpret_cast(cur_ptr); - } else if (key == "cache_in_bytes" || key == "cache_out_bytes") { - auto& k_cache_ref = (key == "cache_in_bytes") ? k_cache : k_cache_out; - auto& v_cache_ref = (key == "cache_in_bytes") ? 
v_cache : v_cache_out; - size_t single_head_size = size / layered_head_count; - k_cache_ref.reserve(num_layers_); - v_cache_ref.reserve(num_layers_); - for (int i = 0; i < num_layers_; ++i) { - k_cache_ref[i].reserve(num_heads_); - v_cache_ref[i].reserve(num_heads_); - for (int j = 0; j < num_heads_; ++j) { - k_cache_ref[i][j] = reinterpret_cast(cur_ptr); - io_pos_map[cur_ptr] = cur_pos; - cur_ptr += single_head_size; - cur_pos += single_head_size; - v_cache_ref[i][j] = reinterpret_cast(cur_ptr); - io_pos_map[cur_ptr] = cur_pos; - cur_ptr += single_head_size; - cur_pos += single_head_size; - } - } - continue; - } else if (key == "kv_attention_mask_bytes") { - kv_attention_mask = reinterpret_cast(cur_ptr); - } else if (key == "kv_logits_bytes") { - kv_logits = reinterpret_cast(cur_ptr); - } else if (key == "prefill_input_toks_bytes") { - prefill_input_toks = reinterpret_cast(cur_ptr); - } else if (key == "prefill_input_pos_bytes") { - prefill_input_pos = reinterpret_cast(cur_ptr); - } else if (key == "prefill_attention_mask_bytes") { - prefill_attention_mask = reinterpret_cast(cur_ptr); - } else if (key == "prefill_logits_bytes") { - prefill_logits = reinterpret_cast(cur_ptr); - } else { - ET_LOG(Error, "Unknown pointer type: %s", key.c_str()); - } - - io_pos_map[cur_ptr] = cur_pos; - cur_ptr += size; - cur_pos += size; - } -} - -void SmartMaskIoMgr::IO::add_custom_mem_info( - void* ptr, - size_t nbytes, - executorch::aten::ScalarType scalar_type, - executorch::runtime::TensorInfo& tensor_info) { - if (auto it = io_pos_map.find(static_cast(ptr)); - it == io_pos_map.end()) { - ET_LOG(Error, "Shared buffer pointer %p is not found", ptr); - } - size_t pos = io_pos_map[static_cast(ptr)]; - uint32_t rank = tensor_info.sizes().size(); - uint32_t shape[rank]; - CustomMemTensorInfo info = { - shared_buffer_base, ptr, pos, nbytes, shape, rank, scalar_type}; - QnnExecuTorchAddCustomMemTensorInfo(info); -} - -void SmartMaskIoMgr::init_io() { - std::unordered_map io_bytes_map = get_io_bytes(); - - switch (eval_mode_) { - case EvalMode::kKVCached: - io_bytes_map.erase("prefill_input_toks_bytes"); - io_bytes_map.erase("prefill_input_pos_bytes"); - io_bytes_map.erase("prefill_attention_mask_bytes"); - io_bytes_map.erase("prefill_logits_bytes"); - break; - case EvalMode::kHybrid: - break; - default: - break; - } - - size_t total_bytes = 0; - for (const auto& iter : io_bytes_map) { - size_t size = iter.second; - if (iter.first == "cache_in_bytes" || iter.first == "cache_out_bytes") { - size = iter.second * 2; - } - total_bytes += size; - } - void* shared_ptr = QnnExecuTorchAllocCustomMem( - total_bytes, MemoryAllocator::kDefaultAlignment); - - ET_CHECK_MSG( - shared_ptr, - "Allocate Rpc mem falied, bytes=%zu, alignment=%zu", - total_bytes, - MemoryAllocator::kDefaultAlignment); - IO* ptr = static_cast(data_ptr_.get()); - ptr->num_heads_ = num_heads_; - ptr->num_layers_ = num_layers_; - ptr->head_dim_ = head_dim_; - ptr->init_io_ptrs(shared_ptr, io_bytes_map); -} - -void SmartMaskIoMgr::reset_io( - const std::vector>& prefill_methods_meta, - const std::vector< - executorch::runtime::Result>& - kv_methods_meta) { - IO* ptr = static_cast(data_ptr_.get()); - int32_t prefill_attn_size = prefill_ar_len_ * context_len_; - int32_t kv_attn_size = kv_ar_len_ * context_len_; - std::fill( - ptr->prefill_attention_mask, - ptr->prefill_attention_mask + prefill_attn_size, - 0); - std::fill(ptr->kv_attention_mask, ptr->kv_attention_mask + kv_attn_size, 0); -} - -void SmartMaskIoMgr::prepare_kv_io( - const std::vector>& 
methods_meta) { - for (int i = 0; i < modules_.size(); ++i) { - ET_CHECK_MSG( - methods_meta[i].ok(), - "Failed to get method_meta 0x%x", - static_cast(methods_meta[i].error())); - } - - ET_CHECK_MSG(!(kv_forward_name_.empty()), "kv forward name is empty"); - IO* ptr = static_cast(data_ptr_.get()); - std::unordered_map io_bytes_map = get_io_bytes(); - - // [I]: input_tokens - Result kv_input_toks = methods_meta[0]->input_tensor_meta(0); - kv_input_toks_ = std::make_unique( - kv_input_toks->scalar_type(), - kv_input_toks->sizes().size(), - const_cast(kv_input_toks->sizes().data()), - ptr->kv_input_toks, - const_cast(kv_input_toks->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(kv_input_toks_.get()); - ptr->add_custom_mem_info( - ptr->kv_input_toks, - io_bytes_map["kv_input_toks_bytes"], - kv_input_toks->scalar_type(), - kv_input_toks.get()); - - // [I]: atten_mask - std::fill_n(ptr->kv_attention_mask, kv_ar_len_ * context_len_, 0); - Result kv_attention_mask = methods_meta[0]->input_tensor_meta(1); - kv_attention_mask_ = std::make_unique( - kv_attention_mask->scalar_type(), - kv_attention_mask->sizes().size(), - const_cast(kv_attention_mask->sizes().data()), - ptr->kv_attention_mask, - const_cast( - kv_attention_mask->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(kv_attention_mask_.get()); - ptr->add_custom_mem_info( - ptr->kv_attention_mask, - io_bytes_map["kv_attention_mask_bytes"], - kv_attention_mask->scalar_type(), - kv_attention_mask.get()); - - // [I]: input_pos - Result kv_input_pos = methods_meta[0]->input_tensor_meta(2); - kv_input_pos_ = std::make_unique( - kv_input_pos->scalar_type(), - kv_input_pos->sizes().size(), - const_cast(kv_input_pos->sizes().data()), - ptr->kv_input_pos, - const_cast(kv_input_pos->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(kv_input_pos_.get()); - ptr->add_custom_mem_info( - ptr->kv_input_pos, - io_bytes_map["kv_input_pos_bytes"], - kv_input_pos->scalar_type(), - kv_input_pos.get()); - - // [I] kv_cache - size_t layered_head_count = num_layers_ * num_heads_; - int index = 3; // bypass input_tokens, atten_mask, input_pos - for (int offset = 0, shard_index = 0; shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->input_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_in_[kv_forward_name_] - : v_cache_in_[kv_forward_name_]); - uint8_t* cache_ptr = (cache_group == 0) - ? 
ptr->k_cache[layer + offset][head] - : ptr->v_cache[layer + offset][head]; - - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - ptr->add_custom_mem_info( - cache_ptr, - io_bytes_map["cache_in_bytes"] / layered_head_count, - kv_cache->scalar_type(), - kv_cache.get()); - input_tensors_[kv_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } - - // [O]: logits - int logit_index = 0; - Result logits = - methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); - kv_logits_ = std::make_unique( - logits->scalar_type(), - logits->sizes().size(), - const_cast(logits->sizes().data()), - ptr->kv_logits, - const_cast(logits->dim_order().data())); - - ptr->add_custom_mem_info( - ptr->kv_logits, - io_bytes_map["kv_logits_bytes"], - logits->scalar_type(), - logits.get()); - output_tensors_[kv_forward_name_][modules_.size() - 1].push_back( - kv_logits_.get()); - - // [O] kv_cache - index = 1; - for (int offset = 0, shard_index = 0; shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->output_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_out_[kv_forward_name_] - : v_cache_out_[kv_forward_name_]); - uint8_t* cache_ptr = (cache_group == 0) - ? ptr->k_cache_out[layer + offset][head] - : ptr->v_cache_out[layer + offset][head]; - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - ptr->add_custom_mem_info( - cache_ptr, - io_bytes_map["cache_out_bytes"] / layered_head_count, - kv_cache->scalar_type(), - kv_cache.get()); - output_tensors_[kv_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } -} - -void SmartMaskIoMgr::update_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - IO* ptr = static_cast(data_ptr_.get()); - // update input_tok - *ptr->kv_input_toks = - use_int64_token_ ? 
cur_token : static_cast(cur_token); - // update position_ids - *ptr->kv_input_pos = static_cast(pos); - // update smart mask for previous cache - ptr->kv_attention_mask[pos] = 65535; - - // update v_cache - auto& v_cache_in = v_cache_in_[kv_forward_name_]; - auto& v_cache_out = v_cache_out_[kv_forward_name_]; - // update v_cache by single thread, this part is cpu cache sensitive - for (int i = 0; i < v_cache_in.size(); ++i) { - uint8_t* ptr_in = v_cache_in[i]->mutable_data() + pos * head_dim_; - const uint8_t* ptr_out = v_cache_out[i]->data(); - memcpy(ptr_in, ptr_out, head_dim_ * sizeof(uint8_t)); - } - - auto& k_cache_in = k_cache_in_[kv_forward_name_]; - auto& k_cache_out = k_cache_out_[kv_forward_name_]; - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data() + pos; - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = 0; j < head_dim_; - ++j, offset += kv_cache_len_) { - ptr_in[offset] = ptr_out[j]; - } - } -} - -void SmartMaskIoMgr::prepare_prefill_io( - const std::vector>& methods_meta) { - for (int i = 0; i < modules_.size(); ++i) { - ET_CHECK_MSG( - methods_meta[i].ok(), - "Failed to get method_meta 0x%x", - static_cast(methods_meta[i].error())); - } - - ET_CHECK_MSG( - !(prefill_forward_name_.empty()), "prefill forward name is empty"); - - IO* ptr = static_cast(data_ptr_.get()); - std::unordered_map io_bytes_map = get_io_bytes(); - - // [I]: pre_input_tokens - Result prefill_input_toks = methods_meta[0]->input_tensor_meta(0); - prefill_input_toks_ = std::make_unique( - prefill_input_toks->scalar_type(), - prefill_input_toks->sizes().size(), - const_cast(prefill_input_toks->sizes().data()), - ptr->prefill_input_toks, - const_cast( - prefill_input_toks->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back(prefill_input_toks_.get()); - ptr->add_custom_mem_info( - ptr->prefill_input_toks, - io_bytes_map["prefill_input_toks_bytes"], - executorch::aten::ScalarType::Int, - prefill_input_toks.get()); - - // [I]: prefill_attention_mask - for (int i = 0; i < prefill_ar_len_; ++i) { - for (int j = 0, - offset = i * context_len_ + (context_len_ - prefill_ar_len_); - j < prefill_ar_len_; - ++j) { - if (i < j) { - ptr->prefill_attention_mask[j + offset] = 0; - } else { - ptr->prefill_attention_mask[j + offset] = 65535; - } - } - } - Result prefill_attention_mask = - methods_meta[0]->input_tensor_meta(1); - prefill_attention_mask_ = std::make_unique( - prefill_attention_mask->scalar_type(), - prefill_attention_mask->sizes().size(), - const_cast( - prefill_attention_mask->sizes().data()), - ptr->prefill_attention_mask, - const_cast( - prefill_attention_mask->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back( - prefill_attention_mask_.get()); - ptr->add_custom_mem_info( - ptr->prefill_attention_mask, - io_bytes_map["prefill_attention_mask_bytes"], - executorch::aten::ScalarType::Bits16, - prefill_attention_mask.get()); - - if (!is_bert()) { - // [I]: prefill_input_pos - Result prefill_input_pos = - methods_meta[0]->input_tensor_meta(2); - prefill_input_pos_ = std::make_unique( - prefill_input_pos->scalar_type(), - prefill_input_pos->sizes().size(), - const_cast(prefill_input_pos->sizes().data()), - ptr->prefill_input_pos, - const_cast( - prefill_input_pos->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back( - prefill_input_pos_.get()); - ptr->add_custom_mem_info( - ptr->prefill_input_pos, - io_bytes_map["prefill_input_pos_bytes"], - 
prefill_input_pos->scalar_type(), - prefill_input_pos.get()); - - // [I] kv_cache - size_t layered_head_count = num_layers_ * num_heads_; - int index = 3; // bypass input_tokens, atten_mask, input_pos - for (int offset = 0, shard_index = 0; shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->input_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_in_[prefill_forward_name_] - : v_cache_in_[prefill_forward_name_]); - uint8_t* cache_ptr = (cache_group == 0) - ? ptr->k_cache[layer + offset][head] - : ptr->v_cache[layer + offset][head]; - - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - ptr->add_custom_mem_info( - cache_ptr, - io_bytes_map["cache_in_bytes"] / layered_head_count, - kv_cache->scalar_type(), - kv_cache.get()); - input_tensors_[prefill_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } - } - - // [O]: logits - int logit_index = 0; - Result logits = methods_meta[0]->output_tensor_meta(logit_index); - prefill_logits_ = std::make_unique( - logits->scalar_type(), - logits->sizes().size(), - const_cast(logits->sizes().data()), - ptr->prefill_logits, - const_cast(logits->dim_order().data())); - output_tensors_[prefill_forward_name_][modules_.size() - 1].push_back( - prefill_logits_.get()); - ptr->add_custom_mem_info( - ptr->prefill_logits, - io_bytes_map["prefill_logits_bytes"], - executorch::aten::ScalarType::Bits16, - logits.get()); - - // [O] kv_cache - int index = 1; - size_t layered_head_count = num_layers_ * num_heads_; - for (int offset = 0, shard_index = 0; shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->output_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_out_[prefill_forward_name_] - : v_cache_out_[prefill_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? ptr->k_cache_out[layer + offset][head] - : ptr->v_cache_out[layer + offset][head]; - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - ptr->add_custom_mem_info( - cache_ptr, - io_bytes_map["cache_out_bytes"] / layered_head_count, - executorch::aten::ScalarType::Byte, - kv_cache.get()); - output_tensors_[prefill_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } -} - -void SmartMaskIoMgr::update_prefill_to_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - IO* ptr = static_cast(data_ptr_.get()); - - *ptr->kv_input_toks = - use_int64_token_ ? 
cur_token : static_cast(cur_token); - *ptr->kv_input_pos = static_cast(pos); - // pos means the cur_token pos - for (int i = 0; i < pos; i++) { - ptr->kv_attention_mask[i] = 65535; - } - - if (is_bert()) { - // update v_cache - auto& v_cache_in = v_cache_in_[kv_forward_name_]; - auto& v_cache_out = v_cache_out_[prefill_forward_name_]; - // update v_cache by single thread, this part is cpu cache sensitive - size_t copied_size = kv_cache_len_ * head_dim_ * sizeof(uint8_t); - for (int i = 0; i < v_cache_in.size(); ++i) { - uint8_t* ptr_in = v_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = v_cache_out[i]->data(); - memcpy(ptr_in, ptr_out, copied_size); - } - - auto& k_cache_in = k_cache_in_[kv_forward_name_]; - auto& k_cache_out = k_cache_out_[prefill_forward_name_]; - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = 0; j < head_dim_; - ++j, offset += kv_cache_len_) { - for (size_t k = 0, k_stride = j * prefill_ar_len_; k < pos; k++) { - ptr_in[offset + k] = ptr_out[k_stride + k]; - } - } - } - } else { - // Update K is enough, copy from last to prevent from overwriting values - size_t copied_size = pos * sizeof(uint8_t); - for (int l = 0; l < num_layers_; l++) { - for (int h = 0; h < num_heads_; h++) { - uint8_t* k_cache = ptr->k_cache[l][h]; - for (int hd = head_dim_ - 1; hd > -1; hd--) { - memcpy( - k_cache + (kv_cache_len_ * hd), - k_cache + (prefill_cache_len_ * hd), - copied_size); - } - } - } - } -} - -void SmartMaskIoMgr::update_prefill_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - (void)output_tensors; - - if (!is_bert()) { - // update v_cache - auto& v_cache_in = v_cache_in_[prefill_forward_name_]; - auto& v_cache_out = v_cache_out_[prefill_forward_name_]; - // update v_cache by single thread, this part is cpu cache sensitive - size_t copied_size = prefill_ar_len_ * head_dim_ * sizeof(uint8_t); - for (int i = 0; i < v_cache_in.size(); ++i) { - uint8_t* ptr_in = - v_cache_in[i]->mutable_data() + pos * head_dim_; - const uint8_t* ptr_out = v_cache_out[i]->data(); - memcpy(ptr_in, ptr_out, copied_size); - } - - auto& k_cache_in = k_cache_in_[prefill_forward_name_]; - auto& k_cache_out = k_cache_out_[prefill_forward_name_]; - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = pos; j < head_dim_; - ++j, offset += prefill_cache_len_) { - for (size_t k = 0, k_stride = j * prefill_ar_len_; k < prefill_ar_len_; - k++) { - ptr_in[offset + k] = ptr_out[k_stride + k]; - } - } - } - } -} - -void SmartMaskIoMgr::fill_prefill_toks( - int64_t start_pos, - std::vector& prompt_tokens) { - IO* ptr = static_cast(get_mutable_ptr()); - for (int i = 0; i < prefill_ar_len_; i++) { - if (!is_bert()) { - ptr->prefill_input_pos[i] = start_pos + i; - } - - if (start_pos + i < prompt_tokens.size()) { - // Support CPU 4-bit embedding, which requires int64 input. - // However, for QNN embedding, only int32 input is needed. - // Therefore, we need to cast to the correct type to write the data. 
- if (use_int64_token_) { - ptr->prefill_input_toks[i] = prompt_tokens[start_pos + i]; - } else { - int32_t* prefill_input_toks_ptr = - reinterpret_cast(ptr->prefill_input_toks); - prefill_input_toks_ptr[i] = - static_cast(prompt_tokens[start_pos + i]); - } - } - if (start_pos >= prefill_ar_len_) { - for (int j = 0, offset = i * context_len_ + (start_pos - prefill_ar_len_); - j < prefill_ar_len_; - ++j) { - ptr->prefill_attention_mask[offset + j] = 65535; - } - } - } -} - -void SmartMaskIoMgr::fill_kv_tok_mask(int64_t pos, int64_t cur_token) { - IO* ptr = static_cast(get_mutable_ptr()); - *ptr->kv_input_toks = - use_int64_token_ ? cur_token : static_cast(cur_token); - ptr->kv_attention_mask[kv_cache_len_] = 65535; -} - -} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.h b/examples/qualcomm/oss_scripts/llama/runner/io_manager.h deleted file mode 100644 index 0f10eef8ddc..00000000000 --- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.h +++ /dev/null @@ -1,340 +0,0 @@ -/* - * Copyright (c) Qualcomm Innovation Center, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace example { - -enum EvalMode { - kKVCached = 0, - kHybrid, - kUnsupported, -}; -class IoMgrBase { - public: - IoMgrBase( - std::vector>& modules); - virtual ~IoMgrBase(); - virtual void init_io() = 0; - virtual void reset_io( - const std::vector>& prefill_methods_meta, - const std::vector< - executorch::runtime::Result>& - kv_methods_meta) = 0; - virtual void prepare_prefill_io( - const std::vector< - executorch::runtime::Result>& - methods_meta) = 0; - virtual void prepare_kv_io( - const std::vector< - executorch::runtime::Result>& - methods_meta) = 0; - virtual void fill_prefill_toks( - int64_t start_pos, - std::vector& prompt_tokens) = 0; - virtual void fill_kv_tok_mask(int64_t pos, int64_t cur_token) = 0; - virtual void update_prefill_to_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) = 0; - virtual void update_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) = 0; - virtual void update_prefill_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) = 0; - void* get_mutable_ptr(); - std::vector get_input_tensors( - int shard_index, - const std::string& method_name); - std::vector get_output_tensors( - int shard_index, - const std::string& method_name); - - protected: - std::unique_ptr data_ptr_; - std::unordered_map< - std::string, - std::vector>> - input_tensors_; - std::unordered_map< - std::string, - std::vector>> - output_tensors_; - std::vector> modules_; -}; - -class ShiftPointerIoMgr : public IoMgrBase { - public: - ShiftPointerIoMgr( - std::vector>& modules, - int32_t context_len, - int32_t prefill_ar_len, - int32_t prefill_cache_len, - int32_t kv_ar_len, - int32_t kv_cache_len, - int32_t vocab_size, - int32_t num_layers, - int32_t head_dim, - int32_t num_heads, - EvalMode eval_mode, - const std::string& prefill_forward_name, - const std::string& kv_forward_name, - const bool use_int64_token); - - void init_io() override; - void reset_io( - const std::vector>& prefill_methods_meta, - const std::vector< - executorch::runtime::Result>& - kv_methods_meta) override; - void prepare_prefill_io( - const std::vector< - executorch::runtime::Result>& - 
methods_meta) override; - void prepare_kv_io( - const std::vector< - executorch::runtime::Result>& - methods_meta) override; - void fill_prefill_toks( - int64_t start_pos, - std::vector& prompt_tokens) override; - void fill_kv_tok_mask(int64_t pos, int64_t cur_token) override; - void update_prefill_to_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) - override; - void update_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) - override; - void update_prefill_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) - override; - struct IO { - int64_t kv_input_toks; - int32_t kv_input_pos; - std::vector>> k_cache; - std::vector> v_cache; - std::vector> k_cache_out; - std::vector kv_attention_mask; - std::vector kv_logits; - std::vector prefill_input_toks; - std::vector prefill_input_pos; - std::vector prefill_attention_mask; - std::vector prefill_logits; - }; - - private: - // If the cache length is zero, it indicates a BERT model, which does not use - // position ids or KV cache inputs. - bool is_bert() const { - return prefill_cache_len_ == 0; - } - std::unique_ptr kv_input_toks_; - std::unique_ptr kv_input_pos_; - std::unique_ptr kv_attention_mask_; - std::unique_ptr prefill_input_toks_; - std::unique_ptr prefill_input_pos_; - std::unique_ptr prefill_attention_mask_; - std::unique_ptr prefill_logits_; - std::unordered_map< - std::string, - std::vector>> - k_cache_in_; - std::unordered_map< - std::string, - std::vector>> - v_cache_in_; - std::unordered_map< - std::string, - std::vector>> - k_cache_out_; - std::unordered_map< - std::string, - std::vector>> - v_cache_out_; - std::unique_ptr kv_logits_; - std::vector shard_layers_; - int32_t context_len_{0}; - int32_t kv_ar_len_{0}; - int32_t kv_cache_len_{0}; - int32_t prefill_ar_len_{0}; - int32_t prefill_cache_len_{0}; - int32_t vocab_size_; - int32_t num_layers_; - int32_t head_dim_; - int32_t num_heads_; - EvalMode eval_mode_; - std::string prefill_forward_name_; - std::string kv_forward_name_; - const bool use_int64_token_{false}; -}; - -class SmartMaskIoMgr : public IoMgrBase { - public: - SmartMaskIoMgr( - std::vector>& modules, - int32_t context_len, - int32_t prefill_ar_len, - int32_t prefill_cache_len, - int32_t kv_ar_len, - int32_t kv_cache_len, - int32_t vocab_size, - int32_t num_layers, - int32_t head_dim, - int32_t num_heads, - EvalMode eval_mode, - const std::string& prefill_forward_name, - const std::string& kv_forward_name, - const bool use_int64_token); - - void init_io() override; - void reset_io( - const std::vector>& prefill_methods_meta, - const std::vector< - executorch::runtime::Result>& - kv_methods_meta) override; - void prepare_prefill_io( - const std::vector< - executorch::runtime::Result>& - methods_meta) override; - void prepare_kv_io( - const std::vector< - executorch::runtime::Result>& - methods_meta) override; - void fill_prefill_toks( - int64_t start_pos, - std::vector& prompt_tokens) override; - void fill_kv_tok_mask(int64_t pos, int64_t cur_token) override; - void update_prefill_to_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) - override; - void update_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) - override; - void update_prefill_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) - override; - - std::unordered_map get_io_elements(); - std::unordered_map get_io_bytes(); - - struct IO { - void* shared_buffer_base; - int64_t* kv_input_toks; - int32_t* kv_input_pos; - 
// layer -> head -> head_dim * seq_len - std::vector> k_cache; - std::vector> v_cache; - // layer -> head -> head_dim - std::vector> k_cache_out; - std::vector> v_cache_out; - // kv_ar_len_ * context_len_ - uint16_t* kv_attention_mask; - // kv_ar_len_ * vocab_size - uint16_t* kv_logits; - // prefill_ar_len_ - int64_t* prefill_input_toks; - int32_t* prefill_input_pos; - // prefill_ar_len_ * context_len_ - uint16_t* prefill_attention_mask; - // vocab_size * prefill_ar_len_ - uint16_t* prefill_logits; - - size_t num_layers_; - size_t num_heads_; - size_t head_dim_; - std::unordered_map io_pos_map; - ~IO() { - QnnExecuTorchFreeCustomMem(shared_buffer_base); - } - void init_io_ptrs( - void* shared_buffer_ptr, - std::unordered_map& io_bytes_map); - void add_custom_mem_info( - void* ptr, - size_t nbytes, - executorch::aten::ScalarType scalar_type, - executorch::runtime::TensorInfo& tensor_info); - }; - - private: - // If the cache length is zero, it indicates a BERT model, which does not use - // position ids or KV cache inputs. - bool is_bert() const { - return prefill_cache_len_ == 0; - } - std::unique_ptr kv_input_toks_; - std::unique_ptr kv_input_pos_; - std::unique_ptr kv_attention_mask_; - std::unique_ptr prefill_input_toks_; - std::unique_ptr prefill_input_pos_; - std::unique_ptr prefill_attention_mask_; - std::unique_ptr prefill_logits_; - std::unordered_map< - std::string, - std::vector>> - k_cache_in_; - std::unordered_map< - std::string, - std::vector>> - v_cache_in_; - std::unordered_map< - std::string, - std::vector>> - k_cache_out_; - std::unordered_map< - std::string, - std::vector>> - v_cache_out_; - std::unique_ptr kv_logits_; - std::vector shard_layers_; - int32_t context_len_{0}; - int32_t kv_ar_len_{0}; - int32_t kv_cache_len_{0}; - int32_t prefill_ar_len_{0}; - int32_t prefill_cache_len_{0}; - int32_t vocab_size_; - int32_t num_layers_; - int32_t head_dim_; - int32_t num_heads_; - EvalMode eval_mode_; - std::string prefill_forward_name_; - std::string kv_forward_name_; - const bool use_int64_token_{false}; -}; - -} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp new file mode 100644 index 00000000000..ca155204dee --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +namespace example { +KVManager::KVManager(KVManagerMode kv_updater, Metadata metadata) + : kv_updater_(kv_updater), metadata_(metadata) { + k_cache_.resize( + metadata_.num_layers, std::vector(metadata_.num_heads)); + v_cache_.resize( + metadata_.num_layers, std::vector(metadata_.num_heads)); + + // Calculate cache size + switch (kv_updater_) { + case KVManagerMode::SMART_MASK: { + size_t cache_in_bytes = metadata_.num_layers * metadata_.num_heads * + metadata_.head_dim * metadata_.max_cache_len * sizeof(uint8_t); + size_t cache_out_bytes = metadata_.num_layers * metadata_.num_heads * + metadata_.head_dim * metadata_.max_ar_len * sizeof(uint8_t); + total_cache_size_ = 2 * (cache_in_bytes + cache_out_bytes); + break; + } + case KVManagerMode::SHIFT_POINTER: { + size_t k_cache_in_bytes = metadata_.num_layers * metadata_.num_heads * + (metadata_.head_dim + 1) * metadata_.max_cache_len * sizeof(uint8_t); + size_t k_cache_out_bytes = metadata_.num_layers * metadata_.num_heads * + metadata_.head_dim * metadata_.max_ar_len * sizeof(uint8_t); + // Use the same memory for input and output of value cache in shift + // pointer mode. Note that using context length to prevent exceeding the + // range when the AR-N model updates the last block in shift pointer + // mode. + size_t v_cache_bytes = metadata_.num_layers * (metadata_.num_heads + 1) * + metadata_.head_dim * metadata_.context_len * sizeof(uint8_t); + total_cache_size_ = k_cache_in_bytes + k_cache_out_bytes + v_cache_bytes; + break; + } + default: + break; + } +}; + +void KVManager::init_attention_mask( + uint16_t* attention_mask, + const std::vector& attention_map, + int32_t ar_len, + int32_t n_past) { + ET_CHECK_MSG( + attention_map.size() == ar_len, + "The size of attention_map (%zu) doesn't match with ar_len (%d)", + attention_map.size(), + ar_len); + uint16_t neg_val = 0; + uint16_t pos_val = 65535; + // Clear the attention mask + std::fill_n(attention_mask, ar_len * metadata_.context_len, neg_val); + + // SMART_MASK requires special handling of attention mask + switch (kv_updater_) { + case KVManagerMode::SMART_MASK: { + uint16_t* past_ptr = attention_mask; + uint16_t* new_ptr = attention_mask + (metadata_.context_len - ar_len); + // All inputs will necessarily attend to n_past and itself + for (int i = 0; i < ar_len; i++) { + // Iterate across ar_len + if (attention_map[i] < 0) { + // If negative, attend to only past tokens + std::fill_n(past_ptr, n_past, pos_val); + } else { + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; + uint16_t* parent_ptr = attention_mask + pidx * metadata_.context_len; + std::memcpy( + past_ptr, parent_ptr, metadata_.context_len * sizeof(uint16_t)); + } + // Attend to itself + new_ptr[i] = pos_val; + past_ptr += metadata_.context_len; + new_ptr += metadata_.context_len; + } + break; + } + case KVManagerMode::SHIFT_POINTER: { + // Only fill in ar_len. 
Rest will be padding + const size_t attn_row_start = metadata_.context_len - n_past - ar_len; + for (int i = 0; i < ar_len; i++) { + uint16_t* cur_ptr = + attention_mask + i * metadata_.context_len + attn_row_start; + // Attend to itself + cur_ptr[n_past + i] = pos_val; + if (attention_map[i] < 0) { + // If negative, attend to only past tokens + std::fill_n(cur_ptr, n_past, pos_val); + } else { + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; + uint16_t* parent_ptr = + attention_mask + pidx * metadata_.context_len + attn_row_start; + std::memcpy( + cur_ptr, parent_ptr, (n_past + pidx + 1) * sizeof(uint16_t)); + } + } + break; + } + default: + break; + } +} + +void KVManager::update_attention_mask( + uint16_t* attention_mask, + int32_t ar_len, + int32_t n_past, + int32_t n_update) { + uint16_t pos_val = 65535; + uint16_t* cur_ptr = attention_mask; + if (kv_updater_ == KVManagerMode::SMART_MASK) + cur_ptr += n_past; + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) + cur_ptr += metadata_.context_len - n_past - ar_len - n_update; + + for (int i = 0; i < ar_len; i++) { + std::fill_n(cur_ptr, n_update, pos_val); + cur_ptr += metadata_.context_len; + } +} + +void KVManager::init_cache(IMemAlloc* buffer_manager, int32_t ar_len) { + cur_ar_len_ = ar_len; + const size_t max_in_cache_block_in_bytes = + metadata_.max_cache_len * sizeof(uint8_t); + const size_t max_out_cache_block_in_bytes = + metadata_.max_ar_len * sizeof(uint8_t); + + switch (kv_updater_) { + case KVManagerMode::SMART_MASK: { + const size_t cache_in_bytes = + metadata_.head_dim * max_in_cache_block_in_bytes; + const size_t cache_out_bytes = + metadata_.head_dim * max_out_cache_block_in_bytes; + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head) { + // Allocate buffer for key cache and value cache + uint8_t* single_layer_k_cache_in = reinterpret_cast( + buffer_manager->allocate(cache_in_bytes)); + uint8_t* single_layer_k_cache_out = reinterpret_cast( + buffer_manager->allocate(cache_out_bytes)); + uint8_t* single_layer_v_cache_in = reinterpret_cast( + buffer_manager->allocate(cache_in_bytes)); + uint8_t* single_layer_v_cache_out = reinterpret_cast( + buffer_manager->allocate(cache_out_bytes)); + + k_cache_[layer][head].buffer = single_layer_k_cache_in; + k_cache_[layer][head].output_buffer = single_layer_k_cache_out; + v_cache_[layer][head].buffer = single_layer_v_cache_in; + v_cache_[layer][head].output_buffer = single_layer_v_cache_out; + } + } + break; + } + case KVManagerMode::SHIFT_POINTER: { + const size_t k_cache_in_size_in_bytes = metadata_.num_heads * + (metadata_.head_dim + 1) * max_in_cache_block_in_bytes; + const size_t k_cache_out_size_in_bytes = metadata_.num_heads * + metadata_.head_dim * max_out_cache_block_in_bytes; + const size_t v_cache_size_in_bytes = (metadata_.num_heads + 1) * + metadata_.head_dim * metadata_.context_len * sizeof(uint8_t); + const int32_t single_head_size_in = + metadata_.head_dim * metadata_.max_cache_len; + const int32_t single_head_size_out = + metadata_.head_dim * metadata_.max_ar_len; + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + // Allocate buffer for key cache and value cache + uint8_t* single_layer_k_cache_in = reinterpret_cast( + buffer_manager->allocate(k_cache_in_size_in_bytes)); + uint8_t* single_layer_k_cache_out = reinterpret_cast( + buffer_manager->allocate(k_cache_out_size_in_bytes)); + // Note that using 
context length to prevent exceeding the range when + // the AR-N model updates the last block in shift pointer mode. + uint8_t* single_layer_v_cache = reinterpret_cast( + buffer_manager->allocate(v_cache_size_in_bytes)); + for (int head = 0; head < metadata_.num_heads; ++head) { + k_cache_[layer][head].buffer = single_layer_k_cache_in + + head * (metadata_.head_dim + 1) * metadata_.max_cache_len; + k_cache_[layer][head].output_buffer = + single_layer_k_cache_out + head * single_head_size_out; + // v_cache: + // |cache_gap|h1_v_in_ptr|cache_len|h1_v_out_ptr|cache_gap|h2_v_in_ptr|cache_len|h2_v_out_ptr|...| + const int32_t cache_gap = (cur_ar_len_ == metadata_.context_len) + ? 0 + : metadata_.max_cache_len - (metadata_.context_len - cur_ar_len_); + v_cache_[layer][head].buffer = single_layer_v_cache + + head * single_head_size_in + cache_gap * metadata_.head_dim; + v_cache_[layer][head].output_buffer = + single_layer_v_cache + (head + 1) * single_head_size_in; + } + } + break; + } + default: + break; + } +} + +void KVManager::rearrange_cache(int32_t ar_len_dst) { + // Don't need to rearrange if cur_ar_len_ is equal to target ar_len + if (cur_ar_len_ == ar_len_dst) + return; + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head) { + rearrange_key(k_cache_[layer][head], ar_len_dst); + rearrange_value(v_cache_[layer][head], ar_len_dst); + } + } + // rearrange done. + cur_ar_len_ = ar_len_dst; +} + +void KVManager::rearrange_key(KVCache& k_cache, int32_t ar_len_dst) { + // The output of key cache doesn't need to rearrange for both of SMART_MASK + // and SHIFT_POINTER + const int32_t src_cache_num = (cur_ar_len_ == metadata_.context_len) + ? metadata_.context_len + : metadata_.context_len - cur_ar_len_; + const int32_t dst_cache_num = metadata_.context_len - ar_len_dst; + uint8_t* k_cache_in_read_ptr = k_cache.buffer; + uint8_t* k_cache_in_write_ptr = k_cache.buffer; + + if (src_cache_num > dst_cache_num) { + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) { + // Left padded KV$ + k_cache_in_read_ptr += src_cache_num; + k_cache_in_write_ptr += dst_cache_num; + } + // copy from first dimension + for (int i = 0; i < metadata_.head_dim; i++) { + std::memmove(k_cache_in_write_ptr, k_cache_in_read_ptr, dst_cache_num); + k_cache_in_read_ptr += src_cache_num; + k_cache_in_write_ptr += dst_cache_num; + } + } else { + k_cache_in_read_ptr += (metadata_.head_dim - 1) * src_cache_num; + k_cache_in_write_ptr += (metadata_.head_dim - 1) * dst_cache_num; + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) { + k_cache_in_read_ptr += src_cache_num; + k_cache_in_write_ptr += dst_cache_num; + } + // copy from last dimension + for (int i = 0; i < metadata_.head_dim; i++) { + std::memmove(k_cache_in_write_ptr, k_cache_in_read_ptr, src_cache_num); + k_cache_in_read_ptr -= src_cache_num; + k_cache_in_write_ptr -= dst_cache_num; + } + } +} + +void KVManager::rearrange_value(KVCache& v_cache, int32_t ar_len_dst) { + // The input and output of the value cache don't need to rearrange for both + // SMART_MASK and SHIFT_POINTER. However, the input pointer of the value cache + // needs to be reset by ar_len_dst in SHIFT_POINTER mode. The output pointer + // of the value cache remains unchanged regardless of ar_len. + const int32_t ar_gap = (cur_ar_len_ == metadata_.context_len) + ? 
ar_len_dst + : ar_len_dst - cur_ar_len_; + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) { + v_cache.buffer = v_cache.buffer + ar_gap * metadata_.head_dim; + } +} + +bool KVManager::update_cache_tensor( + std::vector>>& + k_cache_in, + std::vector>>& + k_cache_out, + std::vector>>& + v_cache_in, + std::vector>>& + v_cache_out, + int32_t ar_len, + int32_t n_past) { + ET_CHECK_MSG( + cur_ar_len_ == ar_len, + "Current AR length (%d) is not matched with target AR length (%d). Please rearrange cache first.", + cur_ar_len_, + ar_len); + bool updated = false; + // Data pointer in the tensors need to update only for SHIFT_POINTER mode + // The BERT model does not update the cache tensor because it does not use KV + // cache inputs. + if (kv_updater_ == KVManagerMode::SHIFT_POINTER && + metadata_.context_len != cur_ar_len_) { + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head) { + k_cache_in[layer][head]->set_data( + k_cache_[layer][head].buffer + n_past); + v_cache_in[layer][head]->set_data( + v_cache_[layer][head].buffer + n_past * metadata_.head_dim); + v_cache_out[layer][head]->set_data( + v_cache_[layer][head].output_buffer + n_past * metadata_.head_dim); + } + } + updated = true; + } + return updated; +} + +void KVManager::update_cache(int32_t ar_len, int32_t n_past, int32_t n_update) { + ET_CHECK_MSG( + cur_ar_len_ == ar_len, + "Current AR length (%d) is not matched with target AR length (%d). Please rearrange cache first.", + cur_ar_len_, + ar_len); + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head) { + update_key(k_cache_[layer][head], n_past, n_update); + update_value(v_cache_[layer][head], n_past, n_update); + } + } +} + +void KVManager::update_key(KVCache& k_cache, int32_t n_past, int32_t n_update) { + uint8_t* write_ptr = k_cache.buffer; + uint8_t* read_ptr = k_cache.output_buffer; + const int32_t copy_size = n_update * sizeof(uint8_t); + const int32_t iter_size = (cur_ar_len_ == metadata_.context_len) + ? metadata_.context_len + : metadata_.context_len - cur_ar_len_; + const int32_t out_size = cur_ar_len_; + const int32_t past_size = n_past; + const int32_t n_iter = metadata_.head_dim; + + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) + write_ptr += iter_size + past_size; + if (kv_updater_ == KVManagerMode::SMART_MASK) + write_ptr += past_size; + + for (int i = 0; i < n_iter; ++i) { + std::memcpy(write_ptr, read_ptr, copy_size); + write_ptr += iter_size; + read_ptr += out_size; + } +} + +void KVManager::update_value( + KVCache& v_cache, + int32_t n_past, + int32_t n_update) { + // Value cache doesn't need to copy for SHIFT_POINTER mode + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) + return; + + uint8_t* write_ptr = v_cache.buffer; + uint8_t* read_ptr = v_cache.output_buffer; + const int32_t copy_size = n_update * metadata_.head_dim * sizeof(uint8_t); + const int32_t past_size = n_past * metadata_.head_dim; + + if (kv_updater_ == KVManagerMode::SMART_MASK) + write_ptr += past_size; + + std::memcpy(write_ptr, read_ptr, copy_size); +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h new file mode 100644 index 00000000000..1a3beb35f97 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include +#include +#include +#include + +namespace example { + +// Structure to hold key-value cache buffers +struct KVCache { + uint8_t* buffer; + uint8_t* output_buffer; +}; + +// Enumeration for key-value manager modes +enum KVManagerMode { SMART_MASK = 0x0, SHIFT_POINTER = 0x1 }; +/** + * @class KVManager + * @brief Class for kv cache update, rearrangement, and buffer allocatation. + */ +class KVManager { + public: + struct Metadata { + int32_t context_len; + int64_t head_dim; + int32_t max_ar_len; + int32_t max_cache_len; + int64_t num_heads; + int64_t num_layers; + }; + KVManager(KVManagerMode kv_updater, Metadata metadata); + + /** + * @brief Allocate buffer for KV cache and set the cur_ar_len_. + * @param buffer_manager Pointer to IMemAlloc instance which depends on + * kv_updater. + * @param ar_len Length of input tokens. + */ + void init_cache(IMemAlloc* buffer_manager, int32_t ar_len); + + /** + * @brief Switch key and value cache from AR-cur to AR-dst. + * @param ar_len_dst Target length of input tokens. + */ + void rearrange_cache(int32_t ar_len_dst); + + /** + * @brief Initialize attention mask based on kv manager mode, and attention + * map. + * For example, + * ar_len = 4, CL = 6, n_past = 0, + * attention map: {-1, 0, 1, 2} and SMART_MASK. + * Attention_mask will be: + * [ 0 0 65535 0 0 0 ] + * [ 0 0 65535 65535 0 0 ] + * [ 0 0 65535 65535 65535 0 ] + * [ 0 0 65535 65535 65535 65535 ] + * @param attention_mask Pointer to the attention mask array to be + * initialized. + * @param attention_map Vector containing the attention map values. The shape + * of attention map should be [ar_len]. + * @param ar_len Length of input tokens. + * @param n_past Number of past elements in the cache. + */ + void init_attention_mask( + uint16_t* attention_mask, + const std::vector& attention_map, + int32_t ar_len, + int32_t n_past); + + /** + * @brief Update attention mask based on kv manager mode, and n_update. + * @param attention_mask Pointer to the attention mask array to be + * initialized. + * @param ar_len Length of input tokens. + * @param n_past Number of past elements in the cache. + * @param n_update Number of elements to be updated. + */ + void update_attention_mask( + uint16_t* attention_mask, + int32_t ar_len, + int32_t n_past, + int32_t n_update); + + /** + * @brief Reset the data pointer of the I/O cache tensor based on number of + * past cache, kv manager mode, current ar length and KV cache data pointer + * for SHIFT_POINTER mode. + * @param k_cache_in Reference to the input key cache TensorImpl vector. + * @param k_cache_out Reference to the output key cache TensorImpl vector. + * @param v_cache_in Reference to the input value cache TensorImpl vector. + * @param v_cache_out Reference to the output value cache TensorImpl vector. + * @param ar_len Length of input tokens. + * @param n_past Number of past elements in the cache. + * @return Returns true if the data pointer is updated; otherwise, returns + * false. + */ + bool update_cache_tensor( + std::vector>>& + k_cache_in, + std::vector>>& + k_cache_out, + std::vector>>& + v_cache_in, + std::vector>>& + v_cache_out, + int32_t ar_len, + int32_t n_past); + + /** + * @brief Based on cur_ar_len_ to update cache + * @param ar_len Length of input tokens. + * @param n_past Number of past elements in the cache. + * @param n_update Number of elements to be updated. 
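 * Illustration (hypothetical call sequence): after the prompt processor
 * consumes a 32-token chunk with ar_len = 32 into an empty cache it would
 * call update_cache(32, 0, 32); a token generator running with ar_len = 1
 * would then typically call update_cache(1, n_past, 1) after every decoded
 * token.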
+ */ + void update_cache(int32_t ar_len, int32_t n_past, int32_t n_update); + + const std::vector>& get_k_cache_() const { + return k_cache_; + } + const std::vector>& get_v_cache_() const { + return v_cache_; + } + + inline const size_t total_cache_size_in_bytes() const { + return total_cache_size_; + } + + private: + // Helper functions to rearrange and update key and value caches + void rearrange_key(KVCache& k_cache, int32_t ar_len_dst); + void rearrange_value(KVCache& v_cache, int32_t ar_len_dst); + void update_key(KVCache& k_cache, int32_t n_past, int32_t n_update); + void update_value(KVCache& v_cache, int32_t n_past, int32_t n_update); + KVManagerMode kv_updater_; + + // metadata + Metadata metadata_; + size_t total_cache_size_; + int32_t cur_ar_len_; + // Store start pointer of k and v cache for input and output + // input: layer -> head -> head_dim * max_cache_len + // output: layer -> head -> head_dim * max_ar_len + std::vector> k_cache_; + std::vector> v_cache_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp new file mode 100644 index 00000000000..37dce8f06c4 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp @@ -0,0 +1,273 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +using executorch::aten::TensorImpl; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +namespace example { +PromptProcessor::PromptProcessor( + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + Metadata metadata) + : decoder_runner_(decoder_runner), + kv_manager_(kv_manager), + method_name_(method_name), + metadata_(metadata) { + k_cache_in_.resize(metadata_.num_layers); + v_cache_in_.resize(metadata_.num_layers); + k_cache_out_.resize(metadata_.num_layers); + v_cache_out_.resize(metadata_.num_layers); + // Calculate I/O size + input_toks_.size = metadata_.ar_len * sizeof(int64_t); + if (is_bert()) + input_pos_.size = 0; + else + input_pos_.size = metadata_.ar_len * sizeof(int32_t); + attention_mask_.size = + metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); + logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(uint16_t); +}; +void PromptProcessor::init_io( + IMemAlloc* buffer_manager, + Result method_meta) { + input_tensors_.reserve(method_meta->num_inputs()); + output_tensors_.reserve(method_meta->num_outputs()); + // [I]: input_tokens + Result input_toks = method_meta->input_tensor_meta(0); + input_toks_.data = + reinterpret_cast(buffer_manager->allocate(input_toks_.size)); + input_toks_.tensor = std::make_unique( + input_toks->scalar_type(), + input_toks->sizes().size(), + const_cast(input_toks->sizes().data()), + input_toks_.data, + const_cast(input_toks->dim_order().data())); + input_tensors_.emplace_back(input_toks_.tensor.get()); + buffer_manager->add_memory_info( + input_toks_.data, input_toks_.size, input_toks.get()); + + // [I]: attention_mask + Result attention_mask = method_meta->input_tensor_meta(1); + attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(attention_mask_.size)); + attention_mask_.tensor = std::make_unique( + attention_mask->scalar_type(), + attention_mask->sizes().size(), + 
const_cast(attention_mask->sizes().data()), + attention_mask_.data, + const_cast( + attention_mask->dim_order().data())); + input_tensors_.emplace_back(attention_mask_.tensor.get()); + buffer_manager->add_memory_info( + attention_mask_.data, attention_mask_.size, attention_mask.get()); + + if (!is_bert()) { + // [I]: input_pos + Result input_pos = method_meta->input_tensor_meta(2); + input_pos_.data = + reinterpret_cast(buffer_manager->allocate(input_pos_.size)); + input_pos_.tensor = std::make_unique( + input_pos->scalar_type(), + input_pos->sizes().size(), + const_cast(input_pos->sizes().data()), + input_pos_.data, + const_cast(input_pos->dim_order().data())); + input_tensors_.emplace_back(input_pos_.tensor.get()); + buffer_manager->add_memory_info( + input_pos_.data, input_pos_.size, input_pos.get()); + + // [I] kv_cache + int index = 3; // bypass input_tokens, atten_mask, input_pos + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>>& cache = + (cache_group == 0 ? k_cache_in_ : v_cache_in_); + std::vector> cache_ptrs = (cache_group == 0) + ? kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head, ++index) { + Result kv_cache = method_meta->input_tensor_meta(index); + + uint8_t* cache_ptr = cache_ptrs[layer][head].buffer; + + cache[layer].emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + input_tensors_.emplace_back(cache[layer][head].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer][head]->nbytes(), kv_cache.get()); + } + } + } + } + + // [O]: logits + Result logits = method_meta->output_tensor_meta(0); + logits_.data = + reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.tensor = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + logits_.data, + const_cast(logits->dim_order().data())); + output_tensors_.emplace_back(logits_.tensor.get()); + buffer_manager->add_memory_info(logits_.data, logits_.size, logits.get()); + + // [O] kv_cache + int index = 1; + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>>& cache = + (cache_group == 0 ? k_cache_out_ : v_cache_out_); + std::vector> cache_ptrs = (cache_group == 0) + ? 
kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head, ++index) { + Result kv_cache = method_meta->output_tensor_meta(index); + uint8_t* cache_ptr = cache_ptrs[layer][head].output_buffer; + cache[layer].emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + output_tensors_.emplace_back(cache[layer][head].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer][head]->nbytes(), kv_cache.get()); + } + } + } + // Prepare the vector of EValue to run inference + inputs_.reserve(input_tensors_.size()); + for (auto& input_tensor : input_tensors_) { + inputs_.emplace_back(std::move(input_tensor)); + } +} + +void PromptProcessor::prepare_io( + const std::vector& prompt_tokens, + int64_t prompt_pos, + int64_t start_pos) { + for (int i = 0; i < metadata_.ar_len; i++) { + if (!is_bert()) { + // Prepare pos data + input_pos_.data[i] = start_pos + i; + } + + // Prepare input token data + if (prompt_pos + i < prompt_tokens.size()) { + // Support CPU 4-bit embedding, which requires int64 input. + // However, for QNN embedding, only int32 input is needed. + // Therefore, we need to cast to the correct type to write the data. + if (metadata_.use_int64_token) { + input_toks_.data[i] = prompt_tokens[prompt_pos + i]; + } else { + int32_t* input_toks_ptr = reinterpret_cast(input_toks_.data); + input_toks_ptr[i] = static_cast(prompt_tokens[prompt_pos + i]); + } + } + } +} + +Result PromptProcessor::prefill( + std::vector prompt_tokens, + int64_t start_pos) { + ET_CHECK_MSG(!prompt_tokens.empty(), "Prompt cannot be null"); + + // Calculate number of blocks + int32_t num_prompt_tokens = prompt_tokens.size(); + if (!is_bert()) { + ET_CHECK_MSG( + (start_pos + num_prompt_tokens) <= + (metadata_.context_len - metadata_.ar_len), + "The sequence length exceeds the maximum limit that the prompt processor can handle."); + } else { + ET_CHECK_MSG( + start_pos == 0, "Bert model doesn't support multi-turn conversation."); + } + + // store the token + int64_t cur_token; + int64_t prompt_pos = 0; + int64_t pos = start_pos; + int32_t n_update = metadata_.ar_len; + int num_iters = 1 + ((num_prompt_tokens - 1) / metadata_.ar_len); + ET_LOG( + Info, + "Prompt Processor: total %d prompt tokens (AR-%d * %d iters)", + num_prompt_tokens, + metadata_.ar_len, + num_iters); + + // Rearrange KV cache first + kv_manager_->rearrange_cache(metadata_.ar_len); + std::vector attention_map(metadata_.ar_len); + std::iota(attention_map.begin(), attention_map.end(), -1); + // Initialize attention mask with current position + kv_manager_->init_attention_mask( + attention_mask_.data, attention_map, metadata_.ar_len, pos); + // Initialize the output of the module + ET_CHECK_MSG( + decoder_runner_->set_outputs(method_name_, output_tensors_) == + executorch::runtime::Error::Ok, + "Failed to set output tensor for module %s", + method_name_.c_str()); + for (int i = 0; i < num_iters; ++i) { + // Fill in the token and position data + prepare_io(prompt_tokens, prompt_pos, pos); + // Only update data pointer of the cache to the tensor for SHIFT_POINTER + // mode + bool updated = kv_manager_->update_cache_tensor( + k_cache_in_, + k_cache_out_, + v_cache_in_, + v_cache_out_, + metadata_.ar_len, + pos); + // Only update the output of module for SHIFT_POINTER mode + if (updated) { + // Update 
the output of the module + ET_CHECK_MSG( + decoder_runner_->set_outputs(method_name_, output_tensors_) == + executorch::runtime::Error::Ok, + "Failed to set output tensor for module %s", + method_name_.c_str()); + } + // Run inference + decoder_runner_->step(method_name_, inputs_); + // In the last run, offset to the meaningful logits. + if (i == num_iters - 1) { + n_update = 1 + ((num_prompt_tokens - 1) % metadata_.ar_len); + } + // Update KV Cache with the output results + kv_manager_->update_cache(metadata_.ar_len, pos, n_update); + // Update attention mask with current position + kv_manager_->update_attention_mask( + attention_mask_.data, metadata_.ar_len, pos, n_update); + prompt_pos += metadata_.ar_len; + pos += metadata_.ar_len; + } + + cur_token = decoder_runner_->logits_to_token( + output_tensors_[0], + (num_prompt_tokens + metadata_.ar_len - 1) % metadata_.ar_len); + return cur_token; +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h new file mode 100644 index 00000000000..a9991a6c79a --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include +#include +#include +#include +#include +#include + +namespace example { +/** + * @class PromptProcessor + * @brief Class for processing prompts using decoder and key-value manager. + */ +class PromptProcessor { + public: + struct Metadata { + int32_t context_len; + int64_t num_heads; + int64_t num_layers; + int32_t ar_len; + int32_t vocab_size; + bool use_int64_token; + }; + PromptProcessor( + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + Metadata metadata); + + /** + * @brief Initialize I/O tensor and allocate I/O data buffer. + * @param buffer_manager Pointer to IMemAlloc instance which depends on + * kv_updater. + * @param method_meta Method metadata. + */ + void init_io( + IMemAlloc* buffer_manager, + executorch::runtime::Result method_meta); + + /** + * Prefill an LLM Module with the given text input. + * @param prompt_tokens The text prompt tokens to the LLM Module. Encoded by + * tokenizer. + * @param start_pos The starting position in KV cache of the input in the LLM + * Module. + * @return The next token of the LLM Module after prefill. + */ + executorch::runtime::Result prefill( + std::vector prompt_tokens, + int64_t start_pos); + /** + * @brief Get total I/O size in bytes (excluding the KV cache size) + * @return Total I/O size in bytes. + */ + inline const size_t total_prompt_processor_io_size_in_bytes() const { + return input_toks_.size + input_pos_.size + attention_mask_.size + + logits_.size; + } + + private: + // If the cache length is zero, it indicates a BERT model, which does not use + // position ids or KV cache inputs. + bool is_bert() const { + return metadata_.context_len == metadata_.ar_len; + } + /** + * @brief Fill in I/O buffers with prompt token and position. + * @param prompt_tokens Vector of prompt tokens. + * @param prompt_pos Position of the prompt. + * @param start_pos Starting position. 
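 * Illustration (hypothetical numbers): with ar_len = 32, a 50-token prompt
 * and start_pos = 0, prefill() runs two iterations, calling
 * prepare_io(prompt_tokens, 0, 0) and then prepare_io(prompt_tokens, 32, 32);
 * the second call only fills the first 50 - 32 = 18 token slots.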
+ */ + void prepare_io( + const std::vector& prompt_tokens, + int64_t prompt_pos, + int64_t start_pos); + DecoderRunner* decoder_runner_; + KVManager* kv_manager_; + std::string method_name_; + + // metadata + Metadata metadata_; + + // inputs and outputs + TensorStruct input_toks_; + TensorStruct input_pos_; + TensorStruct attention_mask_; + TensorStruct logits_; + + // layer -> head -> TensorImpl + std::vector>> + k_cache_in_; + std::vector>> + v_cache_in_; + std::vector>> + k_cache_out_; + std::vector>> + v_cache_out_; + + std::vector inputs_; + std::vector input_tensors_; + std::vector output_tensors_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp new file mode 100644 index 00000000000..f0cc6d9a7a2 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +using executorch::runtime::MemoryAllocator; +using executorch::runtime::TensorInfo; + +namespace example { +RpcMem::RpcMem( + const size_t total_cache_size, + const size_t total_prompt_processor_io_size, + const size_t total_token_generator_io_size) + : calculated_offsets_(0) { + size_t total_bytes = total_cache_size + total_prompt_processor_io_size + + total_token_generator_io_size; + shared_buffer_base_ptr_ = QnnExecuTorchAllocCustomMem( + total_bytes, MemoryAllocator::kDefaultAlignment); +} +RpcMem::~RpcMem() { + QnnExecuTorchFreeCustomMem(shared_buffer_base_ptr_); +} + +std::byte* RpcMem::allocate(size_t data_size) { + std::byte* data_ptr = static_cast(shared_buffer_base_ptr_); + data_ptr += calculated_offsets_; + // Record the position of the data pointer + io_pos_map_[data_ptr] = calculated_offsets_; + calculated_offsets_ += data_size; + return data_ptr; +} + +void RpcMem::add_memory_info( + void* data_ptr, + size_t data_size, + TensorInfo tensor_info) { + if (auto it = io_pos_map_.find(static_cast(data_ptr)); + it == io_pos_map_.end()) { + ET_LOG(Error, "Shared buffer pointer %p is not found", data_ptr); + } + size_t pos = io_pos_map_[static_cast(data_ptr)]; + uint32_t* shape = const_cast( + reinterpret_cast(tensor_info.sizes().data())); + uint32_t rank = static_cast(tensor_info.sizes().size()); + executorch::aten::ScalarType scalar_type = tensor_info.scalar_type(); + CustomMemTensorInfo info = { + shared_buffer_base_ptr_, + data_ptr, + pos, + data_size, + shape, + rank, + scalar_type}; + QnnExecuTorchAddCustomMemTensorInfo(info); +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h new file mode 100644 index 00000000000..d8da945cb96 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include + +namespace example { +/** + * @class RpcMem + * @brief Final class for rpc memory allocation, implementing IMemAlloc + * interface. Used for SMART_MASK mode. 
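 * Rough usage sketch (sizes are illustrative):
 *   RpcMem rpc_mem(cache_bytes, prompt_io_bytes, token_io_bytes);
 *   std::byte* data = rpc_mem.allocate(nbytes);         // sub-allocate from the shared buffer
 *   rpc_mem.add_memory_info(data, nbytes, tensor_info); // register the region with QNN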
+ */ +class RpcMem final : public IMemAlloc { + public: + /** +   * @brief Constructor to allocate RpcMem with total sizes. +   * @param total_cache_size Total size of the cache. +   * @param total_prompt_processor_io_size Total size for prompt processor I/O. +   * @param total_token_generator_io_size Total size for token generator I/O. +   */ + RpcMem( + const size_t total_cache_size, + const size_t total_prompt_processor_io_size, + const size_t total_token_generator_io_size); + // Disable copy constructors, r-value referencing, etc + RpcMem(const RpcMem&) = delete; + RpcMem& operator=(const RpcMem&) = delete; + RpcMem(RpcMem&&) = delete; + RpcMem& operator=(RpcMem&&) = delete; + virtual ~RpcMem(); + /** + * @brief Allocate buffer of specified size with shared_buffer_base_ptr_. + * @param data_size Size of the data to allocate. + * @return Pointer to the allocated buffer. + */ + std::byte* allocate(size_t size) override; + + /** +   * @brief Add memory information into QNN Backend to register RpcMem to the +tensor. + * @param data_ptr Pointer to the data. + * @param data_size Size of the data. + * @param tensor_info Information about the tensor. + */ + void add_memory_info( + void* data_ptr, + size_t data_size, + executorch::runtime::TensorInfo tensor_info) override; + + private: + // shared buffer + void* shared_buffer_base_ptr_; + size_t calculated_offsets_; + std::unordered_map io_pos_map_; +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index dafc911a172..d348878294a 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -10,72 +10,76 @@ // logic. The module takes in a string as input and emits a string as output. #include +#include +#include #include -#include #include #include #include #include #include -#include +#include #include -#include -using executorch::aten::Tensor; using executorch::extension::Module; -using executorch::extension::llm::Sampler; +using executorch::extension::llm::get_rss_bytes; +using executorch::extension::llm::print_report; +using executorch::extension::llm::Stats; using executorch::extension::llm::time_in_ms; using executorch::runtime::Error; -using executorch::runtime::EValue; using executorch::runtime::MethodMeta; using executorch::runtime::Result; namespace example { - namespace { -static constexpr auto kTopp = 0.9f; -void printReport( - const Runner::Stats& stats, - const std::string& performance_output_path); -std::string statsToJsonString(const Runner::Stats& stats); +void print_performance_report( + const Stats& stats, + const std::string& performance_output_path) { + // For now, we just print the total inference time for CI, can save more info + // in future if needed. 
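  // Worked example (hypothetical numbers, assuming the scaling factor is 1000
  // units per second): 256 generated tokens over a 4000 ms window are written
  // out as 256 / 4000 * 1000 = 64 tokens per second.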
+ std::ofstream outfile(performance_output_path.c_str()); + if (outfile.is_open()) { + double num_tok = (stats.num_generated_tokens) / + (double)(stats.inference_end_ms - stats.inference_start_ms) * + stats.SCALING_FACTOR_UNITS_PER_SECOND; + outfile << num_tok; + outfile.close(); + } else { + ET_CHECK_MSG(false, "Error saving the inference speed file"); + } +} } // namespace Runner::Runner( - const std::vector& models_path, + const std::string& model_path, const std::string& tokenizer_path, const std::string& performance_output_path, - const float logits_scale, - const int32_t logits_offset, const float temperature, const int eval_mode, - const std::string& kv_updater, - const int num_iters) - : n_bos_(1), - n_eos_(1), - tokenizer_path_(tokenizer_path), + const std::string& kv_updater) + : tokenizer_path_(tokenizer_path), performance_output_path_(performance_output_path), - logits_scale_(logits_scale), - logits_offset_(logits_offset), temperature_(temperature), - eval_mode_(static_cast(eval_mode)), - kv_updater_(kv_updater), - num_iters_(num_iters) { - for (size_t i = 0; i < models_path.size(); ++i) { - modules_.push_back(std::make_shared( - models_path[i], Module::LoadMode::MmapUseMlockIgnoreErrors)); - ET_LOG(Info, "creating module: model_path=%s", models_path[i].c_str()); + eval_mode_(static_cast(eval_mode)) { + module_ = std::make_unique( + model_path, Module::LoadMode::MmapUseMlockIgnoreErrors); + if (kv_updater == "SmartMask") { + kv_updater_ = KVManagerMode::SMART_MASK; + } else if (kv_updater == "ShiftPointer") { + kv_updater_ = KVManagerMode::SHIFT_POINTER; + } else { + ET_CHECK_MSG(false, "kv updater (%s) not found", kv_updater.c_str()); } + ET_LOG(Info, "creating module: model_path=%s", model_path.c_str()); ET_LOG(Info, "creating runner: tokenizer_path=%s", tokenizer_path_.c_str()); ET_LOG(Info, "eval mode=%d", eval_mode_); + ET_LOG(Info, "kv updater=%s", kv_updater.c_str()); } bool Runner::is_loaded() const { - bool loaded = true; - for (const std::shared_ptr& module : modules_) { - loaded &= module->is_loaded(); - } - return loaded && tokenizer_ && sampler_; + return module_->is_loaded() && tokenizer_ && decoder_runner_ && + prompt_processor_ && token_generator_ && kv_manager_ && buffer_manager_; } Error Runner::load() { @@ -83,123 +87,37 @@ Error Runner::load() { return Error::Ok; } + std::string token_generator_method_name, prompt_processor_method_name; + std::vector method_names; switch (eval_mode_) { case EvalMode::kKVCached: - kv_forward_name_ = "forward"; - method_names_.emplace_back(kv_forward_name_); + prompt_processor_method_name = "forward"; + token_generator_method_name = "forward"; + method_names.emplace_back(token_generator_method_name); break; case EvalMode::kHybrid: - prefill_forward_name_ = "prefill_forward"; - kv_forward_name_ = "kv_forward"; - method_names_.emplace_back(prefill_forward_name_); - method_names_.emplace_back(kv_forward_name_); + prompt_processor_method_name = "prefill_forward"; + token_generator_method_name = "kv_forward"; + method_names.emplace_back(prompt_processor_method_name); + method_names.emplace_back(token_generator_method_name); break; case EvalMode::kUnsupported: - ET_CHECK_MSG(false, "Unsupported llama version"); + ET_CHECK_MSG(false, "Unsupported llama evaluation mode"); break; } - for (std::shared_ptr& module : modules_) { - if (!prefill_forward_name_.empty()) { - ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(prefill_forward_name_)); - } - if (!kv_forward_name_.empty()) { - 
ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kv_forward_name_)); - } - } - - if (!prefill_forward_name_.empty()) { - // Use attention mask length to retrieve prefill_ar_len and context length - // Prefill cache length equals to context_len - prefill_ar_len - auto atten_mask_meta = - get_methods_meta(prefill_forward_name_)[0]->input_tensor_meta(1); - prefill_ar_len_ = atten_mask_meta->sizes()[1]; - context_len_ = atten_mask_meta->sizes()[2]; - prefill_cache_len_ = context_len_ - prefill_ar_len_; - } - if (!kv_forward_name_.empty()) { - // Use attention mask length to retrieve kv ar len and context length - // Cache len equals to kv model context_len - kv_ar_len - auto atten_mask_meta = - get_methods_meta(kv_forward_name_)[0]->input_tensor_meta(1); - kv_ar_len_ = atten_mask_meta->sizes()[1]; - context_len_ = atten_mask_meta->sizes()[2]; - kv_cache_len_ = context_len_ - kv_ar_len_; - } - - // retrieve any method meta, can be either prefill or kv - // Try avoid getMetadataHelper as it is time consuming. - auto method_meta = get_methods_meta(method_names_[0])[0].get(); - int64_t num_layers = getMetadataHelper("get_n_layers", -1); - int64_t head_dim = method_meta.output_tensor_meta(1)->sizes()[1]; // k_cache - int64_t num_heads = (method_meta.num_outputs() - 1) / (num_layers * 2); - vocab_size_ = method_meta.output_tensor_meta(0)->sizes()[2]; // logit_tensor - use_int64_token_ = method_meta.input_tensor_meta(0)->scalar_type() == - executorch::aten::ScalarType::Long; - ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers"); - - if (kv_updater_ == "SmartMask") { - io_mgr_ = std::make_unique( - modules_, - context_len_, - prefill_ar_len_, - prefill_cache_len_, - kv_ar_len_, - kv_cache_len_, - vocab_size_, - num_layers, - head_dim, - num_heads, - eval_mode_, - prefill_forward_name_, - kv_forward_name_, - use_int64_token_); - } else if (kv_updater_ == "ShiftPointer") { - io_mgr_ = std::make_unique( - modules_, - context_len_, - prefill_ar_len_, - prefill_cache_len_, - kv_ar_len_, - kv_cache_len_, - vocab_size_, - num_layers, - head_dim, - num_heads, - eval_mode_, - prefill_forward_name_, - kv_forward_name_, - use_int64_token_); - } else { - ET_LOG(Error, "Using an unknown updater %s", kv_updater_.c_str()); - } - ET_LOG(Info, "creating io_memory"); - - // prepare io - io_mgr_->init_io(); - switch (eval_mode_) { - case EvalMode::kKVCached: - io_mgr_->prepare_kv_io(get_methods_meta(kv_forward_name_)); - break; - case EvalMode::kHybrid: - io_mgr_->prepare_prefill_io(get_methods_meta(prefill_forward_name_)); - io_mgr_->prepare_kv_io(get_methods_meta(kv_forward_name_)); - break; - case EvalMode::kUnsupported: - ET_CHECK_MSG(false, "unsupported mode"); - break; - } - - // llama3 tokenizer - tokenizer_ = example::get_tiktoken_for_llama(); + // load tokenizer. Assuming tiktoken is the default tokenizer + tokenizer_ = get_tiktoken_for_llama(); auto err = tokenizer_->load(tokenizer_path_); + auto eos_ids = std::make_unique>(); + // Rely on tiktoken to throw error if the artifact is incompatible. Then we + // fallback to BPE tokenizer. 
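  // Illustration: a Llama 3 artifact loads as Tiktoken here and "<|eot_id|>" is
  // added to eos_ids below, while a Llama 2 artifact fails this load and is
  // retried with the BPE tokenizer in the branch that follows.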
if (err != tokenizers::Error::Ok) { ET_LOG( Info, "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer", tokenizer_path_.c_str()); tokenizer_.reset(); - // llama2 tokenizer tokenizer_ = std::make_unique(); err = tokenizer_->load(tokenizer_path_); llama_version_ = LlamaVersion::kLlama2; @@ -208,354 +126,193 @@ Error Runner::load() { "failed to load tokenizer %s", tokenizer_path_.c_str()); } else { - eos_id_.insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]); + eos_ids->insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]); llama_version_ = LlamaVersion::kLlama3; } - bos_id_ = tokenizer_->bos_tok(); - eos_id_.insert(tokenizer_->eos_tok()); + eos_ids->insert(tokenizer_->eos_tok()); + int32_t vocab_size = tokenizer_->vocab_size(); + decoder_runner_ = + std::make_unique(module_.get(), vocab_size, temperature_); - // create sampler - sampler_ = std::make_unique( - vocab_size_, - temperature_, - kTopp, - static_cast(std::time(nullptr))); + ET_CHECK_OK_OR_RETURN_ERROR(decoder_runner_->load(method_names)); - return Error::Ok; -} + ET_LOG(Info, "Reading metadata from model"); + // Try avoid getMetadataHelper as it is time consuming. + Result method_meta = + module_->method_meta(token_generator_method_name); + // retrieve any method meta, can be either prefill or kv + int64_t num_layers = + ET_UNWRAP(module_->get("get_n_layers")).toScalar().to(); + ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers"); + // k_cache: [1, head_dim, seq_len] + int64_t head_dim = method_meta->output_tensor_meta(1)->sizes()[1]; + int64_t num_heads = (method_meta->num_outputs() - 1) / (num_layers * 2); + bool use_int64_token = method_meta->input_tensor_meta(0)->scalar_type() == + executorch::aten::ScalarType::Long; -template -T Runner::getMetadataHelper(std::string method_name, T default_val) { - T res = default_val; - if (modules_[0]->method_names()->count(method_name)) { - Result> outputs = modules_[0]->execute(method_name); - if (outputs.ok()) { - std::vector outs = outputs.get(); - if (outs.size() > 0) { - res = outs[0].to(); - } - } - } else { - ET_LOG( - Info, - "The model does not contain %s method, using default value %lld", - method_name.c_str(), - (long long)default_val); + // Use attention mask length to retrieve AR length and context length + // Cache len equals to context_len - ar_len + int32_t prompt_processor_ar_len, token_generator_ar_len, max_cache_len, + max_ar_len; + // atten mask: [1, AR-N, CL] + auto atten_mask_meta_token = method_meta->input_tensor_meta(1); + token_generator_ar_len = atten_mask_meta_token->sizes()[1]; + context_len_ = atten_mask_meta_token->sizes()[2]; + if (eval_mode_ == EvalMode::kKVCached) { + prompt_processor_ar_len = token_generator_ar_len; + } else if (eval_mode_ == EvalMode::kHybrid) { + auto atten_mask_meta_prompt = + module_->method_meta(prompt_processor_method_name) + ->input_tensor_meta(1); + prompt_processor_ar_len = atten_mask_meta_prompt->sizes()[1]; } - return res; -} - -int32_t Runner::logitsToToken(const Tensor& logits_tensor, int64_t pos) { - static std::vector logits_f(vocab_size_); - const uint16_t* logits = logits_tensor.data_ptr(); - // Since the logits are for all tokens, get the last token probabilities - auto* logits_last = logits; - - // offset to the meaningful logit we want. 
- if (logits_tensor.sizes().data()[1] > 1) { - logits_last += pos * vocab_size_; + if (prompt_processor_ar_len == context_len_) + max_cache_len = context_len_; + else + max_cache_len = context_len_ - + std::min(token_generator_ar_len, prompt_processor_ar_len); + max_ar_len = std::max(token_generator_ar_len, prompt_processor_ar_len); + + kv_manager_ = std::make_unique( + kv_updater_, + KVManager::Metadata{ + context_len_, + head_dim, + max_ar_len, + max_cache_len, + num_heads, + num_layers}); + + prompt_processor_ = std::make_unique( + decoder_runner_.get(), + kv_manager_.get(), + prompt_processor_method_name, + PromptProcessor::Metadata{ + context_len_, + num_heads, + num_layers, + prompt_processor_ar_len, + vocab_size, + use_int64_token}); + token_generator_ = std::make_unique( + tokenizer_.get(), + decoder_runner_.get(), + kv_manager_.get(), + token_generator_method_name, + std::move(eos_ids), + TokenGenerator::Metadata{ + context_len_, + num_heads, + num_layers, + token_generator_ar_len, + vocab_size, + use_int64_token, + }, + &stats_); + + buffer_manager_ = std::make_unique(); + if (kv_updater_ == KVManagerMode::SMART_MASK) { + buffer_manager_ = std::make_unique( + kv_manager_->total_cache_size_in_bytes(), + prompt_processor_->total_prompt_processor_io_size_in_bytes(), + token_generator_->total_token_generator_io_size_in_bytes()); } - // dequantize - for (int i = 0; i < vocab_size_; i++) { - logits_f[i] = (logits_last[i] - logits_offset_) * logits_scale_; - } - return sampler_->sample(logits_f.data()); -} + ET_LOG(Info, "creating io_memory"); + // prepare io + kv_manager_->init_cache(buffer_manager_.get(), prompt_processor_ar_len); + prompt_processor_->init_io( + buffer_manager_.get(), + module_->method_meta(prompt_processor_method_name)); + token_generator_->init_io( + buffer_manager_.get(), module_->method_meta(token_generator_method_name)); -void Runner::run_model_step( - const std::string& method_name, - std::vector>& inputs) { - for (size_t i = 0, num_modules = modules_.size(); i < num_modules; ++i) { - Result> outputs_res = - modules_[i]->execute(method_name, inputs[i]); - ET_CHECK_MSG( - outputs_res.error() == Error::Ok, "shard %zu inference failed", i); - } + return Error::Ok; } Error Runner::generate( - int32_t seq_len, const std::string& prompt, - const std::string& system_prompt, + int32_t seq_len, std::function token_callback, - std::function stats_callback) { - std::unordered_map>> - input_tensors, output_tensors; - std::unordered_map>> inputs; - if (!is_loaded() || (num_iters_ > 1)) { + std::function stats_callback, + bool echo, + bool warming) { + ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); + if (!is_loaded()) { stats_.model_load_start_ms = time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); - for (auto method_name : method_names_) { - for (int i = 0; i < modules_.size(); ++i) { - input_tensors[method_name].emplace_back( - io_mgr_->get_input_tensors(i, method_name)); - output_tensors[method_name].emplace_back( - io_mgr_->get_output_tensors(i, method_name)); - for (size_t j = 0; j < output_tensors[method_name][i].size(); ++j) { - ET_CHECK_MSG( - modules_[i]->set_output( - method_name, output_tensors[method_name][i][j], j) == - Error::Ok, - "failed to set output tensor for module %d's %zu'th output", - i, - j); - } - inputs[method_name].emplace_back(std::vector( - begin(input_tensors[method_name][i]), - end(input_tensors[method_name][i]))); - } - } + stats_.model_load_end_ms = time_in_ms(); } - stats_.model_load_end_ms = time_in_ms(); stats_.inference_start_ms = 
time_in_ms(); - ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); - - switch (llama_version_) { - case LlamaVersion::kLlama2: - prompt_.append(prompt); - break; - case LlamaVersion::kLlama3: - if (!system_prompt.empty()) { - prompt_.append("<|start_header_id|>system<|end_header_id|>\n\n"); - prompt_.append(system_prompt); - prompt_.append("<|eot_id|>"); - } - prompt_.append("<|start_header_id|>user<|end_header_id|>\n\n"); - prompt_.append(prompt); - prompt_.append( - "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); - if (token_callback) { - token_callback("<|begin_of_text|>"); - } - break; - default: - ET_CHECK_MSG(false, "unsupported llama version"); - break; - } - seq_len = (seq_len > 0 && seq_len <= context_len_) ? seq_len : context_len_; + int32_t n_bos = (cur_pos_ == 0) ? 1 : 0; tokenizers::Result> encode_res = - tokenizer_->encode(prompt_, n_bos_, 0); + tokenizer_->encode(prompt, n_bos, 0); ET_CHECK_TK_OK_OR_RETURN_ERROR( - encode_res.error(), "failed to encode prompt %s", prompt_.c_str()); + encode_res.error(), "failed to encode prompt %s", prompt.c_str()); + // encode the (string) prompt into tokens sequence std::vector prompt_tokens = encode_res.get(); int num_prompt_tokens = prompt_tokens.size(); + ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); ET_CHECK_MSG( - num_prompt_tokens < seq_len, + cur_pos_ + num_prompt_tokens < seq_len, "sequence length exceeded - please increase the seq_len value"); - int64_t pos = 0, prev_token, cur_token = prompt_tokens[0]; + // Prompt Processor first if (token_callback) { - token_callback(prompt_); + token_callback(prompt); } - auto prefill_execute = [&](const std::string& method_name) { - int num_iters = 1 + ((num_prompt_tokens - 1) / prefill_ar_len_); - ET_LOG( - Info, - "Prompt Processor: total %d tokens (AR-%d * %d iters)", - num_prompt_tokens, - prefill_ar_len_, - num_iters); - - for (int i = 0; i < num_iters; i++) { - io_mgr_->fill_prefill_toks(pos, prompt_tokens); - run_model_step(method_name, inputs[method_name]); - io_mgr_->update_prefill_io(cur_token, pos, output_tensors[method_name]); - pos += prefill_ar_len_; - } - Tensor& logits_tensor = output_tensors[method_name].back()[0]; - prev_token = prompt_tokens[num_prompt_tokens - 1]; - long sample_start_time_ms = time_in_ms(); - cur_token = logitsToToken( - logits_tensor, - (num_prompt_tokens + prefill_ar_len_ - 1) % prefill_ar_len_); - stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; - - auto piece_res = tokenizer_->decode(prev_token, cur_token); - ET_CHECK(piece_res.ok()); - if (token_callback) { - token_callback(piece_res.get().c_str()); - } - - pos = num_prompt_tokens; - stats_.first_token_ms = time_in_ms(); - stats_.prompt_eval_end_ms = time_in_ms(); - }; - - auto kv_execute = [&](const std::string& method_name) { - io_mgr_->fill_kv_tok_mask(pos, cur_token); - while (pos < seq_len - 1) { - // inference - run_model_step(method_name, inputs[method_name]); - Tensor& logits_tensor = output_tensors[method_name].back()[0]; - // hybrid mode will check these stats_ at prefill(prefill) - if (eval_mode_ == EvalMode::kKVCached) { - if (pos == num_prompt_tokens) { - stats_.first_token_ms = time_in_ms(); - } else if (pos == num_prompt_tokens - 1) { - stats_.prompt_eval_end_ms = time_in_ms(); - } - } - prev_token = cur_token; - long sample_start_time_ms = time_in_ms(); - cur_token = logitsToToken(logits_tensor, pos); - stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; + auto prefill_res = 
prompt_processor_->prefill(prompt_tokens, cur_pos_); + ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); + uint64_t cur_token = prefill_res.get(); + cur_pos_ += num_prompt_tokens; + stats_.first_token_ms = time_in_ms(); + stats_.prompt_eval_end_ms = time_in_ms(); - if (pos < num_prompt_tokens - 1) { - cur_token = prompt_tokens[pos + 1]; - } - io_mgr_->update_kv_io(cur_token, ++pos, output_tensors[method_name]); - auto piece_res = tokenizer_->decode(prev_token, cur_token); - ET_CHECK(piece_res.ok()); - - if (token_callback && pos >= num_prompt_tokens) { - token_callback(piece_res.get().c_str()); - } - - if (pos >= num_prompt_tokens && eos_id_.count(cur_token) > 0) { - ET_LOG(Info, "\nReached to the end of generation"); - break; - } - } - }; - - switch (eval_mode_) { - case EvalMode::kKVCached: - kv_execute(kv_forward_name_); - break; - case EvalMode::kHybrid: - prefill_execute(prefill_forward_name_); - io_mgr_->update_prefill_to_kv_io( - cur_token, pos, output_tensors[kv_forward_name_]); - kv_execute(kv_forward_name_); - break; - default: - ET_CHECK_MSG(false, "Unsupported eval mode"); - break; + // print the first token from prefill. No prev_token so use cur_token for it. + if (token_callback) { + token_callback( + ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); } + ET_LOG( + Info, + "RSS after prompt prefill: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // start the main loop + prompt_tokens.push_back(cur_token); + int64_t num_generated_tokens = ET_UNWRAP(token_generator_->generate( + prompt_tokens, cur_pos_, seq_len, token_callback)); stats_.inference_end_ms = time_in_ms(); - if (pos == seq_len) { - ET_LOG(Info, "\nSequence length (%i tokens) reached!", seq_len); + ET_LOG( + Info, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + cur_pos_ += num_generated_tokens; + if (cur_pos_ == seq_len) { + ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); } stats_.num_prompt_tokens = num_prompt_tokens; - stats_.num_generated_tokens = pos - num_prompt_tokens; - printReport(stats_, performance_output_path_); + stats_.num_generated_tokens = num_generated_tokens; + print_report(stats_); + print_performance_report(stats_, performance_output_path_); if (stats_callback) { stats_callback(stats_); } - io_mgr_->reset_io( - get_methods_meta(prefill_forward_name_), - get_methods_meta(kv_forward_name_)); - prompt_.clear(); return Error::Ok; } -namespace { -void printReport( - const Runner::Stats& stats, - const std::string& performance_output_path) { - printf("PyTorchObserver %s\n", statsToJsonString(stats).c_str()); - - ET_LOG( - Info, - "\tPrompt Tokens: %" PRIu64 " Generated Tokens: %" PRIu64, - stats.num_prompt_tokens, - stats.num_generated_tokens); - - ET_LOG( - Info, - "\tModel Load Time:\t\t%f (seconds)", - ((double)(stats.model_load_end_ms - stats.model_load_start_ms) / - stats.SCALING_FACTOR_UNITS_PER_SECOND)); - double inference_time_ms = - (double)(stats.inference_end_ms - stats.inference_start_ms); - ET_LOG( - Info, - "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND, - - (stats.num_generated_tokens) / - (double)(stats.inference_end_ms - stats.inference_start_ms) * - stats.SCALING_FACTOR_UNITS_PER_SECOND); - double prompt_eval_time = - (double)(stats.prompt_eval_end_ms - stats.inference_start_ms); - ET_LOG( - Info, - "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - prompt_eval_time / 
stats.SCALING_FACTOR_UNITS_PER_SECOND, - (stats.num_prompt_tokens) / prompt_eval_time * - stats.SCALING_FACTOR_UNITS_PER_SECOND); - - double eval_time = - (double)(stats.inference_end_ms - stats.prompt_eval_end_ms); - ET_LOG( - Info, - "\t\tGenerated %" PRIu64 - " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - stats.num_generated_tokens, - eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, - stats.num_generated_tokens / eval_time * - stats.SCALING_FACTOR_UNITS_PER_SECOND); - - // Time to first token is measured from the start of inference, excluding - // model load time. - ET_LOG( - Info, - "\tTime to first generated token:\t%f (seconds)", - ((double)(stats.first_token_ms - stats.inference_start_ms) / - stats.SCALING_FACTOR_UNITS_PER_SECOND)); - - ET_LOG( - Info, - "\tSampling time over %" PRIu64 " tokens:\t%f (seconds)", - stats.num_generated_tokens, - (double)stats.aggregate_sampling_time_ms / - stats.SCALING_FACTOR_UNITS_PER_SECOND); - - // For now, we just print the total inference time for CI, can save more info - // in future if needed. - - std::ofstream outfile(performance_output_path.c_str()); - if (outfile.is_open()) { - double num_tok = (stats.num_generated_tokens) / - (double)(stats.inference_end_ms - stats.inference_start_ms) * - stats.SCALING_FACTOR_UNITS_PER_SECOND; - outfile << num_tok; - outfile.close(); - } else { - ET_CHECK_MSG(false, "Error saving the inference speed file"); +Result Runner::get_llama_version() { + if (!is_loaded()) { + stats_.model_load_start_ms = time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_.model_load_end_ms = time_in_ms(); } + return llama_version_; } -std::string statsToJsonString(const Runner::Stats& stats) { - std::stringstream ss; - ss << "{\"prompt_tokens\":" << stats.num_prompt_tokens << "," - << "\"generated_tokens\":" << stats.num_generated_tokens << "," - << "\"model_load_start_ms\":" << stats.model_load_start_ms << "," - << "\"model_load_end_ms\":" << stats.model_load_end_ms << "," - << "\"inference_start_ms\":" << stats.inference_start_ms << "," - << "\"inference_end_ms\":" << stats.inference_end_ms << "," - << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << "," - << "\"first_token_ms\":" << stats.first_token_ms << "," - << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms - << "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":" - << stats.SCALING_FACTOR_UNITS_PER_SECOND << "}"; - return ss.str(); -} -} // namespace - -std::vector> Runner::get_methods_meta( - std::string& method_name) { - std::vector> methods_meta; - methods_meta.reserve(modules_.size()); - for (std::shared_ptr& module : modules_) { - methods_meta.emplace_back(module->method_meta(method_name)); - } - return methods_meta; -} } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h index e693bcd7077..708f91157a3 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h @@ -15,110 +15,70 @@ #include #include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include #include - namespace example { +enum LlamaVersion { + kLlama2 = 0, + kLlama3, +}; class Runner { public: explicit Runner( - const std::vector& models_path, + const std::string& model_path, const std::string& tokenizer_path, - const std::string& performance_output_path_, - const float logits_scale, - const int32_t logits_offset, - const float temperature, - const int 
eval_mode, - const std::string& kv_updater, - const int num_iters); - - struct Stats { - // Scaling factor for timestamps - in this case, we use ms. - const long SCALING_FACTOR_UNITS_PER_SECOND = 1000; - // Time stamps for the different stages of the execution - // model_load_start_ms: Start of model loading. - long model_load_start_ms; - // model_load_end_ms: End of model loading. - long model_load_end_ms; - // inference_start_ms: Immediately after the model is loaded (or we check - // for model load), measure the inference time. - long inference_start_ms; - // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right - // before the inference loop starts - long prompt_eval_end_ms; - // first_token: Timestamp when the first generated token is emitted - long first_token_ms; - // inference_end_ms: End of inference/generation. - long inference_end_ms; - // Keep a running total of the time spent in sampling. - long aggregate_sampling_time_ms; - // Token count from prompt - int64_t num_prompt_tokens; - // Token count from generated (total - prompt) - int64_t num_generated_tokens; - }; + const std::string& performance_output_path, + const float temperature = 0.8f, + const int eval_mode = EvalMode::kKVCached, + const std::string& kv_updater = "SmartMask"); bool is_loaded() const; executorch::runtime::Error load(); + // TODO: Support echo and warming executorch::runtime::Error generate( - int32_t seq_len, const std::string& prompt, - const std::string& system_prompt, + int32_t seq_len, std::function token_callback = {}, - std::function stats_callback = {}); - void stop(); - std::vector> - get_methods_meta(std::string& method_name); + std::function stats_callback = {}, + bool echo = true, + bool warming = false); + void stop() {}; + executorch::runtime::Result get_llama_version(); private: - enum LlamaVersion { - kLlama2 = 0, - kLlama3, + enum EvalMode { + kKVCached = 0, + kHybrid, + kUnsupported, }; - template - T getMetadataHelper(std::string method_name, T default_val); - int32_t logitsToToken( - const executorch::aten::Tensor& logits_tensor, - int64_t pos); - void run_model_step( - const std::string& method_name, - std::vector>& inputs); - std::string prompt_; - // metadata + std::unique_ptr module_; int32_t context_len_{0}; - int32_t prefill_ar_len_{0}; - int32_t prefill_cache_len_{0}; - int32_t kv_ar_len_{0}; - int32_t kv_cache_len_{0}; - int32_t vocab_size_; - int32_t bos_id_; - std::unordered_set eos_id_; - const int32_t n_bos_; - const int32_t n_eos_; - std::vector> modules_; + + int64_t cur_pos_{0}; + std::string tokenizer_path_; std::string performance_output_path_; - float logits_scale_; - int32_t logits_offset_; float temperature_; - std::unique_ptr tokenizer_; - std::unique_ptr sampler_; - Stats stats_; - std::unique_ptr io_mgr_; EvalMode eval_mode_; - bool use_int64_token_{false}; - std::string prefill_forward_name_; - std::string kv_forward_name_; - std::vector method_names_; LlamaVersion llama_version_; - std::string kv_updater_; - int num_iters_; -}; + KVManagerMode kv_updater_; + std::unique_ptr buffer_manager_; + std::unique_ptr kv_manager_; + std::unique_ptr tokenizer_; + std::unique_ptr decoder_runner_; + std::unique_ptr prompt_processor_; + std::unique_ptr token_generator_; + // stats + executorch::llm::Stats stats_; +}; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp new file mode 100644 index 00000000000..8d890637b13 --- /dev/null +++ 
b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +using executorch::aten::TensorImpl; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +namespace example { +TokenGenerator::TokenGenerator( + tokenizers::Tokenizer* tokenizer, + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + std::unique_ptr>&& eos_ids, + Metadata metadata, + executorch::llm::Stats* stats) + : tokenizer_(tokenizer), + decoder_runner_(decoder_runner), + kv_manager_(kv_manager), + method_name_(method_name), + eos_ids_(std::move(eos_ids)), + metadata_(metadata), + stats_(stats) { + k_cache_in_.resize(metadata_.num_layers); + v_cache_in_.resize(metadata_.num_layers); + k_cache_out_.resize(metadata_.num_layers); + v_cache_out_.resize(metadata_.num_layers); + + // Calculate I/O size + input_toks_.size = metadata_.ar_len * sizeof(int64_t); + input_pos_.size = metadata_.ar_len * sizeof(int32_t); + attention_mask_.size = + metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); + logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(uint16_t); +} +void TokenGenerator::init_io( + IMemAlloc* buffer_manager, + Result method_meta) { + input_tensors_.reserve(method_meta->num_inputs()); + output_tensors_.reserve(method_meta->num_outputs()); + // [I]: input_tokens + Result input_toks = method_meta->input_tensor_meta(0); + input_toks_.data = + reinterpret_cast(buffer_manager->allocate(input_toks_.size)); + input_toks_.tensor = std::make_unique( + input_toks->scalar_type(), + input_toks->sizes().size(), + const_cast(input_toks->sizes().data()), + input_toks_.data, + const_cast(input_toks->dim_order().data())); + input_tensors_.emplace_back(input_toks_.tensor.get()); + buffer_manager->add_memory_info( + input_toks_.data, input_toks_.size, input_toks.get()); + + // [I]: attention_mask + Result attention_mask = method_meta->input_tensor_meta(1); + attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(attention_mask_.size)); + attention_mask_.tensor = std::make_unique( + attention_mask->scalar_type(), + attention_mask->sizes().size(), + const_cast(attention_mask->sizes().data()), + attention_mask_.data, + const_cast( + attention_mask->dim_order().data())); + input_tensors_.emplace_back(attention_mask_.tensor.get()); + buffer_manager->add_memory_info( + attention_mask_.data, attention_mask_.size, attention_mask.get()); + + // [I]: input_pos + Result input_pos = method_meta->input_tensor_meta(2); + input_pos_.data = + reinterpret_cast(buffer_manager->allocate(input_pos_.size)); + input_pos_.tensor = std::make_unique( + input_pos->scalar_type(), + input_pos->sizes().size(), + const_cast(input_pos->sizes().data()), + input_pos_.data, + const_cast(input_pos->dim_order().data())); + input_tensors_.emplace_back(input_pos_.tensor.get()); + buffer_manager->add_memory_info( + input_pos_.data, input_pos_.size, input_pos.get()); + + // [I] kv_cache + int index = 3; // bypass input_tokens, atten_mask, input_pos + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>>& cache = + (cache_group == 0 ? k_cache_in_ : v_cache_in_); + std::vector> cache_ptrs = (cache_group == 0) + ? 
kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head, ++index) { + Result kv_cache = method_meta->input_tensor_meta(index); + + uint8_t* cache_ptr = cache_ptrs[layer][head].buffer; + + cache[layer].emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + input_tensors_.emplace_back(cache[layer][head].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer][head]->nbytes(), kv_cache.get()); + } + } + } + + // [O]: logits + Result logits = method_meta->output_tensor_meta(0); + logits_.data = + reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.tensor = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + logits_.data, + const_cast(logits->dim_order().data())); + output_tensors_.emplace_back(logits_.tensor.get()); + buffer_manager->add_memory_info(logits_.data, logits_.size, logits.get()); + + // [O] kv_cache + index = 1; + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>>& cache = + (cache_group == 0 ? k_cache_out_ : v_cache_out_); + std::vector> cache_ptrs = (cache_group == 0) + ? kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head, ++index) { + Result kv_cache = method_meta->output_tensor_meta(index); + uint8_t* cache_ptr = cache_ptrs[layer][head].output_buffer; + cache[layer].emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + output_tensors_.emplace_back(cache[layer][head].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer][head]->nbytes(), kv_cache.get()); + } + } + } + // Prepare the vector of EValue to run inference + inputs_.reserve(input_tensors_.size()); + for (auto& input_tensor : input_tensors_) { + inputs_.emplace_back(std::move(input_tensor)); + } +} + +// This function only considers the case where token_generator_ar_len equals 1. +void TokenGenerator::prepare_io(uint64_t cur_token, int64_t start_pos) { + // update input_tok + *input_toks_.data = + metadata_.use_int64_token ? 
cur_token : static_cast(cur_token);
+  // update position_ids
+  *input_pos_.data = static_cast(start_pos);
+}
+
+Result TokenGenerator::generate(
+    std::vector tokens,
+    int64_t start_pos,
+    int32_t seq_len,
+    std::function token_callback) {
+  ET_CHECK_MSG(
+      !tokens.empty(), "Token generation loop shouldn't take empty tokens");
+  int64_t pos = start_pos; // position in the sequence
+
+  // Token after prefill
+  uint64_t cur_token = tokens.back();
+  uint64_t prev_token;
+  // Rearrange KV cache first
+  kv_manager_->rearrange_cache(metadata_.ar_len);
+  std::vector attention_map(metadata_.ar_len);
+  std::iota(attention_map.begin(), attention_map.end(), -1);
+  // Initialize attention mask with current position
+  kv_manager_->init_attention_mask(
+      attention_mask_.data, attention_map, metadata_.ar_len, pos);
+  // Initialize the output of the module
+  ET_CHECK_MSG(
+      decoder_runner_->set_outputs(method_name_, output_tensors_) ==
+          executorch::runtime::Error::Ok,
+      "Failed to set output tensor for module %s",
+      method_name_.c_str());
+  // Generate our tokens
+  while (pos < seq_len - 1) {
+    // Fill in the token and position data
+    prepare_io(cur_token, pos);
+    // Only update the data pointer of the cache to the tensor for
+    // SHIFT_POINTER mode
+    bool updated = kv_manager_->update_cache_tensor(
+        k_cache_in_,
+        k_cache_out_,
+        v_cache_in_,
+        v_cache_out_,
+        metadata_.ar_len,
+        pos);
+    // Only update the output of the module for SHIFT_POINTER mode
+    if (updated) {
+      // Update the output of the module
+      ET_CHECK_MSG(
+          decoder_runner_->set_outputs(method_name_, output_tensors_) ==
+              executorch::runtime::Error::Ok,
+          "Failed to set output tensor for module %s",
+          method_name_.c_str());
+    }
+    // Run inference
+    auto logits_res = decoder_runner_->step(method_name_, inputs_);
+    ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error());
+    executorch::aten::Tensor& logits_tensor = logits_res.get();
+
+    prev_token = cur_token;
+
+    stats_->on_sampling_begin();
+    cur_token =
+        decoder_runner_->logits_to_token(logits_tensor, metadata_.ar_len);
+    stats_->on_sampling_end();
+
+    // Update KV Cache with the output results
+    kv_manager_->update_cache(metadata_.ar_len, pos, metadata_.ar_len);
+    // Update attention mask with current position
+    kv_manager_->update_attention_mask(
+        attention_mask_.data, metadata_.ar_len, pos, metadata_.ar_len);
+    pos++;
+
+    // print the token as a string; decode it with the tokenizer
+    token_callback(
+        ET_UNWRAP_TOKENIZER(tokenizer_->decode(prev_token, cur_token)));
+
+    // data-dependent terminating condition: stop as soon as an EOS token is
+    // generated
+    if (eos_ids_->count(cur_token) > 0) {
+      printf("\n");
+      ET_LOG(Info, "\nReached the end of generation");
+      break;
+    }
+  }
+  return pos - start_pos;
+}
+} // namespace example
diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h
new file mode 100644
index 00000000000..a5d69657955
--- /dev/null
+++ b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace example {
+/**
+ * @class TokenGenerator
+ * @brief Class for generating tokens using the decoder and key-value manager.
+ */
+class TokenGenerator {
+ public:
+  struct Metadata {
+    int32_t context_len;
+    int64_t num_heads;
+    int64_t num_layers;
+    int32_t ar_len;
+    int32_t vocab_size;
+    bool use_int64_token;
+  };
+  TokenGenerator(
+      tokenizers::Tokenizer* tokenizer,
+      DecoderRunner* decoder_runner,
+      KVManager* kv_manager,
+      const std::string& method_name,
+      std::unique_ptr>&& eos_ids,
+      Metadata metadata,
+      executorch::llm::Stats* stats);
+  /**
+   * @brief Initialize I/O tensors and allocate I/O data buffers.
+   * @param buffer_manager Pointer to the IMemAlloc instance, selected based on
+   * kv_updater.
+   * @param method_meta Method metadata.
+   */
+  void init_io(
+      IMemAlloc* buffer_manager,
+      executorch::runtime::Result method_meta);
+
+  /**
+   * @brief Generate tokens.
+   * @param tokens Vector of input tokens.
+   * @param start_pos Starting position for generation.
+   * @param seq_len Length of the sequence to generate.
+   * @param token_callback Callback function for generated tokens.
+   * @return The number of tokens generated.
+   */
+  executorch::runtime::Result generate(
+      std::vector tokens,
+      int64_t start_pos,
+      int32_t seq_len,
+      std::function token_callback);
+  inline const size_t total_token_generator_io_size_in_bytes() const {
+    return input_toks_.size + input_pos_.size + attention_mask_.size +
+        logits_.size;
+  }
+
+ private:
+  /**
+   * @brief Fill in I/O buffers with the current token and position.
+   * @param cur_token Current token.
+   * @param start_pos Starting position.
+   */
+  void prepare_io(uint64_t cur_token, int64_t start_pos);
+
+  tokenizers::Tokenizer* tokenizer_;
+  DecoderRunner* decoder_runner_;
+  KVManager* kv_manager_;
+  std::string method_name_;
+  std::unique_ptr> eos_ids_;
+
+  // metadata
+  Metadata metadata_;
+
+  // inputs and outputs
+  TensorStruct input_toks_;
+  TensorStruct input_pos_;
+  TensorStruct attention_mask_;
+  TensorStruct logits_;
+
+  // layer -> head -> TensorImpl
+  std::vector>>
+      k_cache_in_;
+  std::vector>>
+      v_cache_in_;
+  std::vector>>
+      k_cache_out_;
+  std::vector>>
+      v_cache_out_;
+
+  std::vector inputs_;
+  std::vector input_tensors_;
+  std::vector output_tensors_;
+
+  // stats
+  executorch::llm::Stats* stats_;
+};
+} // namespace example
diff --git a/examples/qualcomm/oss_scripts/llama/runner/utils.h b/examples/qualcomm/oss_scripts/llama/runner/utils.h
new file mode 100644
index 00000000000..5b20ba5d3d1
--- /dev/null
+++ b/examples/qualcomm/oss_scripts/llama/runner/utils.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+#include 
+#include 
+#include 
+
+// Template struct that couples a tensor implementation with its backing data
+// buffer and the buffer's size
+template 
+struct TensorStruct {
+  std::unique_ptr tensor;
+  T* data;
+  // data size in bytes
+  size_t size;
+};
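
For context, the following is a minimal, illustrative sketch (not part of the patch) of how the refactored Runner API declared in runner.h above might be driven. The include path, file paths, and the token-callback signature are assumptions inferred from the declarations in this diff; in practice the runner is invoked through qnn_llama_runner.cpp.

// usage_sketch.cpp -- illustrative only; the include and all paths are placeholders.
#include "runner.h"  // examples/qualcomm/oss_scripts/llama/runner/runner.h

#include <iostream>
#include <string>

int main() {
  // Construct the runner with a single compiled .pte and a tokenizer artifact.
  example::Runner runner(
      "/data/local/tmp/llama/kv_llama_qnn.pte",  // model_path (placeholder)
      "/data/local/tmp/llama/tokenizer.model",   // tokenizer_path (placeholder)
      "outputs/inference_speed.txt",             // performance_output_path
      /*temperature=*/0.8f,
      /*eval_mode=*/0,                           // 0 == kKVCached per the enum above
      /*kv_updater=*/"SmartMask");

  // Stream each decoded piece to stdout as it is generated.
  auto on_token = [](const std::string& piece) {
    std::cout << piece << std::flush;
  };

  // generate() lazily calls load() on first use and writes the performance
  // report to performance_output_path when decoding finishes.
  auto err =
      runner.generate("What is the capital of France?", /*seq_len=*/128, on_token);
  return err == executorch::runtime::Error::Ok ? 0 : 1;
}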
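
A small self-contained sketch of the arithmetic behind the single value that print_performance_report() writes to performance_output_path may also help: the stats timestamps are in milliseconds, so SCALING_FACTOR_UNITS_PER_SECOND converts the ratio into tokens per second. The sample numbers below are made up for illustration.

// throughput_sketch.cpp -- mirrors the tokens/second computation in runner.cpp.
#include <cstdint>
#include <cstdio>

int main() {
  const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;  // stats timestamps are in ms
  const int64_t num_generated_tokens = 96;            // example value
  const long inference_start_ms = 10000;              // example value
  const long inference_end_ms = 14000;                // 4 s of prefill + decode
  const double tokens_per_second = num_generated_tokens /
      (double)(inference_end_ms - inference_start_ms) *
      SCALING_FACTOR_UNITS_PER_SECOND;  // 96 / 4000 ms * 1000 = 24 tokens/s
  printf("%f\n", tokens_per_second);    // this is the value written to the file
  return 0;
}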