@@ -59,7 +59,7 @@ void print_performance_report(
     outfile << num_tok;
     outfile.close();
   } else {
-    ET_CHECK_MSG(false, "Error saving the inference speed file");
+    ET_LOG(Error, "Error saving the inference speed file");
   }
 }

@@ -84,7 +84,7 @@ void save_logits(

 } // namespace

-std::unique_ptr<::tokenizers::Tokenizer> load_llama_tokenizer(
+std::unique_ptr<::tokenizers::Tokenizer> load_qnn_llama_tokenizer(
     const std::string& tokenizer_path,
     Version version) {
   auto special_tokens = get_special_tokens(version);
@@ -175,7 +175,7 @@ Error Runner::load() {
     eos_ids->insert(tokenizer_->encode("<|eot|>", 0, 0).get()[0]);
     eos_ids->insert(tokenizer_->encode("<|end_of_text|>", 0, 0).get()[0]);
   } else {
-    tokenizer_ = load_llama_tokenizer(tokenizer_path_, Version::Default);
+    tokenizer_ = load_qnn_llama_tokenizer(tokenizer_path_, Version::Default);
     if (tokenizer_ == nullptr) {
       ET_LOG(
           Error, "Failed to load tokenizer with %s", tokenizer_path_.c_str());
@@ -313,13 +313,29 @@ Error Runner::load() {
 }

 Error Runner::generate(
+    const std::string& prompt,
+    const executorch::extension::llm::GenerationConfig& config,
+    std::function<void(const std::string&)> token_callback,
+    std::function<void(const Stats&)> stats_callback) {
+  return generate_from_pos(prompt, 0, config, token_callback, stats_callback);
+}
+
+Error Runner::generate_from_pos(
+    const std::string& prompt,
+    int64_t start_pos,
+    const executorch::extension::llm::GenerationConfig& config,
+    std::function<void(const std::string&)> token_callback,
+    std::function<void(const Stats&)> stats_callback) {
+  // TODO: currently only support start_pos == 0
+  return generate_from_prompt_or_file(prompt, false, config, token_callback, stats_callback);
+}
+
+Error Runner::generate_from_prompt_or_file(
     const std::string& prompt,
     bool tokenized_prompt,
-    int32_t seq_len,
+    const executorch::extension::llm::GenerationConfig& config,
     std::function<void(const std::string&)> token_callback,
-    std::function<void(const Stats&)> stats_callback,
-    bool echo,
-    bool warming) {
+    std::function<void(const Stats&)> stats_callback) {
   ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null");
   if (!is_loaded()) {
     stats_.model_load_start_ms = time_in_ms();
@@ -328,6 +344,7 @@ Error Runner::generate(
   }
   stats_.inference_start_ms = time_in_ms();

+  int32_t seq_len = config.seq_len;
   seq_len = (seq_len > 0 && seq_len <= context_len_) ? seq_len : context_len_;
   int32_t n_bos = (cur_pos_ == 0) ? 1 : 0;

@@ -366,7 +383,7 @@ Error Runner::generate(
366383 " sequence length exceeded - please increase the seq_len value" );
367384
368385 // Prompt Processor first
369- if (token_callback) {
386+ if (token_callback && config. echo ) {
370387 token_callback (prompt);
371388 }
372389 bool dump_logits = dump_logits_path_.empty () ? false : true ;
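
For reference, a minimal caller sketch against the reworked interface. This is hedged and not part of the change itself: the include path and the example::Runner namespace are assumptions, while config.seq_len and config.echo are the two GenerationConfig fields the diff itself reads.

// Hypothetical caller sketch (not part of this diff).
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/runner.h>  // assumed path

#include <iostream>
#include <string>

void run_prompt(example::Runner& runner, const std::string& prompt) {
  executorch::extension::llm::GenerationConfig config;
  config.seq_len = 128;  // clamped to context_len_ inside the runner
  config.echo = true;    // prompt is echoed through the token callback

  auto err = runner.generate(
      prompt,
      config,
      [](const std::string& piece) { std::cout << piece << std::flush; },
      {});  // no stats callback in this sketch
  (void)err;  // a real caller would check err against Error::Ok
}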