From 05233198cd4b63274b58b720043ebd4e05e3d338 Mon Sep 17 00:00:00 2001
From: Limin Tang
Date: Tue, 8 Jul 2025 18:13:34 -0700
Subject: [PATCH] Add a parameter to pass tokenizer to llama QNN runner (#12285)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/12285

Reviewed By: billmguo, cccclai

Differential Revision: D77910880
---
 .../oss_scripts/llama/runner/runner.cpp      | 64 +++++++++++--------
 .../oss_scripts/llama/runner/runner.h        |  4 +-
 .../qualcomm/oss_scripts/llama/targets.bzl   |  2 +
 3 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index 7a054d8e2ab..30235332ebd 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -61,14 +61,16 @@ Runner::Runner(
     const std::string& kv_updater,
     const int ngram,
     const int window,
-    const int gcap)
-    : tokenizer_path_(tokenizer_path),
+    const int gcap,
+    std::unique_ptr<tokenizers::Tokenizer> tokenizer)
+    : ngram_(ngram),
+      window_(window),
+      gcap_(gcap),
+      tokenizer_path_(tokenizer_path),
       performance_output_path_(performance_output_path),
       temperature_(temperature),
       eval_mode_(static_cast<EvalMode>(eval_mode)),
-      ngram_(ngram),
-      window_(window),
-      gcap_(gcap) {
+      tokenizer_(std::move(tokenizer)) {
   module_ = std::make_unique<Module>(
       model_path, Module::LoadMode::MmapUseMlockIgnoreErrors);
   stats_.reset();
@@ -115,30 +117,40 @@ Error Runner::load() {
       break;
   }
 
-  // load tokenizer. Assuming tiktoken is the default tokenizer
-  tokenizer_ = get_tiktoken_for_llama();
-  auto err = tokenizer_->load(tokenizer_path_);
   auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
-  // Rely on tiktoken to throw error if the artifact is incompatible. Then we
-  // fallback to BPE tokenizer.
-  if (err != tokenizers::Error::Ok) {
-    ET_LOG(
-        Info,
-        "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
-        tokenizer_path_.c_str());
-    tokenizer_.reset();
-    tokenizer_ = std::make_unique<tokenizers::Llama2cTokenizer>();
-    err = tokenizer_->load(tokenizer_path_);
-    llama_version_ = LlamaVersion::kLlama2;
-    ET_CHECK_MSG(
-        err == tokenizers::Error::Ok,
-        "failed to load tokenizer %s",
-        tokenizer_path_.c_str());
-  } else {
+  // TODO: remove this once we could release the new tokens used for the
+  // tokenizer
+  if (tokenizer_ != nullptr) {
     eos_ids->insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]);
-    llama_version_ = LlamaVersion::kLlama3;
+    eos_ids->insert(tokenizer_->encode("<|eot|>", 0, 0).get()[0]);
+    eos_ids->insert(tokenizer_->encode("<|end_of_text|>", 0, 0).get()[0]);
+  } else {
+    // load tokenizer. Assuming tiktoken is the default tokenizer
+    tokenizer_ = get_tiktoken_for_llama();
+    auto err = tokenizer_->load(tokenizer_path_);
+    auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
+    // Rely on tiktoken to throw error if the artifact is incompatible. Then we
+    // fallback to BPE tokenizer.
+    if (err != tokenizers::Error::Ok) {
+      ET_LOG(
+          Info,
+          "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
+          tokenizer_path_.c_str());
+      tokenizer_.reset();
+      tokenizer_ = std::make_unique<tokenizers::Llama2cTokenizer>();
+      err = tokenizer_->load(tokenizer_path_);
+      llama_version_ = LlamaVersion::kLlama2;
+      ET_CHECK_MSG(
+          err == tokenizers::Error::Ok,
+          "failed to load tokenizer %s",
+          tokenizer_path_.c_str());
+    } else {
+      eos_ids->insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]);
+      llama_version_ = LlamaVersion::kLlama3;
+    }
+    eos_ids->insert(tokenizer_->eos_tok());
   }
-  eos_ids->insert(tokenizer_->eos_tok());
+
   int32_t vocab_size = tokenizer_->vocab_size();
   decoder_runner_ = std::make_unique<DecoderRunner>(
       module_.get(), vocab_size, temperature_);
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h
index c318da50205..ec53e7463f6 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+
 namespace example {
 
 enum LlamaVersion {
@@ -41,7 +42,8 @@ class Runner {
       const std::string& kv_updater = "SmartMask",
       const int ngram = 0,
       const int window = 0,
-      const int gcap = 0);
+      const int gcap = 0,
+      std::unique_ptr<tokenizers::Tokenizer> tokenizer = nullptr);
 
   bool is_loaded() const;
   executorch::runtime::Error load();
diff --git a/examples/qualcomm/oss_scripts/llama/targets.bzl b/examples/qualcomm/oss_scripts/llama/targets.bzl
index cc64a7c3c7b..81e9f394565 100644
--- a/examples/qualcomm/oss_scripts/llama/targets.bzl
+++ b/examples/qualcomm/oss_scripts/llama/targets.bzl
@@ -33,6 +33,8 @@ def define_common_targets():
             "//executorch/extension/evalue_util:print_evalue",
             "//executorch/backends/qualcomm/runtime:runtime",
             "//pytorch/tokenizers:llama2c_tokenizer",
+            "//pytorch/tokenizers:regex_lookahead",
+            "//pytorch/tokenizers:tiktoken",
         ],
         external_deps = [
             "gflags",
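
Usage note (not part of the patch): with the new trailing constructor parameter, a caller can construct and load a tokenizer up front and hand ownership to the runner instead of relying on the runner's file-based loading. Below is a minimal caller-side sketch of that pattern. The helper name make_preloaded_tokenizer is hypothetical; the tokenizers::Llama2cTokenizer construction and the load()/Error::Ok check mirror calls that appear in the patch itself, and the header path is an assumption based on the //pytorch/tokenizers:llama2c_tokenizer dependency added in targets.bzl.

    // Hypothetical caller-side helper; mirrors the load/check pattern the
    // runner uses internally when no tokenizer is injected.
    #include <memory>
    #include <string>

    #include <pytorch/tokenizers/llama2c_tokenizer.h>  // assumed header path

    std::unique_ptr<tokenizers::Tokenizer> make_preloaded_tokenizer(
        const std::string& tokenizer_path) {
      auto tokenizer = std::make_unique<tokenizers::Llama2cTokenizer>();
      if (tokenizer->load(tokenizer_path) != tokenizers::Error::Ok) {
        // Returning nullptr lets Runner::load() fall back to its built-in
        // tiktoken-then-BPE loading path, as before this patch.
        return nullptr;
      }
      return tokenizer;
    }

The result would be passed as the new argument after gcap (it defaults to nullptr, so existing call sites are unaffected). Inside Runner::load(), a non-null injected tokenizer short-circuits the file-based loading and only registers the additional stop tokens.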