From 05233198cd4b63274b58b720043ebd4e05e3d338 Mon Sep 17 00:00:00 2001
From: Limin Tang
Date: Tue, 8 Jul 2025 18:13:34 -0700
Subject: [PATCH] Add a parameter to pass tokenizer to llama QNN runner (#12285)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/12285

Reviewed By: billmguo, cccclai

Differential Revision: D77910880
---
 .../oss_scripts/llama/runner/runner.cpp      | 64 +++++++++++--------
 .../oss_scripts/llama/runner/runner.h        |  4 +-
 .../qualcomm/oss_scripts/llama/targets.bzl   |  2 +
 3 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index 7a054d8e2ab..30235332ebd 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -61,14 +61,16 @@ Runner::Runner(
     const std::string& kv_updater,
     const int ngram,
     const int window,
-    const int gcap)
-    : tokenizer_path_(tokenizer_path),
+    const int gcap,
+    std::unique_ptr<tokenizers::Tokenizer> tokenizer)
+    : ngram_(ngram),
+      window_(window),
+      gcap_(gcap),
+      tokenizer_path_(tokenizer_path),
       performance_output_path_(performance_output_path),
       temperature_(temperature),
       eval_mode_(static_cast<EvalMode>(eval_mode)),
-      ngram_(ngram),
-      window_(window),
-      gcap_(gcap) {
+      tokenizer_(std::move(tokenizer)) {
   module_ = std::make_unique<Module>(
       model_path, Module::LoadMode::MmapUseMlockIgnoreErrors);
   stats_.reset();
@@ -115,30 +117,40 @@ Error Runner::load() {
       break;
   }
 
-  // load tokenizer. Assuming tiktoken is the default tokenizer
-  tokenizer_ = get_tiktoken_for_llama();
-  auto err = tokenizer_->load(tokenizer_path_);
   auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
-  // Rely on tiktoken to throw error if the artifact is incompatible. Then we
-  // fallback to BPE tokenizer.
-  if (err != tokenizers::Error::Ok) {
-    ET_LOG(
-        Info,
-        "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
-        tokenizer_path_.c_str());
-    tokenizer_.reset();
-    tokenizer_ = std::make_unique<tokenizers::Llama2cTokenizer>();
-    err = tokenizer_->load(tokenizer_path_);
-    llama_version_ = LlamaVersion::kLlama2;
-    ET_CHECK_MSG(
-        err == tokenizers::Error::Ok,
-        "failed to load tokenizer %s",
-        tokenizer_path_.c_str());
-  } else {
+  // TODO: remove this once we could release the new tokens used for the
+  // tokenizer
+  if (tokenizer_ != nullptr) {
     eos_ids->insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]);
-    llama_version_ = LlamaVersion::kLlama3;
+    eos_ids->insert(tokenizer_->encode("<|eot|>", 0, 0).get()[0]);
+    eos_ids->insert(tokenizer_->encode("<|end_of_text|>", 0, 0).get()[0]);
+  } else {
+    // load tokenizer. Assuming tiktoken is the default tokenizer
+    tokenizer_ = get_tiktoken_for_llama();
+    auto err = tokenizer_->load(tokenizer_path_);
+    auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
+    // Rely on tiktoken to throw error if the artifact is incompatible. Then we
+    // fallback to BPE tokenizer.
+    if (err != tokenizers::Error::Ok) {
+      ET_LOG(
+          Info,
+          "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
+          tokenizer_path_.c_str());
+      tokenizer_.reset();
+      tokenizer_ = std::make_unique<tokenizers::Llama2cTokenizer>();
+      err = tokenizer_->load(tokenizer_path_);
+      llama_version_ = LlamaVersion::kLlama2;
+      ET_CHECK_MSG(
+          err == tokenizers::Error::Ok,
+          "failed to load tokenizer %s",
+          tokenizer_path_.c_str());
+    } else {
+      eos_ids->insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]);
+      llama_version_ = LlamaVersion::kLlama3;
+    }
+    eos_ids->insert(tokenizer_->eos_tok());
   }
-  eos_ids->insert(tokenizer_->eos_tok());
+
   int32_t vocab_size = tokenizer_->vocab_size();
   decoder_runner_ = std::make_unique<DecoderRunner>(
       module_.get(), vocab_size, temperature_);
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h
index c318da50205..ec53e7463f6 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+
 namespace example {
 
 enum LlamaVersion {
@@ -41,7 +42,8 @@ class Runner {
       const std::string& kv_updater = "SmartMask",
       const int ngram = 0,
       const int window = 0,
-      const int gcap = 0);
+      const int gcap = 0,
+      std::unique_ptr<tokenizers::Tokenizer> tokenizer = nullptr);
 
   bool is_loaded() const;
   executorch::runtime::Error load();
diff --git a/examples/qualcomm/oss_scripts/llama/targets.bzl b/examples/qualcomm/oss_scripts/llama/targets.bzl
index cc64a7c3c7b..81e9f394565 100644
--- a/examples/qualcomm/oss_scripts/llama/targets.bzl
+++ b/examples/qualcomm/oss_scripts/llama/targets.bzl
@@ -33,6 +33,8 @@ def define_common_targets():
             "//executorch/extension/evalue_util:print_evalue",
             "//executorch/backends/qualcomm/runtime:runtime",
             "//pytorch/tokenizers:llama2c_tokenizer",
+            "//pytorch/tokenizers:regex_lookahead",
+            "//pytorch/tokenizers:tiktoken",
         ],
         external_deps = [
             "gflags",
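
Usage note (not part of the patch): with the new trailing constructor parameter, a caller can construct and load a tokenizer up front and hand ownership to the runner instead of relying on the runner's file-based loading. Below is a minimal caller-side sketch of that pattern. The helper name make_preloaded_tokenizer is hypothetical; the tokenizers::Llama2cTokenizer construction and the load()/Error::Ok check mirror calls that appear in the patch itself, and the header path is an assumption based on the //pytorch/tokenizers:llama2c_tokenizer dependency added in targets.bzl.

    // Hypothetical caller-side helper; mirrors the load/check pattern the
    // runner uses internally when no tokenizer is injected.
    #include <memory>
    #include <string>

    #include <pytorch/tokenizers/llama2c_tokenizer.h>  // assumed header path

    std::unique_ptr<tokenizers::Tokenizer> make_preloaded_tokenizer(
        const std::string& tokenizer_path) {
      auto tokenizer = std::make_unique<tokenizers::Llama2cTokenizer>();
      if (tokenizer->load(tokenizer_path) != tokenizers::Error::Ok) {
        // Returning nullptr lets Runner::load() fall back to its built-in
        // tiktoken-then-BPE loading path, as before this patch.
        return nullptr;
      }
      return tokenizer;
    }

The result would be passed as the new argument after gcap (it defaults to nullptr, so existing call sites are unaffected). Inside Runner::load(), a non-null injected tokenizer short-circuits the file-based loading and only registers the additional stop tokens.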