@@ -21,6 +21,9 @@ public class ModelUtils {
// MediaTek
static final int MEDIATEK_TEXT_MODEL = 3;

// QNN static llama
static final int QNN_TEXT_MODEL = 4;

public static int getModelCategory(ModelType modelType, BackendType backendType) {
if (backendType.equals(BackendType.XNNPACK)) {
switch (modelType) {
@@ -35,6 +38,8 @@ public static int getModelCategory(ModelType modelType, BackendType backendType)
}
} else if (backendType.equals(BackendType.MEDIATEK)) {
return MEDIATEK_TEXT_MODEL;
} else if (backendType.equals(BackendType.QUALCOMM)) {
return QNN_TEXT_MODEL;
}

return TEXT_MODEL; // default
4 changes: 3 additions & 1 deletion examples/models/llama/runner/runner.h
@@ -18,6 +18,7 @@
#include <string>
#include <unordered_map>

#include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
#include <executorch/extension/llm/runner/irunner.h>
#include <executorch/extension/llm/runner/text_llm_runner.h>
#include <pytorch/tokenizers/tokenizer.h>
@@ -33,6 +34,7 @@ std::unique_ptr<llm::TextLLMRunner> create_llama_runner(
float temperature = -1.0f);

std::unique_ptr<tokenizers::Tokenizer> load_llama_tokenizer(
const std::string& tokenizer_path);
const std::string& tokenizer_path,
Version version = Version::Default);

} // namespace example
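
The second parameter lets callers of load_llama_tokenizer choose the tiktoken special-token set explicitly while the old single-argument calls keep working through the default. A minimal usage sketch, assuming Version comes from llama_tiktoken.h (pulled in by the new include above) and using a placeholder tokenizer path:

```cpp
#include <executorch/examples/models/llama/runner/runner.h>

#include <memory>
#include <string>

// Sketch only: "/data/local/tmp/tokenizer.model" is a placeholder path, and
// Version::Default mirrors the new default argument declared in runner.h.
std::unique_ptr<tokenizers::Tokenizer> load_default_tokenizer() {
  const std::string tokenizer_path = "/data/local/tmp/tokenizer.model";
  auto tokenizer =
      example::load_llama_tokenizer(tokenizer_path, example::Version::Default);
  if (tokenizer == nullptr) {
    // Treat a null tokenizer as a load failure, as the runners do.
  }
  return tokenizer;
}
```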
2 changes: 2 additions & 0 deletions examples/qualcomm/oss_scripts/llama/CMakeLists.txt
@@ -42,6 +42,8 @@ list(
${CMAKE_CURRENT_LIST_DIR}/runner/rpc_mem.h
${CMAKE_CURRENT_LIST_DIR}/runner/kv_manager.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/kv_manager.h
${EXECUTORCH_SOURCE_DIR}/examples/models/llama/runner/runner.cpp
${EXECUTORCH_SOURCE_DIR}/examples/models/llama/runner/runner.h
)

list(APPEND _llama_runner__srcs)
26 changes: 14 additions & 12 deletions examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -16,6 +16,7 @@

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/runner.h>
#include <executorch/extension/llm/runner/irunner.h>
#include <executorch/runtime/platform/log.h>
#include <gflags/gflags.h>
#include <fstream>
@@ -61,7 +62,7 @@ DEFINE_int32(
"Total number of tokens to generate (prompt + output).");
DEFINE_int32(
eval_mode,
0,
1,
"0: TokenGenerator(kv) / 1: HybridMode (prefill+kv) / 2: Lookahead Decoding");
DEFINE_string(
kv_updater,
@@ -172,25 +173,26 @@ void start_runner(
buf.push_back(c);
}
};

executorch::extension::llm::GenerationConfig config{
true,
-1,
false,
FLAGS_seq_len,
static_cast<float>(FLAGS_temperature),
0,
0};
if (use_tokenized_prompt) {
runner.generate(
FLAGS_tokenized_prompt.c_str(),
use_tokenized_prompt,
FLAGS_seq_len,
callback);
runner.generate_from_prompt_or_file(
FLAGS_tokenized_prompt.c_str(), use_tokenized_prompt, config, callback);
} else {
// generate tokens & store inference output
for (int i = 0; i < FLAGS_num_iters; i++) {
for (const auto& prompt : prompts) {
std::string formatted_prompt;
formatted_prompt = get_formatted_prompt(
prompt, FLAGS_system_prompt, decoder_model_version.get());
runner.generate(
formatted_prompt.c_str(),
use_tokenized_prompt,
FLAGS_seq_len,
callback);
runner.generate_from_prompt_or_file(
formatted_prompt.c_str(), use_tokenized_prompt, config, callback);
}
}
}
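
The positional GenerationConfig initializer above is hard to read on its own. Below is a hedged, annotated rewrite: the field names are an assumption inferred from llm::GenerationConfig in irunner.h rather than spelled out by this diff, while the values mirror the call site (echo on, no new-token cap, no warmup run, seq_len and temperature taken from the gflags, no extra BOS/EOS tokens).

```cpp
#include <executorch/extension/llm/runner/irunner.h>

// Sketch: same values as constructed in start_runner(), with assumed field
// names annotated for readability.
executorch::extension::llm::GenerationConfig make_generation_config(
    int32_t seq_len, float temperature) {
  return executorch::extension::llm::GenerationConfig{
      /*echo=*/true,
      /*max_new_tokens=*/-1,
      /*warming=*/false,
      /*seq_len=*/seq_len,
      /*temperature=*/temperature,
      /*num_bos=*/0,
      /*num_eos=*/0};
}
```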
43 changes: 29 additions & 14 deletions examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -9,6 +9,7 @@
// A llama 3.2 runner that includes preprocessing and post processing
// logic. The module takes in a string as input and emits a string as output.

#include <executorch/examples/models/llama/runner/runner.h>
Review comment (Contributor):
why do you need this particular runner?

Reply (Contributor Author, @rohansjoshi, Aug 20, 2025):
I'm only using the function load_llama_tokenizer from executorch/examples/models/llama/runner/runner, not the runner there. I'm trying to reuse code from examples/models.

#include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/client_mem.h>
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h>
@@ -58,7 +59,7 @@ void print_performance_report(
outfile << num_tok;
outfile.close();
} else {
ET_CHECK_MSG(false, "Error saving the inference speed file");
ET_LOG(Error, "Error saving the inference speed file");
}
}

@@ -83,13 +84,6 @@ void save_logits(

} // namespace

std::unique_ptr<::tokenizers::Tokenizer> load_llama_tokenizer(
const std::string& tokenizer_path,
Version version) {
auto special_tokens = get_special_tokens(version);
return llm::load_tokenizer(tokenizer_path, std::move(special_tokens));
}

template <typename T>
Runner<T>::Runner(
std::unique_ptr<executorch::extension::Module> module,
@@ -181,7 +175,8 @@ Error Runner<T>::load() {
eos_ids->insert(tokenizer_->encode("<|eot|>", 0, 0).get()[0]);
eos_ids->insert(tokenizer_->encode("<|end_of_text|>", 0, 0).get()[0]);
} else {
tokenizer_ = load_llama_tokenizer(tokenizer_path_, Version::Default);
tokenizer_ =
example::load_llama_tokenizer(tokenizer_path_, Version::Default);
if (tokenizer_ == nullptr) {
ET_LOG(
Error, "Failed to load tokenizer with %s", tokenizer_path_.c_str());
@@ -323,13 +318,32 @@ Error Runner<T>::load() {

template <typename T>
Error Runner<T>::generate(
const std::string& prompt,
const llm::GenerationConfig& config,
std::function<void(const std::string&)> token_callback,
std::function<void(const Stats&)> stats_callback) {
return generate_from_pos(prompt, 0, config, token_callback, stats_callback);
}

template <typename T>
Error Runner<T>::generate_from_pos(
const std::string& prompt,
int64_t start_pos,
const llm::GenerationConfig& config,
std::function<void(const std::string&)> token_callback,
std::function<void(const Stats&)> stats_callback) {
// TODO: currently only support start_pos == 0
return generate_from_prompt_or_file(
prompt, false, config, token_callback, stats_callback);
}

template <typename T>
Error Runner<T>::generate_from_prompt_or_file(
const std::string& prompt,
bool tokenized_prompt,
int32_t seq_len,
const llm::GenerationConfig& config,
std::function<void(const std::string&)> token_callback,
std::function<void(const Stats&)> stats_callback,
bool echo,
bool warming) {
std::function<void(const Stats&)> stats_callback) {
ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null");
if (!is_loaded()) {
stats_.model_load_start_ms = time_in_ms();
@@ -338,6 +352,7 @@ Error Runner<T>::generate(
}
stats_.inference_start_ms = time_in_ms();

int32_t seq_len = config.seq_len;
seq_len = (seq_len > 0 && seq_len <= context_len_) ? seq_len : context_len_;
int32_t n_bos = (cur_pos_ == 0) ? 1 : 0;

@@ -376,7 +391,7 @@ Error Runner<T>::generate(
"sequence length exceeded - please increase the seq_len value");

// Prompt Processor first
if (token_callback) {
if (token_callback && config.echo) {
token_callback(prompt);
}
bool dump_logits = dump_logits_path_.empty() ? false : true;
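
With generate() and generate_from_pos() now funneling into generate_from_prompt_or_file(), and seq_len/echo coming from the GenerationConfig instead of extra parameters, the QNN runner can be driven through the common llm::IRunner surface. A minimal caller-side sketch under that assumption:

```cpp
#include <executorch/extension/llm/runner/irunner.h>

#include <iostream>
#include <string>

// Sketch: stream tokens from any llm::IRunner, including the QNN Runner<T>.
// seq_len is the total budget (prompt + generated) and is clamped to the
// model context length inside generate_from_prompt_or_file().
executorch::runtime::Error run_prompt(
    executorch::extension::llm::IRunner& runner,
    const std::string& prompt,
    int32_t seq_len) {
  executorch::extension::llm::GenerationConfig config;
  config.seq_len = seq_len;  // read back as config.seq_len inside the runner
  config.echo = true;        // prompt is echoed only when config.echo is set
  return runner.generate(
      prompt,
      config,
      [](const std::string& piece) { std::cout << piece << std::flush; },
      /*stats_callback=*/{});
}
```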
30 changes: 21 additions & 9 deletions examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -21,6 +21,7 @@
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h>
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h>
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/token_generator.h>
#include <executorch/extension/llm/runner/irunner.h>
#include <executorch/extension/llm/runner/stats.h>
#include <executorch/extension/module/module.h>
#include <pytorch/tokenizers/tokenizer.h>
@@ -41,7 +42,7 @@ enum KvBitWidth {
};

template <typename T>
class Runner {
class Runner : public executorch::extension::llm::IRunner {
public:
explicit Runner(
std::unique_ptr<executorch::extension::Module> module,
@@ -51,25 +52,36 @@ class Runner {
const std::string& performance_output_path,
const std::string& dump_logits_path,
const float temperature = 0.8f,
const int eval_mode = EvalMode::kKVCached,
const int eval_mode = EvalMode::kHybrid,
const std::string& kv_updater = "SmartMask",
const int ngram = 0,
const int window = 0,
const int gcap = 0,
std::unique_ptr<tokenizers::Tokenizer> tokenizer = nullptr);

bool is_loaded() const;
executorch::runtime::Error load();
bool is_loaded() const override;
executorch::runtime::Error load() override;
// TODO: Support echo and warming
executorch::runtime::Error generate(
const std::string& prompt,
const executorch::extension::llm::GenerationConfig& config,
std::function<void(const std::string&)> token_callback = {},
std::function<void(const executorch::llm::Stats&)> stats_callback = {})
override;
executorch::runtime::Error generate_from_pos(
const std::string& prompt,
int64_t start_pos,
const executorch::extension::llm::GenerationConfig& config,
std::function<void(const std::string&)> token_callback = {},
std::function<void(const executorch::llm::Stats&)> stats_callback = {})
override;
executorch::runtime::Error generate_from_prompt_or_file(
const std::string& prompt,
bool tokenized_prompt,
int32_t seq_len,
const executorch::extension::llm::GenerationConfig& config,
std::function<void(const std::string&)> token_callback = {},
std::function<void(const executorch::llm::Stats&)> stats_callback = {},
bool echo = true,
bool warming = false);
void stop() {};
std::function<void(const executorch::llm::Stats&)> stats_callback = {});
void stop() override {};
executorch::runtime::Result<DecoderModelVersion> get_decoder_model_version();

private:
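
Because Runner<T> now implements llm::IRunner, a host such as the JNI layer can hold it in the same IRunner pointer it already uses for the other backends. A construction sketch with placeholder paths; the /*name=*/ annotations are assumptions following the arguments visible in this header and in the jni_layer_llama.cpp hunk below, and all trailing defaults (temperature, eval_mode, kv_updater, lookahead settings, tokenizer) are left untouched:

```cpp
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/runner.h>
#include <executorch/extension/llm/runner/irunner.h>
#include <executorch/extension/module/module.h>

#include <memory>

// Sketch only: the .pte and tokenizer paths are placeholders, and the
// parameter-name comments are assumptions about the constructor order.
std::unique_ptr<executorch::extension::llm::IRunner> make_qnn_runner() {
  auto module = std::make_unique<executorch::extension::Module>(
      "/data/local/tmp/llama_qnn.pte",
      executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
  return std::make_unique<example::Runner<uint16_t>>(
      std::move(module),
      /*decoder_model=*/"llama3",
      /*model_path=*/"/data/local/tmp/llama_qnn.pte",
      /*tokenizer_path=*/"/data/local/tmp/tokenizer.model",
      /*performance_output_path=*/"",
      /*dump_logits_path=*/"");
}
```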
1 change: 1 addition & 0 deletions examples/qualcomm/oss_scripts/llama/targets.bzl
@@ -29,6 +29,7 @@ def define_common_targets():
exported_deps = [
"//executorch/extension/module:module",
"//executorch/extension/llm/sampler:sampler",
"//executorch/examples/models/llama/runner:runner",
"//executorch/examples/models/llama/tokenizer:tiktoken",
"//executorch/extension/evalue_util:print_evalue",
"//executorch/backends/qualcomm/runtime:runtime",
29 changes: 29 additions & 0 deletions extension/android/CMakeLists.txt
@@ -179,6 +179,35 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama/runner
)

target_sources(
executorch_jni
PRIVATE ${EXECUTORCH_ROOT}/extension/llm/runner/llm_runner_helper.cpp
)

target_include_directories(
executorch_jni PRIVATE ${EXECUTORCH_ROOT}/extension/llm/runner
)

if(QNN_SDK_ROOT)
target_sources(
executorch_jni
PRIVATE
${EXECUTORCH_ROOT}/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
${EXECUTORCH_ROOT}/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.cpp
${EXECUTORCH_ROOT}/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp
${EXECUTORCH_ROOT}/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp
${EXECUTORCH_ROOT}/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp
${EXECUTORCH_ROOT}/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp
${EXECUTORCH_ROOT}/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp
)

target_include_directories(
executorch_jni
PRIVATE ${EXECUTORCH_ROOT}/examples/qualcomm/oss_scripts/llama/runner
)
target_compile_definitions(executorch_jni PRIVATE EXECUTORCH_BUILD_QNN=1)
endif()

if(NEURON_BUFFER_ALLOCATOR_LIB)
target_sources(
executorch_jni
23 changes: 23 additions & 0 deletions extension/android/jni/jni_layer_llama.cpp
@@ -15,6 +15,7 @@

#include <executorch/examples/models/llama/runner/runner.h>
#include <executorch/examples/models/llava/runner/llava_runner.h>
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/runner.h>
#include <executorch/extension/llm/runner/image.h>
#include <executorch/extension/llm/runner/irunner.h>
#include <executorch/runtime/platform/log.h>
@@ -29,6 +30,10 @@
#include <fbjni/ByteBuffer.h>
#include <fbjni/fbjni.h>

#if defined(EXECUTORCH_BUILD_QNN)
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/runner.h>
#endif

#if defined(EXECUTORCH_BUILD_MEDIATEK)
#include <executorch/examples/mediatek/executor_runner/mtk_llama_runner.h>
#endif
@@ -124,6 +129,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
constexpr static int MODEL_TYPE_CATEGORY_LLM = 1;
constexpr static int MODEL_TYPE_CATEGORY_MULTIMODAL = 2;
constexpr static int MODEL_TYPE_MEDIATEK_LLAMA = 3;
constexpr static int MODEL_TYPE_QNN_LLAMA = 4;

static facebook::jni::local_ref<jhybriddata> initHybrid(
facebook::jni::alias_ref<jclass>,
@@ -174,6 +180,22 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
model_path->toStdString(),
tokenizer_path->toStdString(),
data_path_str);
#if defined(EXECUTORCH_BUILD_QNN)
} else if (model_type_category == MODEL_TYPE_QNN_LLAMA) {
std::unique_ptr<executorch::extension::Module> module = std::make_unique<
executorch::extension::Module>(
model_path->toStdString().c_str(),
executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
std::string decoder_model = "llama3"; // use llama3 for now
runner_ = std::make_unique<example::Runner<uint16_t>>( // QNN runner
std::move(module),
decoder_model.c_str(),
model_path->toStdString().c_str(),
tokenizer_path->toStdString().c_str(),
data_path->toStdString().c_str(),
"");
model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
#endif
#if defined(EXECUTORCH_BUILD_MEDIATEK)
} else if (model_type_category == MODEL_TYPE_MEDIATEK_LLAMA) {
runner_ = std::make_unique<MTKLlamaRunner>(
@@ -318,6 +340,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
[callback](std::string result) { callback->onResult(result); },
[callback](const llm::Stats& stats) { callback->onStats(stats); }));
}
return static_cast<jint>(executorch::runtime::Error::InvalidArgument);
}

void stop() {