From 0d556523328dd848441ffb140a577ba027ad5b9b Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 4 Sep 2024 14:34:59 -0700 Subject: [PATCH 01/22] Update Android build for MTK 1. MTK requires rtti and exceptions 2. MTK requires Android 26+ --- backends/mediatek/CMakeLists.txt | 1 + build/build_android_llm_demo.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index 4b233d94f04..60c08fe8757 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -25,6 +25,7 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include) # targets add_library(neuron_backend SHARED) +target_compile_options(neuron_backend PRIVATE "-frtti" "-fexceptions") target_link_libraries( neuron_backend PRIVATE executorch_no_prim_ops portable_ops_lib android log ${NEURON_BUFFER_ALLOCATOR_LIB} diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 7b7150de210..389c6d95172 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -23,7 +23,7 @@ build_android_native_library() { cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DANDROID_PLATFORM=android-23 \ + -DANDROID_PLATFORM=android-26 \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ -DEXECUTORCH_BUILD_XNNPACK=ON \ From 4ce04e8685687d847c99b558c18eb39a1a9b78c7 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 4 Sep 2024 14:44:57 -0700 Subject: [PATCH 02/22] Install neuron_backend in executorch-config --- build/executorch-config.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 4376c9e5e77..c40f214133a 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -41,6 +41,7 @@ set(lib_list ${FLATCCRT_LIB} coremldelegate mpsdelegate + neuron_backend qnn_executorch_backend portable_ops_lib extension_module From 902569c21b83322efe100ee4dde061d78ab26edd Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Thu, 29 Aug 2024 14:35:49 -0700 Subject: [PATCH 03/22] Create a MTK Runner to be able to run with a mobile app --- .../llm_helper/include/llama_runner_values.h | 40 +++ .../executor_runner/mtk_llama_runner.cpp | 326 ++++++++++++++++++ 2 files changed, 366 insertions(+) create mode 100644 examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h create mode 100644 examples/mediatek/executor_runner/mtk_llama_runner.cpp diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h new file mode 100644 index 00000000000..b117d44e1f2 --- /dev/null +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h @@ -0,0 +1,40 @@ +#pragma once + +namespace torch::executor { + using llm_helper::LLMType; + + // Sizes + const size_t PROMPT_TOKEN_BATCH_SIZE = 128; + const size_t CACHE_SIZE = 512; + const size_t HIDDEN_SIZE = 4096; + const size_t NUM_HEAD = 32; + const size_t NUM_LAYER = 32; + const size_t MAX_TOKEN_LENGTH = 8192; + const double ROT_EMB_BASE = 500000; + + // Types + const LLMType MODEL_INPUT_TYPE = LLMType::FP32; + const LLMType MODEL_OUTPUT_TYPE = LLMType::FP32; + const LLMType CACHE_TYPE = LLMType::FP32; + const LLMType 
MASK_TYPE = LLMType::FP32; + const LLMType ROT_EMB_TYPE = LLMType::FP32; + + // Paths + const std::string TOKENIZER_PATH="/data/local/tmp/llama3/tokenizer.model" + const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/llama3/embedding_llama3_8b_instruct_fp32.bin" + + // Comma-Separated Paths + const std::string PROMPT_MODEL_PATHS="\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_0.pte,\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_1.pte,\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_2.pte,\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_3.pte," + + // Comma-Separated Paths + const std::string GEN_MODEL_PATHS="\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_0.pte,\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_1.pte,\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_2.pte,\ + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_3.pte," + +} // namespace torch::executor diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp new file mode 100644 index 00000000000..c74ac65ce1c --- /dev/null +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -0,0 +1,326 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 MediaTek Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* Copyright Statement: + * + * This software/firmware and related documentation ("MediaTek Software") are + * protected under relevant copyright laws. The information contained herein + * is confidential and proprietary to MediaTek Inc. and/or its licensors. + * Without the prior written permission of MediaTek inc. and/or its licensors, + * any reproduction, modification, use or disclosure of MediaTek Software, + * and information contained herein, in whole or in part, shall be strictly + * prohibited. + */ +/* MediaTek Inc. (C) 2024. All rights reserved. + * + * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES + * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE") + * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON + * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. + * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE + * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR + * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH + * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY + * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY + * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK + * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO + * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN + * FORUM. 
RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND + * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER + * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT + * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER + * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE. + * + * The following software/firmware and/or related documentation ("MediaTek + * Software") have been modified by MediaTek Inc. All revisions are subject to + * any receiver's applicable license agreements with MediaTek Inc. + */ + +#include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h" +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llama_runner/LlamaConfig.h" +#include "llama_runner/LlamaRuntime.h" +#include "llama_runner/ModelChunk.h" +#include "llama_runner/Utils.h" +#include "llama_runner/llm_helper/include/llm_types.h" +#include "llama_runner/llm_helper/include/llama_runner_values.h" + +#include +#include + +// Global BOS and EOS option for tokenization (encoding) +static constexpr int8_t kAddBos = 1; +static constexpr int8_t kAddEos = 0; + +using namespace torch::executor; +using namespace torch::executor::llm_helper; +using torch::executor::utils::Timer; + +Runner::MTKLlamaRunner( + const std::string& model_path, + const std::string& tokenizer_path, + const float temperature) + : modeloptions_(get_model_options()), + modelpaths_(get_model_paths()) { + ET_LOG( + Info, + "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files."); +} + +Error Runner::load() { + if (is_loaded()) { + return Error::Ok; + } + + // Load tokenizer + ET_LOG(Info, "Loading tokenizer."); + tokenizer_ = load_tokenizer(); + + // Load prompt model + ET_LOG(Info, "Loading prompt model."); + runtime_->Initialize(modeloptions_, modelpaths_); +} + +bool Runner::is_loaded() const { + return tokenizer_ && runtime_; +} + +Error Runner::generate( + const std::string& prompt, + int32_t seq_len, + std::function token_callback, + std::function stats_callback) { + + if (!is_loaded()) { + ET_CHECK_OK_OR_RETURN_ERROR(load()); + } + + // Wrap the token_callback with print function + std::function wrapped_callback = + [token_callback](const std::string& piece) { + util::safe_printf(piece.c_str()); + fflush(stdout); + if (token_callback) { + token_callback(piece); + } + }; + + ET_LOG(Info, "Starting inference."); + inference(runtime_, tokenizer_, prompt, wrapped_callback); +} + +void Runner::stop() { + if (is_loaded()) { + runtime_->Release(); + } else { + ET_LOG(Error, "Llama Runtime is not loaded, cannot stop"); + } +} + +LlamaModelOptions get_model_options() { + LlamaModelOptions options = { + // Sizes + .prompt_token_batch_size = PROMPT_TOKEN_BATCH_SIZE, + .cache_size = CACHE_SIZE, + .hidden_size = HIDDEN_SIZE, + .num_head = NUM_HEAD, + .num_layer = NUM_LAYER, + .max_token_length = MAX_TOKEN_LENGTH, + .rot_emb_base = ROT_EMB_BASE, + + // Types + .model_input_type = MODEL_INPUT_TYPE, + .model_output_type = MODEL_OUTPUT_TYPE, + .cache_type = CACHE_TYPE, + .mask_type = MASK_TYPE, + .rot_emb_type = ROT_EMB_TYPE}; + return options; +} + +LlamaModelPaths get_model_paths() { + LlamaModelPaths model_paths = { + .tokenizer_path = TOKENIZER_PATH, + .token_embedding_path = TOKEN_EMBEDDING_PATH, + .prompt_model_paths = utils::split(PROMPT_MODEL_PATHS, ','), + .gen_model_paths = utils::split(GEN_MODEL_PATHS, ',')}; + 
return model_paths; +} + +Result digest_prompt( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const std::vector input_tokens) { + const auto input_token_count = input_tokens.size(); + const auto prompt_token_batch_size = llama_runtime.GetTokenBatchSize(); + size_t cur_token_index = 0; + + Timer timer_digest_prompt([=](const auto elapsed_sec) { + // Ideal prompt size is a multiple of prompt batch size + const size_t ideal_prompt_size = + std::ceil(float(input_token_count) / prompt_token_batch_size) * + prompt_token_batch_size; + ET_LOG( + Info, + "Done analyzing prompt in %f sec (%f tok/s)", + elapsed_sec, + (float)ideal_prompt_size / elapsed_sec); + }); + + auto getNextTokens = [&]() { + const size_t num_tok_remain = input_token_count - cur_token_index; + const size_t remainder = num_tok_remain % prompt_token_batch_size; + const size_t num_new_tokens = + remainder ? remainder : prompt_token_batch_size; + const auto start = cur_token_index; + const auto end = start + num_new_tokens; + return std::vector( + input_tokens.begin() + start, input_tokens.begin() + end); + }; + + void* logits; + timer_digest_prompt.Start(); + while (cur_token_index < input_token_count) { + const auto next_tokens = getNextTokens(); + ET_LOG( + Debug, + "Digest next tokens (size=%zu), 1st tok=%lu", + next_tokens.size(), + next_tokens[0]); + logits = llama_runtime.Run(next_tokens); + cur_token_index += next_tokens.size(); + } + timer_digest_prompt.End(); + + const auto vocab_size = tokenizer->vocab_size(); + const auto logits_type = llama_runtime.GetModelOptions().model_output_type; + const auto first_output_token = + utils::argmax(logits_type, logits, vocab_size); + return first_output_token; +} + +Error gen_response( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const uint64_t input_token, + std::function token_callback) { + Timer timer_model_swap( + [](const auto elapsed_sec) { ET_LOG(Info, "Model swapped."); }); + + // Swap to gen mode + timer_model_swap.Start(); + llama_runtime.SwapModel(1); + timer_model_swap.End(); + + size_t gen_tok_count = 0; + uint64_t prev_token = input_token; + uint64_t output_token = input_token; + + auto decode_res = tokenizer->decode(prev_token, output_token); + ET_CHECK_OR_RETURN_ERROR( + decode_res.ok(), + InvalidState, + "Tokenizer failed to decode first generated token: %lu", + output_token); + std::string full_response = std::move(decode_res.get()); + std::vector full_response_tokens = {input_token}; + + const auto vocab_size = tokenizer->vocab_size(); + const auto logits_type = llama_runtime.GetModelOptions().model_output_type; + + double gen_total_time_sec = 0; + Timer timer_gen_token( + [&](const auto elapsed_sec) { gen_total_time_sec += elapsed_sec; }); + + // Print first output token + token_callback(full_response); + + while (gen_tok_count++ < model_options_.max_response && + llama_runtime.GetTokenIndex() < model_options_.max_token_length) { + timer_gen_token.Start(); + void* logits = llama_runtime.Run({output_token}); + timer_gen_token.End(); + + prev_token = output_token; + output_token = utils::argmax(logits_type, logits, vocab_size); + full_response_tokens.push_back(output_token); + + // Stop when output is EOS + if (output_token == tokenizer->eos_tok()) { + token_callback(""); + break; + } + auto decode_res = tokenizer->decode(prev_token, output_token); + ET_CHECK_OR_RETURN_ERROR( + decode_res.ok(), + InvalidState, + "Tokenizer failed to decode generated token %lu", + output_token); + const std::string tok_str = 
std::move(decode_res.get()); + full_response += tok_str; + token_callback(tok_str); + } + + std::cout << "\n\n[Generated Tokens]\n" + << utils::to_string(full_response_tokens) << std::endl; + + ET_LOG( + Info, + "Token generation speed: %f tok/s", + gen_tok_count / gen_total_time_sec); + + return Error::Ok; +} + +Error inference( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const std::string& prompt + std::function token_callback) { + // Tokenize input prompt + auto encode_res = tokenizer->encode(prompt, kAddBos, kAddEos); + ET_CHECK_OR_RETURN_ERROR( + encode_res.ok(), InvalidState, "Tokenizer failed to encode prompt"); + const auto input_tokens = std::move(encode_res.get()); + + // Run prompt mode (pre-fill) + auto prefill_res = digest_prompt(llama_runtime, tokenizer, input_tokens); + ET_CHECK_OR_RETURN_ERROR( + prefill_res.ok(), InvalidState, "Failed to digest prompt"); + const auto first_output_token = prefill_res.get(); + + // run generation mode (decoding) + return gen_response(llama_runtime, tokenizer, first_output_token, token_callback); +} + +std::unique_ptr load_tokenizer() { + std::unique_ptr tokenizer; + // Assumes that tokenizer type is Tiktoken + tokenizer = std::make_unique(); + tokenizer->load(modelpaths_.tokenizer_path); + return tokenizer; +} From b21f1f605072d116aeee902b4bd318f2192d46b4 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 4 Sep 2024 14:53:32 -0700 Subject: [PATCH 04/22] TEST ONLY try to build mtk stuff --- build/build_android_llm_demo.sh | 6 ++++-- examples/mediatek/executor_runner/mtk_llama_runner.cpp | 4 ++-- extension/android/CMakeLists.txt | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 389c6d95172..66752092625 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -34,6 +34,8 @@ build_android_native_library() { -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_NEURON=ON \ + -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" @@ -47,7 +49,7 @@ build_android_native_library() { cmake examples/models/llama2 \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="$ANDROID_ABI" \ - -DANDROID_PLATFORM=android-23 \ + -DANDROID_PLATFORM=android-26 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ @@ -61,7 +63,7 @@ build_android_native_library() { cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DANDROID_PLATFORM=android-23 \ + -DANDROID_PLATFORM=android-26 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index c74ac65ce1c..81de2bb8415 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -45,7 +45,7 @@ */ #include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h" -#include +// #include #include #include @@ -59,7 +59,7 @@ #include #include #include -#include +// #include #include #include diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 
6827ae79040..23f5ac631bc 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -41,6 +41,7 @@ list( extension_runner_util extension_threadpool fbjni + neuron_backend ) if(TARGET optimized_native_cpu_ops_lib) @@ -100,7 +101,7 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) target_link_options_shared_lib(quantized_ops_lib) - set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp) + set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/mtk_llama_runner.cpp) add_library(executorch_llama_jni SHARED ${LLAMA_JNI_SRCS}) if(TARGET pthreadpool) target_compile_definitions(executorch_llama_jni PRIVATE ET_USE_THREADPOOL=1) From 72428f583d4b13acc65d223bc0b9b4f36a25e84a Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Thu, 5 Sep 2024 01:16:32 -0700 Subject: [PATCH 05/22] Fix Runner compilation errors --- backends/mediatek/runtime/include/NeuronLog.h | 2 +- .../llm_helper/include/llama_runner_values.h | 8 +-- .../executor_runner/mtk_llama_runner.cpp | 41 +++++------ .../executor_runner/mtk_llama_runner.h | 69 +++++++++++++++++++ 4 files changed, 93 insertions(+), 27 deletions(-) create mode 100644 examples/mediatek/executor_runner/mtk_llama_runner.h diff --git a/backends/mediatek/runtime/include/NeuronLog.h b/backends/mediatek/runtime/include/NeuronLog.h index ccf8b24870d..5367a91ac4e 100644 --- a/backends/mediatek/runtime/include/NeuronLog.h +++ b/backends/mediatek/runtime/include/NeuronLog.h @@ -8,7 +8,7 @@ #pragma once -#include +#include "api/NeuronAdapter.h" #include #include diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h index b117d44e1f2..aeff7254de1 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h @@ -20,21 +20,21 @@ namespace torch::executor { const LLMType ROT_EMB_TYPE = LLMType::FP32; // Paths - const std::string TOKENIZER_PATH="/data/local/tmp/llama3/tokenizer.model" - const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/llama3/embedding_llama3_8b_instruct_fp32.bin" + const std::string TOKENIZER_PATH="/data/local/tmp/llama3/tokenizer.model"; + const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/llama3/embedding_llama3_8b_instruct_fp32.bin"; // Comma-Separated Paths const std::string PROMPT_MODEL_PATHS="\ /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_0.pte,\ /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_1.pte,\ /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_2.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_3.pte," + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_3.pte,"; // Comma-Separated Paths const std::string GEN_MODEL_PATHS="\ /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_0.pte,\ /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_1.pte,\ /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_2.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_3.pte," + /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_3.pte,"; } // namespace torch::executor diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp 
b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index 81de2bb8415..c1c94642ab9 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -45,7 +45,7 @@ */ #include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h" -// #include +#include #include #include @@ -63,16 +63,12 @@ #include #include -#include "llama_runner/LlamaConfig.h" -#include "llama_runner/LlamaRuntime.h" #include "llama_runner/ModelChunk.h" #include "llama_runner/Utils.h" #include "llama_runner/llm_helper/include/llm_types.h" #include "llama_runner/llm_helper/include/llama_runner_values.h" -#include -#include - +static uint64_t MAX_RESPONSE = 50; // Maximum number of tokens to generate. // Global BOS and EOS option for tokenization (encoding) static constexpr int8_t kAddBos = 1; static constexpr int8_t kAddEos = 0; @@ -81,7 +77,7 @@ using namespace torch::executor; using namespace torch::executor::llm_helper; using torch::executor::utils::Timer; -Runner::MTKLlamaRunner( +MTKLlamaRunner::MTKLlamaRunner( const std::string& model_path, const std::string& tokenizer_path, const float temperature) @@ -92,7 +88,7 @@ Runner::MTKLlamaRunner( "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files."); } -Error Runner::load() { +Error MTKLlamaRunner::load() { if (is_loaded()) { return Error::Ok; } @@ -102,15 +98,16 @@ Error Runner::load() { tokenizer_ = load_tokenizer(); // Load prompt model + runtime_ = std::make_unique(); ET_LOG(Info, "Loading prompt model."); runtime_->Initialize(modeloptions_, modelpaths_); } -bool Runner::is_loaded() const { +bool MTKLlamaRunner::is_loaded() const { return tokenizer_ && runtime_; } -Error Runner::generate( +Error MTKLlamaRunner::generate( const std::string& prompt, int32_t seq_len, std::function token_callback, @@ -131,10 +128,10 @@ Error Runner::generate( }; ET_LOG(Info, "Starting inference."); - inference(runtime_, tokenizer_, prompt, wrapped_callback); + inference(*runtime_.get(), tokenizer_, prompt, wrapped_callback); } -void Runner::stop() { +void MTKLlamaRunner::stop() { if (is_loaded()) { runtime_->Release(); } else { @@ -142,7 +139,7 @@ void Runner::stop() { } } -LlamaModelOptions get_model_options() { +LlamaModelOptions MTKLlamaRunner::get_model_options() { LlamaModelOptions options = { // Sizes .prompt_token_batch_size = PROMPT_TOKEN_BATCH_SIZE, @@ -162,7 +159,7 @@ LlamaModelOptions get_model_options() { return options; } -LlamaModelPaths get_model_paths() { +LlamaModelPaths MTKLlamaRunner::get_model_paths() { LlamaModelPaths model_paths = { .tokenizer_path = TOKENIZER_PATH, .token_embedding_path = TOKEN_EMBEDDING_PATH, @@ -171,7 +168,7 @@ LlamaModelPaths get_model_paths() { return model_paths; } -Result digest_prompt( +Result MTKLlamaRunner::digest_prompt( LlamaRuntime& llama_runtime, const std::unique_ptr& tokenizer, const std::vector input_tokens) { @@ -223,7 +220,7 @@ Result digest_prompt( return first_output_token; } -Error gen_response( +Error MTKLlamaRunner::gen_response( LlamaRuntime& llama_runtime, const std::unique_ptr& tokenizer, const uint64_t input_token, @@ -259,8 +256,8 @@ Error gen_response( // Print first output token token_callback(full_response); - while (gen_tok_count++ < model_options_.max_response && - llama_runtime.GetTokenIndex() < model_options_.max_token_length) { + while (gen_tok_count++ < MAX_RESPONSE && + llama_runtime.GetTokenIndex() < modeloptions_.max_token_length) { timer_gen_token.Start(); void* logits = 
llama_runtime.Run({output_token}); timer_gen_token.End(); @@ -296,10 +293,10 @@ Error gen_response( return Error::Ok; } -Error inference( +Error MTKLlamaRunner::inference( LlamaRuntime& llama_runtime, const std::unique_ptr& tokenizer, - const std::string& prompt + const std::string& prompt, std::function token_callback) { // Tokenize input prompt auto encode_res = tokenizer->encode(prompt, kAddBos, kAddEos); @@ -317,10 +314,10 @@ Error inference( return gen_response(llama_runtime, tokenizer, first_output_token, token_callback); } -std::unique_ptr load_tokenizer() { +std::unique_ptr MTKLlamaRunner::load_tokenizer() { std::unique_ptr tokenizer; // Assumes that tokenizer type is Tiktoken - tokenizer = std::make_unique(); + tokenizer = torch::executor::get_tiktoken_for_llama(); tokenizer->load(modelpaths_.tokenizer_path); return tokenizer; } diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h new file mode 100644 index 00000000000..d9f85c20257 --- /dev/null +++ b/examples/mediatek/executor_runner/mtk_llama_runner.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// A simple llama2 runner that includes preprocessing and post processing logic. +// The module takes in a string as input and emits a string as output. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llama_runner/LlamaConfig.h" +#include "llama_runner/LlamaRuntime.h" +using namespace torch::executor; +using Stats = ::executorch::llm::Stats; + +class MTKLlamaRunner { + public: + explicit MTKLlamaRunner( + const std::string& model_path, + const std::string& tokenizer_path, + const float temperature = 0.8f); + + bool is_loaded() const; + Error load(); + Error generate( + const std::string& prompt, + int32_t seq_len = 128, + std::function token_callback = {}, + std::function stats_callback = {}); + void stop(); + + LlamaModelOptions get_model_options(); + LlamaModelPaths get_model_paths(); + Result digest_prompt( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const std::vector input_tokens); + Error gen_response( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const uint64_t input_token, + std::function token_callback); + Error inference( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const std::string& prompt, + std::function token_callback); + std::unique_ptr load_tokenizer(); + + + private: + // model + const torch::executor::LlamaModelOptions modeloptions_; + const torch::executor::LlamaModelPaths modelpaths_; + std::unique_ptr tokenizer_; + std::unique_ptr runtime_; +}; From 626858ee25fb678e34ae161a43f20032e4b49a45 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Thu, 5 Sep 2024 10:56:17 -0700 Subject: [PATCH 06/22] Debug in progress --- build/build_android_llm_demo.sh | 2 ++ extension/android/CMakeLists.txt | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 66752092625..8ca9680d4dd 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -67,6 +67,8 @@ build_android_native_library() { -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ + 
-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ + -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 23f5ac631bc..128c17cf71a 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -101,7 +101,17 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) target_link_options_shared_lib(quantized_ops_lib) - set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/mtk_llama_runner.cpp) + set( + LLAMA_JNI_SRCS jni/jni_layer_llama.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/mtk_llama_runner.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/mask_builder.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/rotary_embedding.cpp + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/token_embedding.cpp + ) add_library(executorch_llama_jni SHARED ${LLAMA_JNI_SRCS}) if(TARGET pthreadpool) target_compile_definitions(executorch_llama_jni PRIVATE ET_USE_THREADPOOL=1) @@ -118,6 +128,8 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) endif() target_include_directories( executorch_llama_jni PRIVATE ${_common_include_directories} + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/ + ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner ) target_link_libraries( executorch_llama_jni @@ -129,6 +141,8 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) eigen_blas quantized_kernels quantized_ops_lib + ${NEURON_BUFFER_ALLOCATOR_LIB} + ) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) # link re2 From 7242dece1947ed8ae2da6d34ccd4d1acc502a24e Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Thu, 5 Sep 2024 11:02:55 -0700 Subject: [PATCH 07/22] find lib --- extension/android/CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 128c17cf71a..891f051d72e 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -141,9 +141,13 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) eigen_blas quantized_kernels quantized_ops_lib - ${NEURON_BUFFER_ALLOCATOR_LIB} - ) + ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED) + FIND_LIBRARY(libneuron_buffer_allocator_LIBRARY libneuron_buffer_allocator PATHS /home/hsz/e3/executorch/libneuron_buffer_allocator.so) + SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION "${libneuron_buffer_allocator_LIBRARY}") + SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_IMPLIB "${libneuron_buffer_allocator_LIBRARY}") + + target_link_libraries(executorch_llama_jni libneuron_buffer_allocator) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) # link re2 set(ABSL_ENABLE_INSTALL ON) From 605c0dde2a67450974d69bfad308ceed33fea403 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Tue, 10 Sep 2024 23:01:43 -0700 Subject: [PATCH 08/22] aar generates but with libneuron_backend.so error 
--- build/build_android_llm_demo.sh | 2 +- extension/android/CMakeLists.txt | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 8ca9680d4dd..8a9be879e88 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -124,7 +124,7 @@ collect_artifacts_to_be_uploaded() { BUILD_AAR_DIR="$(mktemp -d)" export BUILD_AAR_DIR -ANDROID_ABIS=("arm64-v8a" "x86_64") +ANDROID_ABIS=("arm64-v8a") export ANDROID_ABIS ARTIFACTS_DIR_NAME="$1" diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 891f051d72e..cfd747a7228 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -143,9 +143,7 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) quantized_ops_lib ) ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED) - FIND_LIBRARY(libneuron_buffer_allocator_LIBRARY libneuron_buffer_allocator PATHS /home/hsz/e3/executorch/libneuron_buffer_allocator.so) - SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION "${libneuron_buffer_allocator_LIBRARY}") - SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_IMPLIB "${libneuron_buffer_allocator_LIBRARY}") + SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so) target_link_libraries(executorch_llama_jni libneuron_buffer_allocator) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) From 0c2d0418ee68aa809eea36a471ce3a3daa096467 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Tue, 10 Sep 2024 23:03:58 -0700 Subject: [PATCH 09/22] trying to fix libneuron_backend.so error --- extension/android/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index cfd747a7228..0aec0774ecf 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -145,7 +145,10 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED) SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so) - target_link_libraries(executorch_llama_jni libneuron_buffer_allocator) + ADD_LIBRARY(libneuron_backend SHARED IMPORTED) + SET_PROPERTY(TARGET libneuron_backend PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/cmake-out-android-arm64-v8a/backends/mediatek/libneuron_backend.so) + + target_link_libraries(executorch_llama_jni libneuron_buffer_allocator libneuron_backend) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) # link re2 set(ABSL_ENABLE_INSTALL ON) From fda0f6287e08b22ad971fba40514b552da246c42 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Wed, 11 Sep 2024 17:03:28 -0700 Subject: [PATCH 10/22] includes libneuron_backend.so error but issue with neuron_backend --- build/build_android_llm_demo.sh | 5 ++++- extension/android/CMakeLists.txt | 5 +---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 8a9be879e88..7b367366dac 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -78,6 +78,9 @@ build_android_native_library() { # Copy artifacts to ABI specific directory 
mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + + cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ + cp /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ } build_aar() { @@ -91,7 +94,7 @@ build_aar() { find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file zip -r executorch.aar libs jni/*/libexecutorch.so AndroidManifest.xml - zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so AndroidManifest.xml + zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so AndroidManifest.xml popd } diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 0aec0774ecf..cfd747a7228 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -145,10 +145,7 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED) SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so) - ADD_LIBRARY(libneuron_backend SHARED IMPORTED) - SET_PROPERTY(TARGET libneuron_backend PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/cmake-out-android-arm64-v8a/backends/mediatek/libneuron_backend.so) - - target_link_libraries(executorch_llama_jni libneuron_buffer_allocator libneuron_backend) + target_link_libraries(executorch_llama_jni libneuron_buffer_allocator) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) # link re2 set(ABSL_ENABLE_INSTALL ON) From 8cab38e059f4942ea823337b2d9152a5016d7578 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Wed, 11 Sep 2024 18:27:14 -0700 Subject: [PATCH 11/22] resolves neuron_backend issue but now issue on .so or properly linking mtk runner --- backends/mediatek/CMakeLists.txt | 2 +- build/build_android_llm_demo.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index 60c08fe8757..289944fbeba 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -27,7 +27,7 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include) add_library(neuron_backend SHARED) target_compile_options(neuron_backend PRIVATE "-frtti" "-fexceptions") target_link_libraries( - neuron_backend PRIVATE executorch_no_prim_ops portable_ops_lib android log + neuron_backend PRIVATE executorch_no_prim_ops android log ${NEURON_BUFFER_ALLOCATOR_LIB} ) target_sources( diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 7b367366dac..b3fe6d4cb1c 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -81,6 +81,7 @@ build_android_native_library() { cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ cp /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ + cp /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuronusdk_adapter.mtk.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ } build_aar() { @@ -94,7 +95,7 @@ build_aar() { find jni -type f -name "libexecutorch_jni.so" 
-exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file zip -r executorch.aar libs jni/*/libexecutorch.so AndroidManifest.xml - zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so AndroidManifest.xml + zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so jni/*/libneuronusdk_adapter.mtk.so AndroidManifest.xml popd } From e5166ac1635cddd952315eb15a75eac5ab1bbc7f Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 11 Sep 2024 22:30:47 -0700 Subject: [PATCH 12/22] hack route to mtk runner --- extension/android/jni/jni_layer_llama.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index e6a9b5de58c..4b1284a7842 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -68,6 +69,7 @@ class ExecuTorchLlamaJni int model_type_category_; std::unique_ptr runner_; std::unique_ptr multi_modal_runner_; + std::unique_ptr mtk_llama_runner_; public: constexpr static auto kJavaDescriptor = @@ -75,6 +77,7 @@ class ExecuTorchLlamaJni constexpr static int MODEL_TYPE_CATEGORY_LLM = 1; constexpr static int MODEL_TYPE_CATEGORY_MULTIMODAL = 2; + constexpr static int MODEL_TYPE_MEDIATEK_LLAMA = 3; static facebook::jni::local_ref initHybrid( facebook::jni::alias_ref, @@ -113,7 +116,11 @@ class ExecuTorchLlamaJni model_path->toStdString().c_str(), tokenizer_path->toStdString().c_str(), temperature); - } + } else if (model_type_category == MODEL_TYPE_MEDIATEK_LLAMA) { + mtk_llama_runner_ = std::make_unique( + model_path->toStdString().c_str(), + tokenizer_path->toStdString().c_str(), + temperature); } jint generate( @@ -152,6 +159,13 @@ class ExecuTorchLlamaJni [callback](std::string result) { callback->onResult(result); }, [callback](const Stats& result) { callback->onStats(result); }, echo); + } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) { + mtk_llama_runner_->generate( + prompt->toStdString(), + seq_len, + [callback](std::string result) { callback->onResult(result); }, + [callback](const Stats& result) { callback->onStats(result); }, + echo); } return 0; } @@ -243,6 +257,8 @@ class ExecuTorchLlamaJni multi_modal_runner_->stop(); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { runner_->stop(); + } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) { + mtk_llama_runner_->stop(); } } @@ -251,6 +267,8 @@ class ExecuTorchLlamaJni return static_cast(multi_modal_runner_->load()); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { return static_cast(runner_->load()); + } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) { + return static_cast(mtk_llama_runner_->load()); } return static_cast(Error::InvalidArgument); } From 22d9b110943765e7e025980476947accae48ec0a Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 11 Sep 2024 22:31:55 -0700 Subject: [PATCH 13/22] Make 1 until cmodi changes java side lol --- extension/android/jni/jni_layer_llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 4b1284a7842..6204eca3ceb 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp 
@@ -77,7 +77,7 @@ class ExecuTorchLlamaJni constexpr static int MODEL_TYPE_CATEGORY_LLM = 1; constexpr static int MODEL_TYPE_CATEGORY_MULTIMODAL = 2; - constexpr static int MODEL_TYPE_MEDIATEK_LLAMA = 3; + constexpr static int MODEL_TYPE_MEDIATEK_LLAMA = 1 /* 3 */; static facebook::jni::local_ref initHybrid( facebook::jni::alias_ref, From 6107dc745336b0a07082d90ec9b57d779bcad8ba Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Wed, 11 Sep 2024 23:08:06 -0700 Subject: [PATCH 14/22] compilation fixes --- extension/android/jni/jni_layer_llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 6204eca3ceb..b33139a2743 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -121,6 +121,7 @@ class ExecuTorchLlamaJni model_path->toStdString().c_str(), tokenizer_path->toStdString().c_str(), temperature); + } } jint generate( @@ -164,8 +165,7 @@ class ExecuTorchLlamaJni prompt->toStdString(), seq_len, [callback](std::string result) { callback->onResult(result); }, - [callback](const Stats& result) { callback->onStats(result); }, - echo); + [callback](const Stats& result) { callback->onStats(result); }); } return 0; } From 1aa2ccdd44038d81d854344f732522a4f21d6c3e Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Thu, 12 Sep 2024 12:07:43 -0700 Subject: [PATCH 15/22] resolved no op issue and bug in modelType. Issue with loading models --- backends/mediatek/CMakeLists.txt | 1 - .../llm_helper/include/llama_runner_values.h | 16 ++++------------ .../executor_runner/mtk_llama_runner.cpp | 4 ++++ extension/android/jni/jni_layer_llama.cpp | 2 +- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index a444a6ff8ab..9a9c82d90a9 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -29,7 +29,6 @@ target_compile_options(neuron_backend PRIVATE "-frtti" "-fexceptions") target_link_libraries(neuron_backend PRIVATE executorch_no_prim_ops - portable_ops_lib android log ${NEURON_BUFFER_ALLOCATOR_LIB} diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h index aeff7254de1..1f388b2acdc 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h @@ -20,21 +20,13 @@ namespace torch::executor { const LLMType ROT_EMB_TYPE = LLMType::FP32; // Paths - const std::string TOKENIZER_PATH="/data/local/tmp/llama3/tokenizer.model"; - const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/llama3/embedding_llama3_8b_instruct_fp32.bin"; + const std::string TOKENIZER_PATH="/data/local/tmp/et-mtk/llama3/tokenizer.model"; + const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/et-mtk/llama3/embedding_llama3_8b_instruct_fp32.bin"; // Comma-Separated Paths - const std::string PROMPT_MODEL_PATHS="\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_0.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_1.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_2.pte,\ - 
/data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_128t512c_3.pte,"; + const std::string PROMPT_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_3.pte,"; // Comma-Separated Paths - const std::string GEN_MODEL_PATHS="\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_0.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_1.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_2.pte,\ - /data/local/tmp/llama3/llama3_8b_SC_sym4W_sym16A_4_chunks_Overall_1t512c_3.pte,"; + const std::string GEN_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,"; } // namespace torch::executor diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index c1c94642ab9..242dc18101d 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -96,11 +96,13 @@ Error MTKLlamaRunner::load() { // Load tokenizer ET_LOG(Info, "Loading tokenizer."); tokenizer_ = load_tokenizer(); + ET_LOG(Info, "Complete loading tokenizer."); // Load prompt model runtime_ = std::make_unique(); ET_LOG(Info, "Loading prompt model."); runtime_->Initialize(modeloptions_, modelpaths_); + ET_LOG(Info, "Complete loading prompt model."); } bool MTKLlamaRunner::is_loaded() const { @@ -156,6 +158,7 @@ LlamaModelOptions MTKLlamaRunner::get_model_options() { .cache_type = CACHE_TYPE, .mask_type = MASK_TYPE, .rot_emb_type = ROT_EMB_TYPE}; + ET_LOG(Info, "Completed get_model_options"); return options; } @@ -165,6 +168,7 @@ LlamaModelPaths MTKLlamaRunner::get_model_paths() { .token_embedding_path = TOKEN_EMBEDDING_PATH, .prompt_model_paths = utils::split(PROMPT_MODEL_PATHS, ','), .gen_model_paths = utils::split(GEN_MODEL_PATHS, ',')}; + ET_LOG(Info, "Completed get_model_paths"); return model_paths; } diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index b33139a2743..50476df5690 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -75,7 +75,7 @@ class ExecuTorchLlamaJni constexpr static auto kJavaDescriptor = "Lorg/pytorch/executorch/LlamaModule;"; - constexpr static int MODEL_TYPE_CATEGORY_LLM = 1; + constexpr static int MODEL_TYPE_CATEGORY_LLM = 3 /* should be put back to 1*/; constexpr static int MODEL_TYPE_CATEGORY_MULTIMODAL = 2; constexpr static int MODEL_TYPE_MEDIATEK_LLAMA = 1 /* 3 */; From 6fed4cbe47eeef0e0e0f0d7d5ab68fba55ddaff3 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Thu, 12 Sep 2024 12:34:14 -0700 Subject: [PATCH 16/22] *MILESTONE* debugging logs for model loading in aar --- .../llama_runner/LlamaRuntime.cpp | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp 
index 0d2d5ccd59c..88ac44fc41d 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp +++ b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp @@ -28,14 +28,21 @@ void LlamaRuntime::Initialize( const size_t numCache = 2 * modelOptions.num_layer / numChunk; ET_CHECK_MSG(numChunk > 0, "No model to initialize"); + ET_LOG(Info, "cmodiii 1"); + ET_LOG(Info, "cmodiii numChunk = %zu", numChunk); + ET_LOG(Info, "cmodiii numCache = %zu", numCache); + // Initialize rotary embedding master lookup table const size_t rotEmbDim = modelOptions.hidden_size / modelOptions.num_head; + ET_LOG(Info, "cmodiii 2"); mRotEmbMasterLut = std::make_unique( modelOptions.rot_emb_type, modelOptions.max_token_length, rotEmbDim, modelOptions.rot_emb_base); + ET_LOG(Info, "cmodiii 3"); mRotEmbMasterLut->generate(); + ET_LOG(Info, "cmodiii 4"); constexpr size_t numRotEmbInputs = 1; const bool usePromptModel = !modelPaths.prompt_model_paths.empty(); @@ -50,8 +57,10 @@ void LlamaRuntime::Initialize( return; modelPathMap[batchSize] = modelPaths[chunkIdx]; }; + ET_LOG(Info, "cmodiii 5"); addModelPath( modelPaths.prompt_model_paths, modelOptions.prompt_token_batch_size); + ET_LOG(Info, "cmodiii 6"); addModelPath(modelPaths.gen_model_paths, 1); auto llamaChunk = std::make_unique( modelPathMap, @@ -60,18 +69,25 @@ void LlamaRuntime::Initialize( numCache, numRotEmbInputs, mRotEmbMasterLut.get()); + ET_LOG(Info, "cmodiii 7"); mLlamaModelChunks.push_back(std::move(llamaChunk)); + ET_LOG(Info, "cmodiii 8"); } for (size_t i = 0; i < numChunk; i++) { auto& modelChunk = mLlamaModelChunks[i]; + ET_LOG(Info, "cmodiii 9"); if (i > 0) { const auto& prevModelChunk = mLlamaModelChunks[i - 1]; + ET_LOG(Info, "cmodiii 9A"); modelChunk->SetInputBuffer(prevModelChunk->GetOutputBuffer()); + ET_LOG(Info, "cmodiii 10"); } modelChunk->Initialize(); + ET_LOG(Info, "cmodiii 11"); // modelChunk->LogIoSummary(); } + ET_LOG(Info, "cmodiii 12"); // NOTE: Token embedding type here is assumed to follow the model input // embedding type. 
@@ -80,9 +96,13 @@ void LlamaRuntime::Initialize( modelOptions.model_input_type, modelOptions.hidden_size); + ET_LOG(Info, "cmodiii 13"); + // Link first chunk emb input to token emb lut output const auto& tokenEmbInput = mLlamaModelChunks.front()->GetInputBuffer(); + ET_LOG(Info, "cmodiii 14"); mTokenEmbLut->setOutput(tokenEmbInput.data, tokenEmbInput.nbytes); + ET_LOG(Info, "cmodiii 15"); } void LlamaRuntime::Release() { @@ -201,4 +221,4 @@ const LlamaModelOptions& LlamaRuntime::GetModelOptions() const { return mModelOptions; } -} // namespace torch::executor \ No newline at end of file +} // namespace torch::executor From 8a2d9c14f8209670796e666d3bad57af8836b88c Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Fri, 13 Sep 2024 10:08:34 -0700 Subject: [PATCH 17/22] debugging with moving some to shell --- .../llama_runner/LlamaRuntime.cpp | 18 +++++++++++++ .../llama_runner/ModelChunk.cpp | 6 +++++ .../mtk_llama_executor_runner.cpp | 26 +++++++++++-------- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp index 88ac44fc41d..8a12ce90ecb 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp +++ b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp @@ -70,12 +70,30 @@ void LlamaRuntime::Initialize( numRotEmbInputs, mRotEmbMasterLut.get()); ET_LOG(Info, "cmodiii 7"); + if(llamaChunk.get() == nullptr) { + ET_LOG(Info, "cmodiii llamaChunk is null"); + } else { + ET_LOG(Info, "cmodiii llamaChunk is not null"); + } + mLlamaModelChunks.push_back(std::move(llamaChunk)); + + if(mLlamaModelChunks.empty()) { + ET_LOG(Info, "cmodiii mLlamaModelChunks is empty"); + } else { + ET_LOG(Info, "cmodiii mLlamaModelChunks is not empty"); + } + ET_LOG(Info, "cmodiii 8"); } for (size_t i = 0; i < numChunk; i++) { auto& modelChunk = mLlamaModelChunks[i]; + if(modelChunk.get() == nullptr) { + ET_LOG(Info, "cmodiii modelChunk is null"); + } else { + ET_LOG(Info, "cmodiii modelChunk is not null"); + } ET_LOG(Info, "cmodiii 9"); if (i > 0) { const auto& prevModelChunk = mLlamaModelChunks[i - 1]; diff --git a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp index b09e2c58767..a7e858dfa93 100644 --- a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp +++ b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp @@ -46,11 +46,17 @@ struct ModelInstance { }; void ModelChunk::Initialize() { + ET_LOG(Info, "cmodiii in ModuleChunk::Initialize"); LoadModels(); + ET_LOG(Info, "cmodiii after LoadModels"); GetModelIoInfo(); + ET_LOG(Info, "cmodiii after GetModelIoInfo"); AllocateIoBuffers(); + ET_LOG(Info, "cmodiii after AllocateIoBuffers"); SetBackendInputs(); + ET_LOG(Info, "cmodiii after SetBackendInputs"); SetBackendOutputs(); + ET_LOG(Info, "cmodiii after SetBackendOutputs"); mIsInitialized = true; } diff --git a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp index 370695cb773..1193e2b1830 100644 --- a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp @@ -147,11 +147,11 @@ LlamaModelOptions get_model_options() { .rot_emb_base = FLAGS_rot_emb_base, // Types - .model_input_type = getLLMTypeFromName(FLAGS_input_type.c_str()), - .model_output_type = 
getLLMTypeFromName(FLAGS_output_type.c_str()), - .cache_type = getLLMTypeFromName(FLAGS_cache_type.c_str()), - .mask_type = getLLMTypeFromName(FLAGS_mask_type.c_str()), - .rot_emb_type = getLLMTypeFromName(FLAGS_rot_emb_type.c_str())}; + .model_input_type = LLMType::FP32, + .model_output_type = LLMType::FP32, + .cache_type = LLMType::FP32, + .mask_type = LLMType::FP32, + .rot_emb_type = LLMType::FP32}; return options; } @@ -159,8 +159,8 @@ LlamaModelPaths get_model_paths() { LlamaModelPaths model_paths = { .tokenizer_path = FLAGS_tokenizer_path, .token_embedding_path = FLAGS_token_embedding_path, - .prompt_model_paths = utils::split(FLAGS_prompt_model_paths, ','), - .gen_model_paths = utils::split(FLAGS_gen_model_paths, ',')}; + .prompt_model_paths = utils::split("/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_3.pte,", ','), + .gen_model_paths = utils::split("/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,", ',')}; return model_paths; } @@ -353,23 +353,27 @@ int main(int argc, char** argv) { Timer timer_release( [](const auto elapsed_sec) { ET_LOG(Info, "Model released."); }); - LlamaRuntime llama_runtime; + //LlamaRuntime llama_runtime; + std::unique_ptr llama_runtime = std::make_unique(); // Initialize model ET_LOG(Info, "Begin model loading."); timer_init.Start(); const auto tokenizer = load_tokenizer(); - llama_runtime.Initialize(model_options, model_paths); + //llama_runtime.Initialize(model_options, model_paths); + llama_runtime->Initialize(model_options, model_paths); timer_init.End(); // Run model ET_CHECK_MSG(!FLAGS_prompt_file.empty(), "No prompt file provided."); std::string prompt = utils::read_file(FLAGS_prompt_file); - inference(llama_runtime, tokenizer, prompt); + //inference(llama_runtime, tokenizer, prompt); + inference(*llama_runtime.get(), tokenizer, prompt); // Release model timer_release.Start(); - llama_runtime.Release(); + //llama_runtime.Release(); + llama_runtime->Release(); timer_release.End(); return 0; From 72aa142cc2ed6fc69228f58ad1e390e331735952 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:45:36 -0700 Subject: [PATCH 18/22] many debug prints --- .../llama_runner/LlamaModelChunk.cpp | 10 ++++++++++ .../llama_runner/MultiModelLoader.cpp | 16 +++++++++++++++- .../executor_runner/mtk_llama_runner.cpp | 3 ++- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp index c2d75fd30ec..1757c63fe21 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp +++ b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp @@ -73,16 +73,26 @@ size_t LlamaModelChunk::GetExpectedOutputCount() const { } void LlamaModelChunk::Initialize() { + ET_LOG(Info, "cmodiii in LlamaModelChunk::Initialize"); LoadModels(); + ET_LOG(Info, "cmodiii after LoadModels"); GetModelIoInfo(); + ET_LOG(Info, "cmodiii after GetModelIoInfo"); 
CheckIoCount(); + ET_LOG(Info, "cmodiii after CheckIoCount"); PrepareCacheIOs(); + ET_LOG(Info, "cmodiii after PrepareCacheIOs"); AllocateIoBuffers(); + ET_LOG(Info, "cmodiii after AllocateIoBuffers"); InitMaskBuilder(); + ET_LOG(Info, "cmodiii after InitMaskBuilder"); InitCache(); + ET_LOG(Info, "cmodiii after InitCache"); SetBackendInputs(); + ET_LOG(Info, "cmodiii after SetBackendInputs"); SetBackendOutputs(); + ET_LOG(Info, "cmodiii after SetBackendOutputs"); mIsInitialized = true; } diff --git a/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp b/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp index e0479110a7c..e20eac3b248 100644 --- a/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp +++ b/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp @@ -21,20 +21,28 @@ namespace torch::executor { template void MultiModelLoader::LoadModels() { // Init empty model instance map + ET_LOG(Info, "cmodi LoadModels() 1"); for (const auto& [id, _] : mModelPathMap) { ET_CHECK_MSG( !HasModel(id), "Model is already initialized before calling LoadModels."); + ET_LOG(Info, "cmodi LoadModels() 2"); mModelInstanceMap[id] = nullptr; } const size_t numModels = mModelPathMap.size(); + ET_LOG(Info, "cmodi LoadModels() 3"); if (!AllowModelsCoexist()) { + ET_LOG(Info, "cmodi LoadModels() 4"); SelectModel(mDefaultModelId); + ET_LOG(Info, "cmodi LoadModels() 5"); ET_CHECK_MSG( GetModelInstance() == nullptr, "Model is already initialized before calling LoadModels."); + ET_LOG(Info, "cmodi LoadModels() 6"); void* instance = CreateModelInstance(mModelPathMap[mDefaultModelId]); + ET_LOG(Info, "cmodi LoadModels() 7"); SetModelInstance(instance); + ET_LOG(Info, "cmodi LoadModels() 8"); ET_LOG( Debug, "LoadModels(): Loaded single exclusive model (Total=%zu)", @@ -42,14 +50,20 @@ void MultiModelLoader::LoadModels() { return; } for (const auto& [id, modelPath] : mModelPathMap) { + ET_LOG(Info, "cmodi LoadModels() 9"); SelectModel(id); + ET_LOG(Info, "cmodi LoadModels() 10"); ET_CHECK_MSG( GetModelInstance() == nullptr, "Model is already initialized before calling LoadModels."); + ET_LOG(Info, "cmodi LoadModels() 11"); void* instance = CreateModelInstance(modelPath); + ET_LOG(Info, "cmodi LoadModels() 12"); SetModelInstance(instance); + ET_LOG(Info, "cmodi LoadModels() 13"); } SelectModel(mDefaultModelId); // Select the default instance + ET_LOG(Info, "cmodi LoadModels() 14"); ET_LOG(Debug, "LoadModels(): Loaded multiple models (Total=%zu)", numModels); } @@ -174,4 +188,4 @@ std::string MultiModelLoader::GetIdString(const IdType& id) { template class MultiModelLoader; template class MultiModelLoader; -} // namespace torch::executor \ No newline at end of file +} // namespace torch::executor diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index 242dc18101d..72296de45ac 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -83,9 +83,10 @@ MTKLlamaRunner::MTKLlamaRunner( const float temperature) : modeloptions_(get_model_options()), modelpaths_(get_model_paths()) { + runtime_init(); ET_LOG( Info, - "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files."); + "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files. 
Initiated runtime_init()."); } Error MTKLlamaRunner::load() { From 91e4d6a843d0f52f874df66e6cda049c96e113a4 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Mon, 16 Sep 2024 20:40:04 -0700 Subject: [PATCH 19/22] logs and app changes for debug --- build/build_android_llm_demo.sh | 8 ++++---- .../executorchllamademo/MainActivity.java | 20 ++++++++++--------- .../executorchllamademo/SettingsActivity.java | 6 ++++-- .../llama_runner/ModelChunk.cpp | 17 ++++++++++++++++ 4 files changed, 36 insertions(+), 15 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 8047c947067..d8de4cfd94e 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -46,7 +46,7 @@ build_android_native_library() { -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \ -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -B"${CMAKE_OUT}" if [ "$(uname)" == "Darwin" ]; then @@ -54,7 +54,7 @@ build_android_native_library() { else CMAKE_JOBS=$(( $(nproc) - 1 )) fi - cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release + cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config RelWithDebInfo cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ @@ -67,10 +67,10 @@ build_android_native_library() { -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -B"${CMAKE_OUT}"/extension/android - cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Release + cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config RelWithDebInfo # Copy artifacts to ABI specific directory mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index f5e50845eca..a08bf87a079 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -156,11 +156,11 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera + " sec." 
+ " You can send text or image for inference"; - if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + /*if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { ETLogging.getInstance().log("Llava start prefill prompt"); startPos = mModule.prefillPrompt(PromptFormat.getLlavaPresetPrompt(), 0, 1, 0); ETLogging.getInstance().log("Llava completes prefill prompt"); - } + }*/ } Message modelLoadedMessage = new Message(modelInfo, false, MessageType.SYSTEM, 0); @@ -226,6 +226,9 @@ protected void onCreate(Bundle savedInstanceState) { try { Os.setenv("ADSP_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); + Os.setenv("LD_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); + ETLogging.getInstance().log("cmodiiiii ADSP_LIBRARY_PATH is: " + Os.getenv("ADSP_LIBRARY_PATH")); + ETLogging.getInstance().log("cmodiiiii LD_LIBRARY_PATH is: " + Os.getenv("LD_LIBRARY_PATH")); } catch (ErrnoException e) { finish(); } @@ -566,7 +569,7 @@ private void showMediaPreview(List uris) { // For LLava, we want to call prefill_image as soon as an image is selected // Llava only support 1 image for now - if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { +/* if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { List processedImageList = getProcessedImagesForModel(mSelectedImageUri); if (!processedImageList.isEmpty()) { mMessageAdapter.add( @@ -588,7 +591,7 @@ private void showMediaPreview(List uris) { }; executor.execute(runnable); } - } + }*/ } private void addSelectedImagesToChatThread(List selectedImageUri) { @@ -689,7 +692,7 @@ public void run() { } }); long generateStartTime = System.currentTimeMillis(); - if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) + /* if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) == ModelUtils.VISION_MODEL) { mModule.generateFromPos( mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt), @@ -697,16 +700,15 @@ public void run() { startPos, MainActivity.this, false); - } else { + } else {*/ String finalPrompt = getTotalFormattedPrompt(getConversationHistory(), rawPrompt); ETLogging.getInstance().log("Running inference.. 
prompt=" + finalPrompt); mModule.generate( finalPrompt, (int) (finalPrompt.length() * 0.75) + 64, - MainActivity.this, - false); - } + MainActivity.this); + //} long generateDuration = System.currentTimeMillis() - generateStartTime; mResultMessage.setTotalGenerationTime(generateDuration); diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java index 773fef19dd7..9d7d2f4ec2a 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java @@ -46,6 +46,8 @@ public class SettingsActivity extends AppCompatActivity { private DemoSharedPreferences mDemoSharedPreferences; public static double TEMPERATURE_MIN_VALUE = 0.0; + public static String MODEL_PATH="/data/local/tmp/et-mtk/llama3"; + //public static String MODEL_PATH="/data/local/tmp/llama"; @Override protected void onCreate(Bundle savedInstanceState) { @@ -286,7 +288,7 @@ private void showInvalidPromptDialog() { } private void setupModelSelectorDialog() { - String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte"); + String[] pteFiles = listLocalFile(MODEL_PATH, ".pte"); AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); modelPathBuilder.setTitle("Select model path"); @@ -342,7 +344,7 @@ private void setupModelTypeSelectorDialog() { } private void setupTokenizerSelectorDialog() { - String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin"); + String[] binFiles = listLocalFile(MODEL_PATH, ".bin"); String[] tokenizerFiles = new String[binFiles.length]; System.arraycopy(binFiles, 0, tokenizerFiles, 0, binFiles.length); AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); diff --git a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp index a7e858dfa93..2c7e236968d 100644 --- a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp +++ b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp @@ -486,18 +486,22 @@ Method& ModelChunk::GetModelMethod() { // Override the virtual functions void* ModelChunk::CreateModelInstance(const std::string& modelPath) { + ET_LOG(Info, "cmodi in CreateModelInstance"); auto modelInstance = new ModelInstance; + ET_LOG(Info, "cmodi 100"); // Create a loader to get the data of the program file. There are other // DataLoaders that use mmap() or point to data that's already in memory, and // users can create their own DataLoaders to load from arbitrary sources. Result loader = FileDataLoader::from(modelPath.c_str()); + ET_LOG(Info, "cmodi 101"); ET_CHECK_MSG( loader.ok(), "FileDataLoader::from() failed: 0x%" PRIx32, loader.error()); // Parse the program file. This is immutable, and can also be reused between // multiple execution invocations across multiple threads. Result program_loaded = Program::load(&loader.get()); + ET_LOG(Info, "cmodi 102"); if (!program_loaded.ok()) { ET_LOG(Error, "Failed to parse model file %s", modelPath.c_str()); return nullptr; @@ -508,12 +512,15 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) { // methods. 
modelInstance->program = std::make_unique(std::move(program_loaded.get())); + ET_LOG(Info, "cmodi 103"); auto& program = modelInstance->program; + ET_LOG(Info, "cmodi 104"); // Use the first method in the program. const char* method_name = nullptr; { const auto method_name_result = program->get_method_name(0); + ET_LOG(Info, "cmodi 105"); ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); method_name = *method_name_result; } @@ -530,12 +537,15 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) { modelInstance->method_allocator_pool.resize(kMethodAllocatorPoolSize); modelInstance->method_allocator = std::make_unique( kMethodAllocatorPoolSize, modelInstance->method_allocator_pool.data()); + ET_LOG(Info, "cmodi 106"); auto& method_allocator = modelInstance->method_allocator; method_allocator->enable_profiling("method allocator"); auto& planned_buffers = modelInstance->planned_buffers; // Owns the memory auto& planned_spans = modelInstance->planned_spans; // Passed to the allocator + ET_LOG(Info, "cmodi 107"); + size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); for (size_t id = 0; id < num_memory_planned_buffers; ++id) { // .get() will always succeed because id < num_memory_planned_buffers. @@ -545,22 +555,28 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) { planned_buffers.push_back(std::make_unique(buffer_size)); planned_spans.push_back({planned_buffers.back().get(), buffer_size}); } + ET_LOG(Info, "cmodi 108"); modelInstance->planned_memory = std::make_unique( Span>{planned_spans.data(), planned_spans.size()}); auto& planned_memory = modelInstance->planned_memory; + ET_LOG(Info, "cmodi 109"); // Assemble all of the allocators into the MemoryManager that the Executor // will use. 
auto& neuron_allocator = GET_NEURON_ALLOCATOR; + ET_LOG(Info, "cmodi 110"); modelInstance->memory_manager = std::make_unique( method_allocator.get(), planned_memory.get(), dynamic_cast(&neuron_allocator)); + ET_LOG(Info, "cmodi 111"); auto& memory_manager = modelInstance->memory_manager; + ET_LOG(Info, "cmodi 112"); ET_LOG(Debug, "Begin loading method %s", method_name); Result method = program->load_method(method_name, memory_manager.get()); + ET_LOG(Info, "cmodi 113"); ET_CHECK_MSG( method.ok(), "Loading of method %s failed with status 0x%" PRIx32, @@ -569,6 +585,7 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) { ET_LOG(Debug, "Method loaded."); modelInstance->method = std::make_unique(std::move(method.get())); + ET_LOG(Info, "cmodi 114"); return modelInstance; } From aa98fb12343cd4592c5f5ed2a91b0d927d66e2f3 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Mon, 16 Sep 2024 22:23:21 -0700 Subject: [PATCH 20/22] so adds in androidmanifest and embedding file name correction --- .../android/LlamaDemo/app/src/main/AndroidManifest.xml | 8 ++++++++ .../llama_runner/llm_helper/include/llama_runner_values.h | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml index 02d8503a4df..e6ff8e95b87 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml @@ -36,6 +36,14 @@ android:name="libcdsprpc.so" android:required="false" /> + + + + Date: Fri, 27 Sep 2024 14:09:23 -0700 Subject: [PATCH 21/22] .so adds in AndroidManifest and fix type for embedding file name --- .../android/LlamaDemo/app/build.gradle.kts | 2 +- .../app/src/main/AndroidManifest.xml | 20 +++++++++++++++++-- .../llm_helper/include/llama_runner_values.h | 2 +- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts index 37c8cbf0ba2..039ea9b047f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts @@ -57,7 +57,7 @@ dependencies { implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") implementation("com.facebook.fbjni:fbjni:0.5.1") implementation("com.google.code.gson:gson:2.8.6") - implementation(files("libs/executorch-llama.aar")) + implementation(files("libs/executorch-llama-mtk29.aar")) implementation("com.google.android.material:material:1.12.0") implementation("androidx.activity:activity:1.9.0") testImplementation("junit:junit:4.13.2") diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml index e6ff8e95b87..799ce50992f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml @@ -37,11 +37,27 @@ android:required="false" /> + + + + + + + + Date: Fri, 27 Sep 2024 15:59:38 -0700 Subject: [PATCH 22/22] add Error returns to runner. Baseline working flow. 
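
load() and generate() are declared to return Error but previously reached the
end of the function without a return statement, which is undefined behavior
for a non-void function, so callers could observe an indeterminate status.
Both now return Error::Ok once model loading and inference complete.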
--- examples/demo-apps/android/LlamaDemo/app/build.gradle.kts | 2 +- .../java/com/example/executorchllamademo/MainActivity.java | 1 + examples/mediatek/executor_runner/mtk_llama_runner.cpp | 7 ++++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts index 039ea9b047f..db4ea8f74c6 100644 --- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts @@ -57,7 +57,7 @@ dependencies { implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") implementation("com.facebook.fbjni:fbjni:0.5.1") implementation("com.google.code.gson:gson:2.8.6") - implementation(files("libs/executorch-llama-mtk29.aar")) + implementation(files("libs/executorch-llama-mtk31.aar")) implementation("com.google.android.material:material:1.12.0") implementation("androidx.activity:activity:1.9.0") testImplementation("junit:junit:4.13.2") diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index a08bf87a079..fbd6948880f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -126,6 +126,7 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera modelPath, tokenizerPath, temperature); + ETLogging.getInstance().log("ModelType is: " + mCurrentSettingsFields.getModelType()); int loadResult = mModule.load(); long loadDuration = System.currentTimeMillis() - runStartTime; String modelLoadError = ""; diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index 72296de45ac..dbb5b79b42c 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -104,6 +104,8 @@ Error MTKLlamaRunner::load() { ET_LOG(Info, "Loading prompt model."); runtime_->Initialize(modeloptions_, modelpaths_); ET_LOG(Info, "Complete loading prompt model."); + + return Error::Ok; } bool MTKLlamaRunner::is_loaded() const { @@ -130,8 +132,11 @@ Error MTKLlamaRunner::generate( } }; - ET_LOG(Info, "Starting inference."); + ET_LOG(Info, "Starting inference from MTKLlamaRunner"); inference(*runtime_.get(), tokenizer_, prompt, wrapped_callback); + ET_LOG(Info, "Completed inference from MTKLlamaRunner"); + + return Error::Ok; } void MTKLlamaRunner::stop() {
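
With load() and generate() now returning Error::Ok, a caller can check the
result of each step. The sketch below is a minimal, illustrative harness
only: the constructor arguments and the generate() parameter list are
assumptions modeled on the common ExecuTorch llama runner interface (the MTK
runner in this series actually self-loads the paths baked into
llama_runner_values.h), so treat the names and paths as placeholders rather
than the final API.

    // Illustrative sketch only: the constructor and generate() signatures are
    // assumed; the real runner self-loads its paths from llama_runner_values.h.
    #include <cstdio>
    #include <string>

    #include <executorch/runtime/core/error.h>
    #include "mtk_llama_runner.h"  // assumed header name for the runner

    using ::torch::executor::Error;

    int main() {
      // Placeholder arguments; the temperature mirrors the float parameter
      // seen in the runner constructor earlier in the series.
      MTKLlamaRunner runner(
          "/data/local/tmp/llama3/model.pte",
          "/data/local/tmp/llama3/tokenizer.model",
          /*temperature=*/0.0f);

      // Since the last patch, load() reports success explicitly.
      if (runner.load() != Error::Ok) {
        std::fprintf(stderr, "MTK runner failed to load\n");
        return 1;
      }

      // Assumed generate(prompt, seq_len, token_callback) shape; the runner
      // wraps the callback before invoking inference() internally.
      const Error err = runner.generate(
          "What is the capital of France?",
          /*seq_len=*/128,
          [](const std::string& piece) {
            std::fprintf(stdout, "%s", piece.c_str());
          });

      return err == Error::Ok ? 0 : 1;
    }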