diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt
index 744b1193d5a..9a9c82d90a9 100644
--- a/backends/mediatek/CMakeLists.txt
+++ b/backends/mediatek/CMakeLists.txt
@@ -25,10 +25,10 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include)
 # targets
 add_library(neuron_backend SHARED)
+target_compile_options(neuron_backend PRIVATE "-frtti" "-fexceptions")
 target_link_libraries(neuron_backend
   PRIVATE
     executorch_no_prim_ops
-    portable_ops_lib
     android
     log
     ${NEURON_BUFFER_ALLOCATOR_LIB}
diff --git a/backends/mediatek/runtime/include/NeuronLog.h b/backends/mediatek/runtime/include/NeuronLog.h
index ccf8b24870d..5367a91ac4e 100644
--- a/backends/mediatek/runtime/include/NeuronLog.h
+++ b/backends/mediatek/runtime/include/NeuronLog.h
@@ -8,7 +8,7 @@
 
 #pragma once
 
-#include <api/NeuronAdapter.h>
+#include "api/NeuronAdapter.h"
 
 #include
 #include
diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh
index c5d0118afda..d8de4cfd94e 100644
--- a/build/build_android_llm_demo.sh
+++ b/build/build_android_llm_demo.sh
@@ -30,7 +30,7 @@ build_android_native_library() {
   cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
     -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
     -DANDROID_ABI="${ANDROID_ABI}" \
-    -DANDROID_PLATFORM=android-23 \
+    -DANDROID_PLATFORM=android-26 \
     -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DEXECUTORCH_LOG_LEVEL=Info \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
@@ -42,9 +42,11 @@ build_android_native_library() {
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+    -DEXECUTORCH_BUILD_NEURON=ON \
+    -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \
     -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \
     -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \
-    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -B"${CMAKE_OUT}"
 
   if [ "$(uname)" == "Darwin" ]; then
@@ -52,26 +54,32 @@ build_android_native_library() {
   else
     CMAKE_JOBS=$(( $(nproc) - 1 ))
   fi
-  cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release
+  cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config RelWithDebInfo
 
   cmake extension/android \
     -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
     -DANDROID_ABI="${ANDROID_ABI}" \
-    -DANDROID_PLATFORM=android-23 \
+    -DANDROID_PLATFORM=android-26 \
     -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
     -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DEXECUTORCH_LOG_LEVEL=Info \
+    -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
+    -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DEXECUTORCH_BUILD_LLAMA_JNI=ON \
-    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -B"${CMAKE_OUT}"/extension/android
 
-  cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Release
+  cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config RelWithDebInfo
 
   # Copy artifacts to ABI specific directory
   mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}"
   cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/"
+  cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/
+  cp /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/
+  cp /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuronusdk_adapter.mtk.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/
+
   # Copy QNN related so library
   if [ -n "$QNN_SDK_ROOT" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then
     cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/"
@@ -97,7 +105,7 @@ build_aar() {
   find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \;
   # Zip all necessary files into the AAR file
   zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml
-  zip -r executorch-llama.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml
+  zip -r executorch-llama.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so jni/*/libneuronusdk_adapter.mtk.so AndroidManifest.xml
   popd
 }
@@ -143,6 +151,7 @@ BUILD_AAR_DIR="$(mktemp -d)"
 export BUILD_AAR_DIR
 if [ -z "$ANDROID_ABIS" ]; then
   ANDROID_ABIS=("arm64-v8a" "x86_64")
+  ANDROID_ABIS=("arm64-v8a")
 fi
 export ANDROID_ABIS
diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake
index 4376c9e5e77..c40f214133a 100644
--- a/build/executorch-config.cmake
+++ b/build/executorch-config.cmake
@@ -41,6 +41,7 @@ set(lib_list
   ${FLATCCRT_LIB}
   coremldelegate
   mpsdelegate
+  neuron_backend
   qnn_executorch_backend
   portable_ops_lib
   extension_module
diff --git a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts
index 37c8cbf0ba2..db4ea8f74c6 100644
--- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts
+++ b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts
@@ -57,7 +57,7 @@ dependencies {
   implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12")
   implementation("com.facebook.fbjni:fbjni:0.5.1")
   implementation("com.google.code.gson:gson:2.8.6")
-  implementation(files("libs/executorch-llama.aar"))
+  implementation(files("libs/executorch-llama-mtk31.aar"))
   implementation("com.google.android.material:material:1.12.0")
   implementation("androidx.activity:activity:1.9.0")
   testImplementation("junit:junit:4.13.2")
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml
index 02d8503a4df..799ce50992f 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml
@@ -36,6 +36,30 @@
         android:name="libcdsprpc.so"
         android:required="false" />
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
@@ ... @@ private void showMediaPreview(List<Uri> uris) {
     // For LLava, we want to call prefill_image as soon as an image is selected
     // Llava only support 1 image for now
-    if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) {
+/*  if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) {
       List processedImageList = getProcessedImagesForModel(mSelectedImageUri);
       if (!processedImageList.isEmpty()) {
         mMessageAdapter.add(
@@ -588,7 +592,7 @@ private void showMediaPreview(List<Uri> uris) {
         };
         executor.execute(runnable);
       }
-    }
+    }*/
   }
 
   private void addSelectedImagesToChatThread(List<Uri> selectedImageUri) {
@@ -689,7 +693,7 @@ public void run() {
             }
           });
       long generateStartTime = System.currentTimeMillis();
-      if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType())
+      /* if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType())
           == ModelUtils.VISION_MODEL) {
         mModule.generateFromPos(
             mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt),
@@ -697,16 +701,15 @@ public void run() {
             startPos,
             MainActivity.this,
             false);
-      } else {
+      } else {*/
         String finalPrompt =
             getTotalFormattedPrompt(getConversationHistory(), rawPrompt);
         ETLogging.getInstance().log("Running inference.. prompt=" + finalPrompt);
         mModule.generate(
             finalPrompt,
             (int) (finalPrompt.length() * 0.75) + 64,
-            MainActivity.this,
-            false);
-      }
+            MainActivity.this);
+      //}
 
       long generateDuration = System.currentTimeMillis() - generateStartTime;
       mResultMessage.setTotalGenerationTime(generateDuration);
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
index 773fef19dd7..9d7d2f4ec2a 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
@@ -46,6 +46,8 @@ public class SettingsActivity extends AppCompatActivity {
 
   private DemoSharedPreferences mDemoSharedPreferences;
   public static double TEMPERATURE_MIN_VALUE = 0.0;
+  public static String MODEL_PATH="/data/local/tmp/et-mtk/llama3";
+  //public static String MODEL_PATH="/data/local/tmp/llama";
 
   @Override
   protected void onCreate(Bundle savedInstanceState) {
@@ -286,7 +288,7 @@ private void showInvalidPromptDialog() {
   }
 
   private void setupModelSelectorDialog() {
-    String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte");
+    String[] pteFiles = listLocalFile(MODEL_PATH, ".pte");
     AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this);
     modelPathBuilder.setTitle("Select model path");
 
@@ -342,7 +344,7 @@ private void setupModelTypeSelectorDialog() {
   }
 
   private void setupTokenizerSelectorDialog() {
-    String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin");
+    String[] binFiles = listLocalFile(MODEL_PATH, ".bin");
     String[] tokenizerFiles = new String[binFiles.length];
     System.arraycopy(binFiles, 0, tokenizerFiles, 0, binFiles.length);
     AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this);
diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp
index c2d75fd30ec..1757c63fe21 100644
--- a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp
+++ b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp
@@ -73,16 +73,26 @@ size_t LlamaModelChunk::GetExpectedOutputCount() const {
 }
 
 void LlamaModelChunk::Initialize() {
+  ET_LOG(Info, "cmodiii in LlamaModelChunk::Initialize");
   LoadModels();
+  ET_LOG(Info, "cmodiii after LoadModels");
   GetModelIoInfo();
+  ET_LOG(Info, "cmodiii after GetModelIoInfo");
   CheckIoCount();
+  ET_LOG(Info, "cmodiii after CheckIoCount");
   PrepareCacheIOs();
+  ET_LOG(Info, "cmodiii after PrepareCacheIOs");
   AllocateIoBuffers();
+  ET_LOG(Info, "cmodiii after AllocateIoBuffers");
   InitMaskBuilder();
+  ET_LOG(Info, "cmodiii after InitMaskBuilder");
   InitCache();
+  ET_LOG(Info, "cmodiii after InitCache");
   SetBackendInputs();
+  ET_LOG(Info, "cmodiii after SetBackendInputs");
   SetBackendOutputs();
+  ET_LOG(Info, "cmodiii after SetBackendOutputs");
   mIsInitialized = true;
 }
diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp
index 0d2d5ccd59c..8a12ce90ecb 100644
--- a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp
+++ b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp
@@ -28,14 +28,21 @@ void LlamaRuntime::Initialize(
   const size_t numCache = 2 * modelOptions.num_layer / numChunk;
   ET_CHECK_MSG(numChunk > 0, "No model to initialize");
 
+  ET_LOG(Info, "cmodiii 1");
+  ET_LOG(Info, "cmodiii numChunk = %zu", numChunk);
+  ET_LOG(Info, "cmodiii numCache = %zu", numCache);
+
   // Initialize rotary embedding master lookup table
   const size_t rotEmbDim = modelOptions.hidden_size / modelOptions.num_head;
+  ET_LOG(Info, "cmodiii 2");
   mRotEmbMasterLut = std::make_unique<RotaryEmbeddingMasterLut>(
       modelOptions.rot_emb_type,
       modelOptions.max_token_length,
       rotEmbDim,
       modelOptions.rot_emb_base);
+  ET_LOG(Info, "cmodiii 3");
   mRotEmbMasterLut->generate();
+  ET_LOG(Info, "cmodiii 4");
 
   constexpr size_t numRotEmbInputs = 1;
   const bool usePromptModel = !modelPaths.prompt_model_paths.empty();
@@ -50,8 +57,10 @@ void LlamaRuntime::Initialize(
         return;
       modelPathMap[batchSize] = modelPaths[chunkIdx];
     };
+    ET_LOG(Info, "cmodiii 5");
     addModelPath(
         modelPaths.prompt_model_paths, modelOptions.prompt_token_batch_size);
+    ET_LOG(Info, "cmodiii 6");
     addModelPath(modelPaths.gen_model_paths, 1);
     auto llamaChunk = std::make_unique<LlamaModelChunk>(
         modelPathMap,
@@ -60,18 +69,43 @@ void LlamaRuntime::Initialize(
         numCache,
         numRotEmbInputs,
         mRotEmbMasterLut.get());
+    ET_LOG(Info, "cmodiii 7");
+    if(llamaChunk.get() == nullptr) {
+      ET_LOG(Info, "cmodiii llamaChunk is null");
+    } else {
+      ET_LOG(Info, "cmodiii llamaChunk is not null");
+    }
+
     mLlamaModelChunks.push_back(std::move(llamaChunk));
+
+    if(mLlamaModelChunks.empty()) {
+      ET_LOG(Info, "cmodiii mLlamaModelChunks is empty");
+    } else {
+      ET_LOG(Info, "cmodiii mLlamaModelChunks is not empty");
+    }
+
+    ET_LOG(Info, "cmodiii 8");
   }
 
   for (size_t i = 0; i < numChunk; i++) {
     auto& modelChunk = mLlamaModelChunks[i];
+    if(modelChunk.get() == nullptr) {
+      ET_LOG(Info, "cmodiii modelChunk is null");
+    } else {
+      ET_LOG(Info, "cmodiii modelChunk is not null");
+    }
+    ET_LOG(Info, "cmodiii 9");
     if (i > 0) {
       const auto& prevModelChunk = mLlamaModelChunks[i - 1];
+      ET_LOG(Info, "cmodiii 9A");
       modelChunk->SetInputBuffer(prevModelChunk->GetOutputBuffer());
+      ET_LOG(Info, "cmodiii 10");
     }
     modelChunk->Initialize();
+    ET_LOG(Info, "cmodiii 11");
     // modelChunk->LogIoSummary();
   }
+  ET_LOG(Info, "cmodiii 12");
 
   // NOTE: Token embedding type here is assumed to follow the model input
   // embedding type.
@@ -80,9 +114,13 @@ void LlamaRuntime::Initialize(
       modelOptions.model_input_type,
       modelOptions.hidden_size);
 
+  ET_LOG(Info, "cmodiii 13");
+
   // Link first chunk emb input to token emb lut output
   const auto& tokenEmbInput = mLlamaModelChunks.front()->GetInputBuffer();
+  ET_LOG(Info, "cmodiii 14");
   mTokenEmbLut->setOutput(tokenEmbInput.data, tokenEmbInput.nbytes);
+  ET_LOG(Info, "cmodiii 15");
 }
 
 void LlamaRuntime::Release() {
@@ -201,4 +239,4 @@ const LlamaModelOptions& LlamaRuntime::GetModelOptions() const {
   return mModelOptions;
 }
 
-} // namespace torch::executor
\ No newline at end of file
+} // namespace torch::executor
diff --git a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp
index b09e2c58767..2c7e236968d 100644
--- a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp
+++ b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp
@@ -46,11 +46,17 @@ struct ModelInstance {
 };
 
 void ModelChunk::Initialize() {
+  ET_LOG(Info, "cmodiii in ModuleChunk::Initialize");
   LoadModels();
+  ET_LOG(Info, "cmodiii after LoadModels");
   GetModelIoInfo();
+  ET_LOG(Info, "cmodiii after GetModelIoInfo");
   AllocateIoBuffers();
+  ET_LOG(Info, "cmodiii after AllocateIoBuffers");
   SetBackendInputs();
+  ET_LOG(Info, "cmodiii after SetBackendInputs");
   SetBackendOutputs();
+  ET_LOG(Info, "cmodiii after SetBackendOutputs");
   mIsInitialized = true;
 }
@@ -480,18 +486,22 @@ Method& ModelChunk::GetModelMethod() {
 
 // Override the virtual functions
 void* ModelChunk::CreateModelInstance(const std::string& modelPath) {
+  ET_LOG(Info, "cmodi in CreateModelInstance");
   auto modelInstance = new ModelInstance;
+  ET_LOG(Info, "cmodi 100");
 
   // Create a loader to get the data of the program file. There are other
   // DataLoaders that use mmap() or point to data that's already in memory, and
   // users can create their own DataLoaders to load from arbitrary sources.
   Result<FileDataLoader> loader = FileDataLoader::from(modelPath.c_str());
+  ET_LOG(Info, "cmodi 101");
   ET_CHECK_MSG(
       loader.ok(), "FileDataLoader::from() failed: 0x%" PRIx32, loader.error());
 
   // Parse the program file. This is immutable, and can also be reused between
   // multiple execution invocations across multiple threads.
   Result<Program> program_loaded = Program::load(&loader.get());
+  ET_LOG(Info, "cmodi 102");
   if (!program_loaded.ok()) {
     ET_LOG(Error, "Failed to parse model file %s", modelPath.c_str());
     return nullptr;
@@ -502,12 +512,15 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) {
   // methods.
   modelInstance->program =
       std::make_unique<Program>(std::move(program_loaded.get()));
+  ET_LOG(Info, "cmodi 103");
   auto& program = modelInstance->program;
+  ET_LOG(Info, "cmodi 104");
 
   // Use the first method in the program.
   const char* method_name = nullptr;
   {
     const auto method_name_result = program->get_method_name(0);
+    ET_LOG(Info, "cmodi 105");
     ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
     method_name = *method_name_result;
   }
@@ -524,12 +537,15 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) {
   modelInstance->method_allocator_pool.resize(kMethodAllocatorPoolSize);
   modelInstance->method_allocator = std::make_unique<MemoryAllocator>(
       kMethodAllocatorPoolSize, modelInstance->method_allocator_pool.data());
+  ET_LOG(Info, "cmodi 106");
   auto& method_allocator = modelInstance->method_allocator;
   method_allocator->enable_profiling("method allocator");
 
   auto& planned_buffers = modelInstance->planned_buffers; // Owns the memory
   auto& planned_spans =
       modelInstance->planned_spans; // Passed to the allocator
+  ET_LOG(Info, "cmodi 107");
+
   size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
   for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
     // .get() will always succeed because id < num_memory_planned_buffers.
@@ -539,22 +555,28 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) {
     planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
     planned_spans.push_back({planned_buffers.back().get(), buffer_size});
   }
+  ET_LOG(Info, "cmodi 108");
   modelInstance->planned_memory = std::make_unique<HierarchicalAllocator>(
       Span<Span<uint8_t>>{planned_spans.data(), planned_spans.size()});
   auto& planned_memory = modelInstance->planned_memory;
+  ET_LOG(Info, "cmodi 109");
 
   // Assemble all of the allocators into the MemoryManager that the Executor
   // will use.
   auto& neuron_allocator = GET_NEURON_ALLOCATOR;
+  ET_LOG(Info, "cmodi 110");
   modelInstance->memory_manager = std::make_unique<MemoryManager>(
       method_allocator.get(),
       planned_memory.get(),
      dynamic_cast<MemoryAllocator*>(&neuron_allocator));
+  ET_LOG(Info, "cmodi 111");
   auto& memory_manager = modelInstance->memory_manager;
+  ET_LOG(Info, "cmodi 112");
 
   ET_LOG(Debug, "Begin loading method %s", method_name);
   Result<Method> method = program->load_method(method_name, memory_manager.get());
+  ET_LOG(Info, "cmodi 113");
   ET_CHECK_MSG(
       method.ok(),
       "Loading of method %s failed with status 0x%" PRIx32,
@@ -563,6 +585,7 @@ void* ModelChunk::CreateModelInstance(const std::string& modelPath) {
   ET_LOG(Debug, "Method loaded.");
 
   modelInstance->method = std::make_unique<Method>(std::move(method.get()));
+  ET_LOG(Info, "cmodi 114");
   return modelInstance;
 }
diff --git a/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp b/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp
index e0479110a7c..e20eac3b248 100644
--- a/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp
+++ b/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp
@@ -21,20 +21,28 @@ namespace torch::executor {
 template <typename IdType>
 void MultiModelLoader<IdType>::LoadModels() {
   // Init empty model instance map
+  ET_LOG(Info, "cmodi LoadModels() 1");
   for (const auto& [id, _] : mModelPathMap) {
     ET_CHECK_MSG(
         !HasModel(id),
         "Model is already initialized before calling LoadModels.");
+    ET_LOG(Info, "cmodi LoadModels() 2");
     mModelInstanceMap[id] = nullptr;
   }
   const size_t numModels = mModelPathMap.size();
+  ET_LOG(Info, "cmodi LoadModels() 3");
   if (!AllowModelsCoexist()) {
+    ET_LOG(Info, "cmodi LoadModels() 4");
    SelectModel(mDefaultModelId);
+    ET_LOG(Info, "cmodi LoadModels() 5");
     ET_CHECK_MSG(
         GetModelInstance() == nullptr,
         "Model is already initialized before calling LoadModels.");
+    ET_LOG(Info, "cmodi LoadModels() 6");
     void* instance = CreateModelInstance(mModelPathMap[mDefaultModelId]);
+    ET_LOG(Info, "cmodi LoadModels() 7");
     SetModelInstance(instance);
+    ET_LOG(Info, "cmodi LoadModels() 8");
     ET_LOG(
         Debug,
         "LoadModels(): Loaded single exclusive model (Total=%zu)",
@@ -42,14 +50,20 @@ void MultiModelLoader<IdType>::LoadModels() {
     return;
   }
   for (const auto& [id, modelPath] : mModelPathMap) {
+    ET_LOG(Info, "cmodi LoadModels() 9");
     SelectModel(id);
+    ET_LOG(Info, "cmodi LoadModels() 10");
     ET_CHECK_MSG(
         GetModelInstance() == nullptr,
         "Model is already initialized before calling LoadModels.");
+    ET_LOG(Info, "cmodi LoadModels() 11");
     void* instance = CreateModelInstance(modelPath);
+    ET_LOG(Info, "cmodi LoadModels() 12");
     SetModelInstance(instance);
+    ET_LOG(Info, "cmodi LoadModels() 13");
   }
   SelectModel(mDefaultModelId); // Select the default instance
+  ET_LOG(Info, "cmodi LoadModels() 14");
   ET_LOG(Debug, "LoadModels(): Loaded multiple models (Total=%zu)", numModels);
 }
@@ -174,4 +188,4 @@ std::string MultiModelLoader<IdType>::GetIdString(const IdType& id) {
 template class MultiModelLoader;
 template class MultiModelLoader;
 
-} // namespace torch::executor
\ No newline at end of file
+} // namespace torch::executor
diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h
new file mode 100644
index 00000000000..98cd8ab394e
--- /dev/null
+++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h
@@ -0,0 +1,32 @@
+#pragma once
+
+namespace torch::executor {
+  using llm_helper::LLMType;
+
+  // Sizes
+  const size_t PROMPT_TOKEN_BATCH_SIZE = 128;
+  const size_t CACHE_SIZE = 512;
+  const size_t HIDDEN_SIZE = 4096;
+  const size_t NUM_HEAD = 32;
+  const size_t NUM_LAYER = 32;
+  const size_t MAX_TOKEN_LENGTH = 8192;
+  const double ROT_EMB_BASE = 500000;
+
+  // Types
+  const LLMType MODEL_INPUT_TYPE = LLMType::FP32;
+  const LLMType MODEL_OUTPUT_TYPE = LLMType::FP32;
+  const LLMType CACHE_TYPE = LLMType::FP32;
+  const LLMType MASK_TYPE = LLMType::FP32;
+  const LLMType ROT_EMB_TYPE = LLMType::FP32;
+
+  // Paths
+  const std::string TOKENIZER_PATH="/data/local/tmp/et-mtk/llama3/tokenizer.model";
+  const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/et-mtk/llama3/embedding_llama3-8B-instruct_fp32.bin";
+
+  // Comma-Separated Paths
+  const std::string PROMPT_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_3.pte,";
+
+  // Comma-Separated Paths
+  const std::string GEN_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,";
+
+} // namespace torch::executor
diff --git a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp
index 370695cb773..1193e2b1830 100644
--- a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp
+++ b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp
@@ -147,11 +147,11 @@ LlamaModelOptions get_model_options() {
       .rot_emb_base = FLAGS_rot_emb_base,
 
       // Types
-      .model_input_type = getLLMTypeFromName(FLAGS_input_type.c_str()),
-      .model_output_type = getLLMTypeFromName(FLAGS_output_type.c_str()),
-      .cache_type = getLLMTypeFromName(FLAGS_cache_type.c_str()),
-      .mask_type = getLLMTypeFromName(FLAGS_mask_type.c_str()),
-      .rot_emb_type = getLLMTypeFromName(FLAGS_rot_emb_type.c_str())};
+      .model_input_type = LLMType::FP32,
+      .model_output_type = LLMType::FP32,
+      .cache_type = LLMType::FP32,
+      .mask_type = LLMType::FP32,
+      .rot_emb_type = LLMType::FP32};
   return options;
 }
@@ -159,8 +159,8 @@ LlamaModelPaths get_model_paths() {
   LlamaModelPaths model_paths = {
       .tokenizer_path = FLAGS_tokenizer_path,
       .token_embedding_path = FLAGS_token_embedding_path,
-      .prompt_model_paths = utils::split(FLAGS_prompt_model_paths, ','),
-      .gen_model_paths = utils::split(FLAGS_gen_model_paths, ',')};
+      .prompt_model_paths = utils::split("/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_3.pte,", ','),
+      .gen_model_paths = utils::split("/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,", ',')};
   return model_paths;
 }
@@ -353,23 +353,27 @@ int main(int argc, char** argv) {
   Timer timer_release(
       [](const auto elapsed_sec) { ET_LOG(Info, "Model released."); });
 
-  LlamaRuntime llama_runtime;
+  //LlamaRuntime llama_runtime;
+  std::unique_ptr<LlamaRuntime> llama_runtime = std::make_unique<LlamaRuntime>();
 
   // Initialize model
   ET_LOG(Info, "Begin model loading.");
   timer_init.Start();
   const auto tokenizer = load_tokenizer();
-  llama_runtime.Initialize(model_options, model_paths);
+  //llama_runtime.Initialize(model_options, model_paths);
+  llama_runtime->Initialize(model_options, model_paths);
   timer_init.End();
 
   // Run model
   ET_CHECK_MSG(!FLAGS_prompt_file.empty(), "No prompt file provided.");
   std::string prompt = utils::read_file(FLAGS_prompt_file);
-  inference(llama_runtime, tokenizer, prompt);
+  //inference(llama_runtime, tokenizer, prompt);
+  inference(*llama_runtime.get(), tokenizer, prompt);
 
   // Release model
   timer_release.Start();
-  llama_runtime.Release();
+  //llama_runtime.Release();
+  llama_runtime->Release();
   timer_release.End();
 
   return 0;
diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp
new file mode 100644
index 00000000000..dbb5b79b42c
--- /dev/null
+++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 MediaTek Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/* Copyright Statement:
+ *
+ * This software/firmware and related documentation ("MediaTek Software") are
+ * protected under relevant copyright laws. The information contained herein
+ * is confidential and proprietary to MediaTek Inc. and/or its licensors.
+ * Without the prior written permission of MediaTek inc. and/or its licensors,
+ * any reproduction, modification, use or disclosure of MediaTek Software,
+ * and information contained herein, in whole or in part, shall be strictly
+ * prohibited.
+ */
+/* MediaTek Inc. (C) 2024. All rights reserved.
+ *
+ * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES
+ * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE")
+ * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON
+ * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT.
+ * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE
+ * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR
+ * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH
+ * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY
+ * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY
+ * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK
+ * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO
+ * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN
+ * FORUM. RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND
+ * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER
+ * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT
+ * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER
+ * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE.
+ *
+ * The following software/firmware and/or related documentation ("MediaTek
+ * Software") have been modified by MediaTek Inc. All revisions are subject to
+ * any receiver's applicable license agreements with MediaTek Inc.
+ */
+
+#include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h"
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+// #include
+#include
+#include
+
+#include "llama_runner/ModelChunk.h"
+#include "llama_runner/Utils.h"
+#include "llama_runner/llm_helper/include/llm_types.h"
+#include "llama_runner/llm_helper/include/llama_runner_values.h"
+
+static uint64_t MAX_RESPONSE = 50; // Maximum number of tokens to generate.
+// Global BOS and EOS option for tokenization (encoding)
+static constexpr int8_t kAddBos = 1;
+static constexpr int8_t kAddEos = 0;
+
+using namespace torch::executor;
+using namespace torch::executor::llm_helper;
+using torch::executor::utils::Timer;
+
+MTKLlamaRunner::MTKLlamaRunner(
+    const std::string& model_path,
+    const std::string& tokenizer_path,
+    const float temperature)
+    : modeloptions_(get_model_options()),
+      modelpaths_(get_model_paths()) {
+  runtime_init();
+  ET_LOG(
+      Info,
+      "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files. Initiated runtime_init().");
+}
+
+Error MTKLlamaRunner::load() {
+  if (is_loaded()) {
+    return Error::Ok;
+  }
+
+  // Load tokenizer
+  ET_LOG(Info, "Loading tokenizer.");
+  tokenizer_ = load_tokenizer();
+  ET_LOG(Info, "Complete loading tokenizer.");
+
+  // Load prompt model
+  runtime_ = std::make_unique<LlamaRuntime>();
+  ET_LOG(Info, "Loading prompt model.");
+  runtime_->Initialize(modeloptions_, modelpaths_);
+  ET_LOG(Info, "Complete loading prompt model.");
+
+  return Error::Ok;
+}
+
+bool MTKLlamaRunner::is_loaded() const {
+  return tokenizer_ && runtime_;
+}
+
+Error MTKLlamaRunner::generate(
+    const std::string& prompt,
+    int32_t seq_len,
+    std::function<void(const std::string&)> token_callback,
+    std::function<void(const Stats&)> stats_callback) {
+
+  if (!is_loaded()) {
+    ET_CHECK_OK_OR_RETURN_ERROR(load());
+  }
+
+  // Wrap the token_callback with print function
+  std::function<void(const std::string&)> wrapped_callback =
+      [token_callback](const std::string& piece) {
+        util::safe_printf(piece.c_str());
+        fflush(stdout);
+        if (token_callback) {
+          token_callback(piece);
+        }
+      };
+
+  ET_LOG(Info, "Starting inference from MTKLlamaRunner");
+  inference(*runtime_.get(), tokenizer_, prompt, wrapped_callback);
+  ET_LOG(Info, "Completed inference from MTKLlamaRunner");
+
+  return Error::Ok;
+}
+
+void MTKLlamaRunner::stop() {
+  if (is_loaded()) {
+    runtime_->Release();
+  } else {
+    ET_LOG(Error, "Llama Runtime is not loaded, cannot stop");
+  }
+}
+
+LlamaModelOptions MTKLlamaRunner::get_model_options() {
+  LlamaModelOptions options = {
+      // Sizes
+      .prompt_token_batch_size = PROMPT_TOKEN_BATCH_SIZE,
+      .cache_size = CACHE_SIZE,
+      .hidden_size = HIDDEN_SIZE,
+      .num_head = NUM_HEAD,
+      .num_layer = NUM_LAYER,
+      .max_token_length = MAX_TOKEN_LENGTH,
+      .rot_emb_base = ROT_EMB_BASE,
+
+      // Types
+      .model_input_type = MODEL_INPUT_TYPE,
+      .model_output_type = MODEL_OUTPUT_TYPE,
+      .cache_type = CACHE_TYPE,
+      .mask_type = MASK_TYPE,
+      .rot_emb_type = ROT_EMB_TYPE};
+  ET_LOG(Info, "Completed get_model_options");
+  return options;
+}
+
+LlamaModelPaths MTKLlamaRunner::get_model_paths() {
+  LlamaModelPaths model_paths = {
+      .tokenizer_path = TOKENIZER_PATH,
+      .token_embedding_path = TOKEN_EMBEDDING_PATH,
+      .prompt_model_paths = utils::split(PROMPT_MODEL_PATHS, ','),
+      .gen_model_paths = utils::split(GEN_MODEL_PATHS, ',')};
+  ET_LOG(Info, "Completed get_model_paths");
+  return model_paths;
+}
+
+Result<uint64_t> MTKLlamaRunner::digest_prompt(
+    LlamaRuntime& llama_runtime,
+    const std::unique_ptr<Tokenizer>& tokenizer,
+    const std::vector<uint64_t> input_tokens) {
+  const auto input_token_count = input_tokens.size();
+  const auto prompt_token_batch_size = llama_runtime.GetTokenBatchSize();
+  size_t cur_token_index = 0;
+
+  Timer timer_digest_prompt([=](const auto elapsed_sec) {
+    // Ideal prompt size is a multiple of prompt batch size
+    const size_t ideal_prompt_size =
+        std::ceil(float(input_token_count) / prompt_token_batch_size) *
+        prompt_token_batch_size;
+    ET_LOG(
+        Info,
+        "Done analyzing prompt in %f sec (%f tok/s)",
+        elapsed_sec,
+        (float)ideal_prompt_size / elapsed_sec);
+  });
+
+  auto getNextTokens = [&]() {
+    const size_t num_tok_remain = input_token_count - cur_token_index;
+    const size_t remainder = num_tok_remain % prompt_token_batch_size;
+    const size_t num_new_tokens =
+        remainder ? remainder : prompt_token_batch_size;
+    const auto start = cur_token_index;
+    const auto end = start + num_new_tokens;
+    return std::vector<uint64_t>(
+        input_tokens.begin() + start, input_tokens.begin() + end);
+  };
+
+  void* logits;
+  timer_digest_prompt.Start();
+  while (cur_token_index < input_token_count) {
+    const auto next_tokens = getNextTokens();
+    ET_LOG(
+        Debug,
+        "Digest next tokens (size=%zu), 1st tok=%lu",
+        next_tokens.size(),
+        next_tokens[0]);
+    logits = llama_runtime.Run(next_tokens);
+    cur_token_index += next_tokens.size();
+  }
+  timer_digest_prompt.End();
+
+  const auto vocab_size = tokenizer->vocab_size();
+  const auto logits_type = llama_runtime.GetModelOptions().model_output_type;
+  const auto first_output_token =
+      utils::argmax(logits_type, logits, vocab_size);
+  return first_output_token;
+}
+
+Error MTKLlamaRunner::gen_response(
+    LlamaRuntime& llama_runtime,
+    const std::unique_ptr<Tokenizer>& tokenizer,
+    const uint64_t input_token,
+    std::function<void(const std::string&)> token_callback) {
+  Timer timer_model_swap(
+      [](const auto elapsed_sec) { ET_LOG(Info, "Model swapped."); });
+
+  // Swap to gen mode
+  timer_model_swap.Start();
+  llama_runtime.SwapModel(1);
+  timer_model_swap.End();
+
+  size_t gen_tok_count = 0;
+  uint64_t prev_token = input_token;
+  uint64_t output_token = input_token;
+
+  auto decode_res = tokenizer->decode(prev_token, output_token);
+  ET_CHECK_OR_RETURN_ERROR(
+      decode_res.ok(),
+      InvalidState,
+      "Tokenizer failed to decode first generated token: %lu",
+      output_token);
+  std::string full_response = std::move(decode_res.get());
+  std::vector<uint64_t> full_response_tokens = {input_token};
+
+  const auto vocab_size = tokenizer->vocab_size();
+  const auto logits_type = llama_runtime.GetModelOptions().model_output_type;
+
+  double gen_total_time_sec = 0;
+  Timer timer_gen_token(
+      [&](const auto elapsed_sec) { gen_total_time_sec += elapsed_sec; });
+
+  // Print first output token
+  token_callback(full_response);
+
+  while (gen_tok_count++ < MAX_RESPONSE &&
+         llama_runtime.GetTokenIndex() < modeloptions_.max_token_length) {
+    timer_gen_token.Start();
+    void* logits = llama_runtime.Run({output_token});
+    timer_gen_token.End();
+
+    prev_token = output_token;
+    output_token = utils::argmax(logits_type, logits, vocab_size);
+    full_response_tokens.push_back(output_token);
+
+    // Stop when output is EOS
+    if (output_token == tokenizer->eos_tok()) {
+      token_callback("");
+      break;
+    }
+    auto decode_res = tokenizer->decode(prev_token, output_token);
+    ET_CHECK_OR_RETURN_ERROR(
+        decode_res.ok(),
+        InvalidState,
+        "Tokenizer failed to decode generated token %lu",
+        output_token);
+    const std::string tok_str = std::move(decode_res.get());
+    full_response += tok_str;
+    token_callback(tok_str);
+  }
+
+  std::cout << "\n\n[Generated Tokens]\n"
+            << utils::to_string(full_response_tokens) << std::endl;
+
+  ET_LOG(
+      Info,
+      "Token generation speed: %f tok/s",
+      gen_tok_count / gen_total_time_sec);
+
+  return Error::Ok;
+}
+
+Error MTKLlamaRunner::inference(
+    LlamaRuntime& llama_runtime,
+    const std::unique_ptr<Tokenizer>& tokenizer,
+    const std::string& prompt,
+    std::function<void(const std::string&)> token_callback) {
+  // Tokenize input prompt
+  auto encode_res = tokenizer->encode(prompt, kAddBos, kAddEos);
+  ET_CHECK_OR_RETURN_ERROR(
+      encode_res.ok(), InvalidState, "Tokenizer failed to encode prompt");
+  const auto input_tokens = std::move(encode_res.get());
+
+  // Run prompt mode (pre-fill)
+  auto prefill_res = digest_prompt(llama_runtime, tokenizer, input_tokens);
+  ET_CHECK_OR_RETURN_ERROR(
+      prefill_res.ok(), InvalidState, "Failed to digest prompt");
+  const auto first_output_token = prefill_res.get();
+
+  // run generation mode (decoding)
+  return gen_response(llama_runtime, tokenizer, first_output_token, token_callback);
+}
+
+std::unique_ptr<Tokenizer> MTKLlamaRunner::load_tokenizer() {
+  std::unique_ptr<Tokenizer> tokenizer;
+  // Assumes that tokenizer type is Tiktoken
+  tokenizer = torch::executor::get_tiktoken_for_llama();
+  tokenizer->load(modelpaths_.tokenizer_path);
+  return tokenizer;
+}
diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h
new file mode 100644
index 00000000000..d9f85c20257
--- /dev/null
+++ b/examples/mediatek/executor_runner/mtk_llama_runner.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// A simple llama2 runner that includes preprocessing and post processing logic.
+// The module takes in a string as input and emits a string as output.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "llama_runner/LlamaConfig.h"
+#include "llama_runner/LlamaRuntime.h"
+using namespace torch::executor;
+using Stats = ::executorch::llm::Stats;
+
+class MTKLlamaRunner {
+ public:
+  explicit MTKLlamaRunner(
+      const std::string& model_path,
+      const std::string& tokenizer_path,
+      const float temperature = 0.8f);
+
+  bool is_loaded() const;
+  Error load();
+  Error generate(
+      const std::string& prompt,
+      int32_t seq_len = 128,
+      std::function<void(const std::string&)> token_callback = {},
+      std::function<void(const Stats&)> stats_callback = {});
+  void stop();
+
+  LlamaModelOptions get_model_options();
+  LlamaModelPaths get_model_paths();
+  Result<uint64_t> digest_prompt(
+      LlamaRuntime& llama_runtime,
+      const std::unique_ptr<Tokenizer>& tokenizer,
+      const std::vector<uint64_t> input_tokens);
+  Error gen_response(
+      LlamaRuntime& llama_runtime,
+      const std::unique_ptr<Tokenizer>& tokenizer,
+      const uint64_t input_token,
+      std::function<void(const std::string&)> token_callback);
+  Error inference(
+      LlamaRuntime& llama_runtime,
+      const std::unique_ptr<Tokenizer>& tokenizer,
+      const std::string& prompt,
+      std::function<void(const std::string&)> token_callback);
+  std::unique_ptr<Tokenizer> load_tokenizer();
+
+
+ private:
+  // model
+  const torch::executor::LlamaModelOptions modeloptions_;
+  const torch::executor::LlamaModelPaths modelpaths_;
+  std::unique_ptr<Tokenizer> tokenizer_;
+  std::unique_ptr<LlamaRuntime> runtime_;
+};
diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt
index ab1f3650102..9a1a14b113a 100644
--- a/extension/android/CMakeLists.txt
+++ b/extension/android/CMakeLists.txt
@@ -94,6 +94,26 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
     ${EXECUTORCH_ROOT}/examples/models/llama2/runner
     ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/runner
   )
+
+  target_sources(
+    executorch_jni PRIVATE
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/mtk_llama_runner.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/mask_builder.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/rotary_embedding.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/token_embedding.cpp
+  )
+  target_include_directories(
+    executorch_jni PRIVATE
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner
+  )
+  ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED)
+  SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so)
+  list(APPEND link_libraries neuron_backend libneuron_buffer_allocator)
 endif()
 
 if(TARGET quantized_kernels)
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index e6a9b5de58c..50476df5690 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -17,6 +17,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -68,13 +69,15 @@ class ExecuTorchLlamaJni
   int model_type_category_;
   std::unique_ptr runner_;
   std::unique_ptr multi_modal_runner_;
+  std::unique_ptr<MTKLlamaRunner> mtk_llama_runner_;
 
  public:
   constexpr static auto kJavaDescriptor =
       "Lorg/pytorch/executorch/LlamaModule;";
 
-  constexpr static int MODEL_TYPE_CATEGORY_LLM = 1;
+  constexpr static int MODEL_TYPE_CATEGORY_LLM = 3 /* should be put back to 1*/;
   constexpr static int MODEL_TYPE_CATEGORY_MULTIMODAL = 2;
+  constexpr static int MODEL_TYPE_MEDIATEK_LLAMA = 1 /* 3 */;
 
   static facebook::jni::local_ref initHybrid(
       facebook::jni::alias_ref,
@@ -113,6 +116,11 @@ class ExecuTorchLlamaJni
           model_path->toStdString().c_str(),
           tokenizer_path->toStdString().c_str(),
           temperature);
+    } else if (model_type_category == MODEL_TYPE_MEDIATEK_LLAMA) {
+      mtk_llama_runner_ = std::make_unique<MTKLlamaRunner>(
+          model_path->toStdString().c_str(),
+          tokenizer_path->toStdString().c_str(),
+          temperature);
     }
   }
@@ -152,6 +160,12 @@ class ExecuTorchLlamaJni
           [callback](std::string result) { callback->onResult(result); },
           [callback](const Stats& result) { callback->onStats(result); },
           echo);
+    } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) {
+      mtk_llama_runner_->generate(
+          prompt->toStdString(),
+          seq_len,
+          [callback](std::string result) { callback->onResult(result); },
+          [callback](const Stats& result) { callback->onStats(result); });
     }
     return 0;
   }
@@ -243,6 +257,8 @@ class ExecuTorchLlamaJni
       multi_modal_runner_->stop();
     } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
       runner_->stop();
+    } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) {
+      mtk_llama_runner_->stop();
     }
   }
@@ -251,6 +267,8 @@ class ExecuTorchLlamaJni
       return static_cast<jint>(multi_modal_runner_->load());
     } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
       return static_cast<jint>(runner_->load());
+    } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) {
+      return static_cast<jint>(mtk_llama_runner_->load());
     }
     return static_cast<jint>(Error::InvalidArgument);
   }