Review notes (free-form preamble; patch tools skip text before the first "diff --git" line):
  * The per-file diffs for llm_runner_helper.cpp, multimodal_prefiller.cpp and
    multimodal_runner.cpp were dropped. They contained only temporary tracing
    (ET_LOG(Error, "Here 0000...") style markers, one of which called
    input.get_text() ahead of the is_image() branch and one of which passed
    inputs.size() to a %d conversion), a RUNNER_ET_LOG change that logged
    warmup and non-warmup output at Error priority alike, and a commented-out
    "return Error::InvalidArgument" for a missing required kMaxSeqLen method.
  * generate() and generate_from_pos() now move the queued prefill inputs out
    of prefill_inputs_ instead of copying them before clearing the member.
  * generate_from_pos() now passes .temperature = temperature_, matching the
    GenerationConfig built in generate().
  * The "std::vector images;" locals, unused once their push_back and call
    sites are removed, are deleted.
  * Hunk headers were recomputed for the edits above. The index line is kept
    from the original revision. Blank context lines were re-inserted to match
    the original hunk line counts; their position in the first and third hunks
    is inferred (TODO confirm against the tree).
  * NOTE(review): include targets and template arguments are absent in the
    text this was reviewed from and are left exactly as they appeared.

diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index 886b25e4221..0c3550f151a 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -13,7 +13,6 @@
 #include
 
 #include
-#include
 #include
 #include
 #include
@@ -122,7 +121,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
   float temperature_ = 0.0f;
   int model_type_category_;
   std::unique_ptr runner_;
-  std::unique_ptr multi_modal_runner_;
+  std::unique_ptr
+      multi_modal_runner_;
+  std::vector prefill_inputs_;
 
  public:
   constexpr static auto kJavaDescriptor =
@@ -168,10 +169,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
 
     model_type_category_ = model_type_category;
     if (model_type_category == MODEL_TYPE_CATEGORY_MULTIMODAL) {
-      multi_modal_runner_ = std::make_unique(
+      multi_modal_runner_ = llm::create_multimodal_runner(
           model_path->toStdString().c_str(),
-          tokenizer_path->toStdString().c_str(),
-          temperature);
+          llm::load_tokenizer(tokenizer_path->toStdString()));
     } else if (model_type_category == MODEL_TYPE_CATEGORY_LLM) {
       std::optional data_path_str = data_path
           ? std::optional{data_path->toStdString()}
@@ -217,6 +217,8 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
       facebook::jni::alias_ref callback,
       jboolean echo) {
     if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) {
+      std::vector inputs = std::move(prefill_inputs_);
+      prefill_inputs_.clear();
+      inputs.emplace_back(llm::MultimodalInput{prompt->toStdString()});
       auto image_size = image->size();
-      std::vector images;
       if (image_size != 0) {
@@ -227,15 +229,18 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
           image_data[i] = image_data_jint[i];
         }
         llm::Image image_runner{image_data, width, height, channels};
-        images.push_back(image_runner);
+        inputs.emplace_back(llm::MultimodalInput{std::move(image_runner)});
       }
+      executorch::extension::llm::GenerationConfig config{
+          .echo = static_cast(echo),
+          .seq_len = seq_len,
+          .temperature = temperature_,
+      };
       multi_modal_runner_->generate(
-          std::move(images),
-          prompt->toStdString(),
-          seq_len,
-          [callback](std::string result) { callback->onResult(result); },
-          [callback](const llm::Stats& result) { callback->onStats(result); },
-          echo);
+          std::move(inputs),
+          config,
+          [callback](const std::string& result) { callback->onResult(result); },
+          [callback](const llm::Stats& result) { callback->onStats(result); });
     } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
       executorch::extension::llm::GenerationConfig config{
           .echo = static_cast(echo),
@@ -259,19 +264,10 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
       jlong start_pos,
       jint bos,
       jint eos) {
+    prefill_inputs_.emplace_back(llm::MultimodalInput{prompt->toStdString()});
     facebook::jni::local_ref tuple_result =
         facebook::jni::make_long_array(2);
-    if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) {
-      tuple_result->pin()[0] = static_cast(Error::NotSupported);
-      return tuple_result;
-    }
-
-    auto&& result = multi_modal_runner_->prefill_prompt(
-        prompt->toStdString(), start_pos, bos, eos);
     tuple_result->pin()[0] = static_cast(Error::Ok);
-    if (result.ok()) {
-      tuple_result->pin()[1] = static_cast(start_pos);
-    }
     return tuple_result;
   }
 
@@ -285,16 +281,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
       jint height,
       jint channels,
       jlong start_pos) {
-    facebook::jni::local_ref tuple_result =
-        facebook::jni::make_long_array(2);
-
-    if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) {
-      tuple_result->pin()[0] = static_cast(Error::NotSupported);
-      return tuple_result;
-    }
-
     auto image_size = image->size();
-    std::vector images;
     if (image_size != 0) {
       std::vector image_data_jint(image_size);
       std::vector image_data(image_size);
@@ -303,13 +290,14 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
         image_data[i] = image_data_jint[i];
       }
       llm::Image image_runner{image_data, width, height, channels};
-      images.push_back(image_runner);
+      prefill_inputs_.emplace_back(
+          llm::MultimodalInput{std::move(image_runner)});
     }
-    // TODO(hsz): make start_pos a reference and update it here
-    jint result = static_cast(
-        multi_modal_runner_->prefill_images(images, start_pos));
-    tuple_result->pin()[0] = result;
-    tuple_result->pin()[1] = static_cast(start_pos);
+
+    facebook::jni::local_ref tuple_result =
+        facebook::jni::make_long_array(2);
+
+    tuple_result->pin()[0] = static_cast(Error::Ok);
     return tuple_result;
   }
 
@@ -320,13 +308,17 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
       facebook::jni::alias_ref callback,
       jboolean echo) {
     if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) {
-      return static_cast(multi_modal_runner_->generate_from_pos(
-          prompt->toStdString(),
-          seq_len,
-          start_pos,
+      std::vector inputs = std::move(prefill_inputs_);
+      prefill_inputs_.clear();
+      inputs.emplace_back(llm::MultimodalInput{prompt->toStdString()});
+      return static_cast(multi_modal_runner_->generate(
+          inputs,
+          llm::GenerationConfig{
+              .echo = static_cast(echo),
+              .seq_len = seq_len,
+              .temperature = temperature_},
           [callback](const std::string& result) { callback->onResult(result); },
-          [callback](const llm::Stats& stats) { callback->onStats(stats); },
-          echo));
+          [callback](const llm::Stats& stats) { callback->onStats(stats); }));
     } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
       executorch::extension::llm::GenerationConfig config{
           .echo = static_cast(echo),