From 154850e3ded826cf7109cbd5ca57a93ecaa7d7e8 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 16 Sep 2025 16:40:26 -0700 Subject: [PATCH 1/3] [multimodal] Allow float32 image input Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- examples/models/llava/main.cpp | 16 +- extension/llm/runner/image.h | 103 +++++++- extension/llm/runner/multimodal_prefiller.h | 245 ++++++++++++++++---- 3 files changed, 302 insertions(+), 62 deletions(-) diff --git a/examples/models/llava/main.cpp b/examples/models/llava/main.cpp index 6cb84aa088e..3946a629ade 100644 --- a/examples/models/llava/main.cpp +++ b/examples/models/llava/main.cpp @@ -81,24 +81,20 @@ void load_image(const std::string& image_path, Image& image) { new_height, 0, channels); - // transpose to CHW - image.data.resize(channels * new_width * new_height); + std::vector chw_data(channels * new_width * new_height); for (int i = 0; i < new_width * new_height; ++i) { for (int c = 0; c < channels; ++c) { - image.data[c * new_width * new_height + i] = - resized_data[i * channels + c]; + chw_data[c * new_width * new_height + i] = resized_data[i * channels + c]; } } - image.width = new_width; - image.height = new_height; - image.channels = channels; + image = Image(std::move(chw_data), new_width, new_height, channels); // convert to tensor ET_LOG( Info, "image Channels: %" PRId32 ", Height: %" PRId32 ", Width: %" PRId32, - image.channels, - image.height, - image.width); + image.channels(), + image.height(), + image.width()); stbi_image_free(data); } diff --git a/extension/llm/runner/image.h b/extension/llm/runner/image.h index 67fb8939518..dbdba273536 100644 --- a/extension/llm/runner/image.h +++ b/extension/llm/runner/image.h @@ -10,19 +10,112 @@ #pragma once #include +#include #include +#include #include +#include +#include + namespace executorch { namespace extension { namespace llm { -struct ET_EXPERIMENTAL Image { +class ET_EXPERIMENTAL Image { + public: + // Default constructor + Image() : width_(0), height_(0), channels_(0) {} + + // Constructor for uint8_t data + Image( + std::vector&& data, + int32_t width, + int32_t height, + int32_t channels) + : data_(std::move(data)), + width_(width), + height_(height), + channels_(channels) {} + + // Constructor for float data + Image( + std::vector&& data, + int32_t width, + int32_t height, + int32_t channels) + : data_(std::move(data)), + width_(width), + height_(height), + channels_(channels) {} + + // Getters + int32_t width() const { + return width_; + } + int32_t height() const { + return height_; + } + int32_t channels() const { + return channels_; + } + + // Data access + bool is_uint8() const { + return std::holds_alternative>(data_); + } + + bool is_float() const { + return std::holds_alternative>(data_); + } + + const std::vector& get_uint8_data() const& { + return std::get>(data_); + } + + std::vector& get_uint8_data() & { + return std::get>(data_); + } + + const std::vector& get_float_data() const& { + return std::get>(data_); + } + + std::vector& get_float_data() & { + return std::get>(data_); + } + + executorch::runtime::Result toTensor( + bool with_batch = false) const { + // Note: This creates a 3D tensor (CHW). The model might expect a 4D + // tensor (NCHW). The caller should handle reshaping if needed. + std::vector sizes = { + channels(), height(), width()}; + if (with_batch) { + sizes.insert(sizes.begin(), 1); + } + if (is_float()) { + return executorch::extension::from_blob( + const_cast(get_float_data().data()), + sizes, + ::executorch::aten::ScalarType::Float); + } else if (is_uint8()) { + return executorch::extension::from_blob( + const_cast(get_uint8_data().data()), + sizes, + ::executorch::aten::ScalarType::Byte); + } + ET_LOG( + Error, "Image data is not initialized with uint8_t or float vector."); + return ::executorch::runtime::Error::NotSupported; + } + + private: // Assuming NCHW format - std::vector data; - int32_t width; - int32_t height; - int32_t channels; + std::variant, std::vector> data_; + int32_t width_; + int32_t height_; + int32_t channels_; }; } // namespace llm diff --git a/extension/llm/runner/multimodal_prefiller.h b/extension/llm/runner/multimodal_prefiller.h index dbfa2ec7ca3..3f8777d4acf 100644 --- a/extension/llm/runner/multimodal_prefiller.h +++ b/extension/llm/runner/multimodal_prefiller.h @@ -6,56 +6,207 @@ * LICENSE file in the root directory of this source tree. */ -// Generic encoder prefiller that handles multimodal inputs (image and audio) -// to prefill the KV cache of a multimodal LLM. +// Generic encoder prefiller that handles multimodal inputs (text, image and +// audio (to be implemented)) to prefill the KV cache of a multimodal LLM. +// @lint-ignore-every CLANGTIDY facebook-hte-Deprecated -#pragma once - -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include namespace executorch::extension::llm { -using runtime::Error; -using runtime::Result; -using tokenizers::Tokenizer; - -// Assuming kv cache and parallel prefill are enabled. -// This prefiller supports both image and audio inputs -class ET_EXPERIMENTAL MultimodalPrefiller { - public: - explicit MultimodalPrefiller( - Module* module, - MultimodalDecoderRunner* decoder_runner, - Tokenizer* tokenizer, - IOManager* io_manager); - - /** - * Prefill an LLM Module with the given multimodal input. - * @param input The multimodal input (image or audio) to the multimodal LLM. - * @param start_pos The starting position in KV cache of the input in the LLM. - * It's passed as reference and will be updated inside this function. - * @return The next token of the LLM Module after prefill. - */ - virtual Result prefill( - const MultimodalInput& input, - int64_t& start_pos); - - virtual Error load(); - virtual bool is_method_loaded(); - - virtual ~MultimodalPrefiller() = default; - - protected: - Module* module_; - MultimodalDecoderRunner* text_decoder_runner_; - Tokenizer* tokenizer_; - IOManager* io_manager_; -}; +MultimodalPrefiller::MultimodalPrefiller( + Module* module, + MultimodalDecoderRunner* decoder_runner, + Tokenizer* tokenizer, + IOManager* io_manager) + : module_(module), + text_decoder_runner_(decoder_runner), + tokenizer_(tokenizer), + io_manager_(io_manager) {} + +/** + * Prefill an LLM Module with the given multimodal input. + * @param input The multimodal input (text, image or audio) to the multimodal + * LLM. + * @param start_pos The starting position in KV cache of the input in the LLM + * @return logits of the prefill. + */ +Result MultimodalPrefiller::prefill( + const MultimodalInput& input, + int64_t& start_pos) { + // 1. Run encoder model. + ::executorch::runtime::EValue encoder_output; + if (input.is_image()) { + Image image = input.get_image(); + + auto method_meta = ET_UNWRAP( + module_->method_meta(kImageEncoderMethod), + "Failed to get method_meta for %s", + kImageEncoderMethod); + + ET_CHECK_MSG( + method_meta.num_inputs() > 0, + "Image encoder should have at least 1 input"); + auto input_meta = ET_UNWRAP( + method_meta.input_tensor_meta(0), + "Cannot get input tensor meta at index 0"); + auto expected_dtype = input_meta.scalar_type(); + + if (expected_dtype == ::executorch::aten::ScalarType::Float) { + ET_CHECK_MSG( + image.is_float(), + "Model expects float image data, but image has uint8_t data."); + } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) { + ET_CHECK_MSG( + image.is_uint8(), + "Model expects uint8_t image data, but image has float data."); + } else { + ET_LOG( + Error, + "Unsupported image encoder input dtype: %s", + ::executorch::runtime::toString(expected_dtype)); + return ::executorch::runtime::Error::NotSupported; + } + + // The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D + // tensor (CHW). Add a batch dimension of 1 if needed. + auto expected_dims = input_meta.sizes(); + auto image_tensor = ET_UNWRAP( + image.toTensor(/*with_batch*/ expected_dims.size() == 4), + "Failed to convert image to tensor"); + + // Run image encoder + auto image_encoder_outputs = + ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor)); + + encoder_output = image_encoder_outputs[0]; + } else if (input.is_audio()) { + Audio audio = input.get_audio(); + + // Use the original tensor shape as intended + auto audio_tensor = executorch::extension::from_blob( + audio.data.data(), + {audio.batch_size, audio.n_bins, audio.n_frames}, + ::executorch::aten::ScalarType::Float); + + // Run audio encoder + auto audio_encoder_result = + module_->execute(kAudioEncoderMethod, audio_tensor); + if (audio_encoder_result.error() != ::executorch::runtime::Error::Ok) { + return ::executorch::runtime::Error::Internal; + } + auto audio_encoder_outputs = audio_encoder_result.get(); + + encoder_output = audio_encoder_outputs[0]; + } else if (input.is_text()) { + auto& text = input.get_text(); + std::vector tokens = + ET_UNWRAP_TOKENIZER(tokenizer_->encode(text)); + + auto text_tensor = executorch::extension::from_blob( + tokens.data(), + {1, static_cast(tokens.size())}, + ::executorch::aten::ScalarType::Long); + + // Run text encoder (token embeddings) + auto token_embedding_outputs = + ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, text_tensor)); + + encoder_output = token_embedding_outputs[0]; + } else { + ET_LOG(Error, "Unsupported input type"); + // For any other input types, return error + return ::executorch::runtime::Error::NotSupported; + } + + // 2. Run decoder model for prefill. + + // Get expected shape of cache position tensor, which should be the second + // argument + + int64_t seq_len = encoder_output.toTensor().size(1); + if (seq_len == 0) { + ET_LOG(Error, "The encoder returned an empty output."); + return ::executorch::runtime::Error::InvalidState; + } + std::vector cache_positions; + + auto cache_position_tensor = ET_UNWRAP(populate_start_pos_or_cache_position( + module_, start_pos, cache_positions, seq_len, kTextModelMethod)); + + auto prefill_result = module_->execute( + kTextModelMethod, {encoder_output, cache_position_tensor}); + if (prefill_result.error() != ::executorch::runtime::Error::Ok) { + return prefill_result.error(); + } + // Check if prefill_outputs is empty, if it is return error and log that the + // specified encoder returned empty results when used to prefill decoder. + auto prefill_outputs = prefill_result.get(); + if (prefill_outputs.empty()) { + ET_LOG( + Error, "Encoder returned empty results when used to prefill decoder"); + return ::executorch::runtime::Error::InvalidState; + } + auto outputs_res = prefill_outputs[0].toTensor(); + + // Update start_pos, tracking the current cache position. + start_pos += seq_len; + + return static_cast( + text_decoder_runner_->logits_to_token(outputs_res)); +} + +/** + * Load the Module for encoder prefill purpose. + * @return The error code. + */ +::executorch::runtime::Error MultimodalPrefiller::load() { + if (is_method_loaded()) { + return ::executorch::runtime::Error::Ok; + } + // token_embeddings and text_model have to show up in method names. + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTokenEmbeddingMethod)); + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod)); + + std::unordered_set methods = + ET_UNWRAP(module_->method_names(), "Failed to get method names"); + + // Load image_encoder method if exists. + if (methods.find(kImageEncoderMethod) != methods.end()) { + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod)); + } + + if (methods.find(kAudioEncoderMethod) != methods.end()) { + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kAudioEncoderMethod)); + } + + return ::executorch::runtime::Error::Ok; +} + +/** + * Check if the required methods in the Module is loaded. + * @return True if the Module is loaded, false otherwise. + */ +bool MultimodalPrefiller::is_method_loaded() { + ::executorch::runtime::Result> methods_res = + module_->method_names(); + if (!module_->is_method_loaded(kTokenEmbeddingMethod)) { + return false; + } + if (!module_->is_method_loaded(kTextModelMethod)) { + return false; + } + if (methods_res.error() != ::executorch::runtime::Error::Ok) { + ET_CHECK_MSG(false, "Failed to get method names"); + } + std::unordered_set methods = methods_res.get(); + if (methods.find(kImageEncoderMethod) != methods.end()) { + return module_->is_method_loaded(kImageEncoderMethod); + } + return true; +} } // namespace executorch::extension::llm From 78fa105a9109c39a997872de93faccc708323c6e Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 16 Sep 2025 16:44:01 -0700 Subject: [PATCH 2/3] Fix --- extension/llm/runner/multimodal_prefiller.cpp | 40 ++- extension/llm/runner/multimodal_prefiller.h | 245 ++++-------------- 2 files changed, 83 insertions(+), 202 deletions(-) diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp index 2705a9eadff..3f8777d4acf 100644 --- a/extension/llm/runner/multimodal_prefiller.cpp +++ b/extension/llm/runner/multimodal_prefiller.cpp @@ -41,10 +41,42 @@ Result MultimodalPrefiller::prefill( ::executorch::runtime::EValue encoder_output; if (input.is_image()) { Image image = input.get_image(); - auto image_tensor = executorch::extension::from_blob( - image.data.data(), - {3, image.height, image.width}, - ::executorch::aten::ScalarType::Byte); + + auto method_meta = ET_UNWRAP( + module_->method_meta(kImageEncoderMethod), + "Failed to get method_meta for %s", + kImageEncoderMethod); + + ET_CHECK_MSG( + method_meta.num_inputs() > 0, + "Image encoder should have at least 1 input"); + auto input_meta = ET_UNWRAP( + method_meta.input_tensor_meta(0), + "Cannot get input tensor meta at index 0"); + auto expected_dtype = input_meta.scalar_type(); + + if (expected_dtype == ::executorch::aten::ScalarType::Float) { + ET_CHECK_MSG( + image.is_float(), + "Model expects float image data, but image has uint8_t data."); + } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) { + ET_CHECK_MSG( + image.is_uint8(), + "Model expects uint8_t image data, but image has float data."); + } else { + ET_LOG( + Error, + "Unsupported image encoder input dtype: %s", + ::executorch::runtime::toString(expected_dtype)); + return ::executorch::runtime::Error::NotSupported; + } + + // The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D + // tensor (CHW). Add a batch dimension of 1 if needed. + auto expected_dims = input_meta.sizes(); + auto image_tensor = ET_UNWRAP( + image.toTensor(/*with_batch*/ expected_dims.size() == 4), + "Failed to convert image to tensor"); // Run image encoder auto image_encoder_outputs = diff --git a/extension/llm/runner/multimodal_prefiller.h b/extension/llm/runner/multimodal_prefiller.h index 3f8777d4acf..dbfa2ec7ca3 100644 --- a/extension/llm/runner/multimodal_prefiller.h +++ b/extension/llm/runner/multimodal_prefiller.h @@ -6,207 +6,56 @@ * LICENSE file in the root directory of this source tree. */ -// Generic encoder prefiller that handles multimodal inputs (text, image and -// audio (to be implemented)) to prefill the KV cache of a multimodal LLM. -// @lint-ignore-every CLANGTIDY facebook-hte-Deprecated +// Generic encoder prefiller that handles multimodal inputs (image and audio) +// to prefill the KV cache of a multimodal LLM. -#include -#include -#include -#include +#pragma once -namespace executorch::extension::llm { - -MultimodalPrefiller::MultimodalPrefiller( - Module* module, - MultimodalDecoderRunner* decoder_runner, - Tokenizer* tokenizer, - IOManager* io_manager) - : module_(module), - text_decoder_runner_(decoder_runner), - tokenizer_(tokenizer), - io_manager_(io_manager) {} - -/** - * Prefill an LLM Module with the given multimodal input. - * @param input The multimodal input (text, image or audio) to the multimodal - * LLM. - * @param start_pos The starting position in KV cache of the input in the LLM - * @return logits of the prefill. - */ -Result MultimodalPrefiller::prefill( - const MultimodalInput& input, - int64_t& start_pos) { - // 1. Run encoder model. - ::executorch::runtime::EValue encoder_output; - if (input.is_image()) { - Image image = input.get_image(); - - auto method_meta = ET_UNWRAP( - module_->method_meta(kImageEncoderMethod), - "Failed to get method_meta for %s", - kImageEncoderMethod); - - ET_CHECK_MSG( - method_meta.num_inputs() > 0, - "Image encoder should have at least 1 input"); - auto input_meta = ET_UNWRAP( - method_meta.input_tensor_meta(0), - "Cannot get input tensor meta at index 0"); - auto expected_dtype = input_meta.scalar_type(); - - if (expected_dtype == ::executorch::aten::ScalarType::Float) { - ET_CHECK_MSG( - image.is_float(), - "Model expects float image data, but image has uint8_t data."); - } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) { - ET_CHECK_MSG( - image.is_uint8(), - "Model expects uint8_t image data, but image has float data."); - } else { - ET_LOG( - Error, - "Unsupported image encoder input dtype: %s", - ::executorch::runtime::toString(expected_dtype)); - return ::executorch::runtime::Error::NotSupported; - } - - // The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D - // tensor (CHW). Add a batch dimension of 1 if needed. - auto expected_dims = input_meta.sizes(); - auto image_tensor = ET_UNWRAP( - image.toTensor(/*with_batch*/ expected_dims.size() == 4), - "Failed to convert image to tensor"); - - // Run image encoder - auto image_encoder_outputs = - ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor)); - - encoder_output = image_encoder_outputs[0]; - } else if (input.is_audio()) { - Audio audio = input.get_audio(); - - // Use the original tensor shape as intended - auto audio_tensor = executorch::extension::from_blob( - audio.data.data(), - {audio.batch_size, audio.n_bins, audio.n_frames}, - ::executorch::aten::ScalarType::Float); - - // Run audio encoder - auto audio_encoder_result = - module_->execute(kAudioEncoderMethod, audio_tensor); - if (audio_encoder_result.error() != ::executorch::runtime::Error::Ok) { - return ::executorch::runtime::Error::Internal; - } - auto audio_encoder_outputs = audio_encoder_result.get(); - - encoder_output = audio_encoder_outputs[0]; - } else if (input.is_text()) { - auto& text = input.get_text(); - std::vector tokens = - ET_UNWRAP_TOKENIZER(tokenizer_->encode(text)); - - auto text_tensor = executorch::extension::from_blob( - tokens.data(), - {1, static_cast(tokens.size())}, - ::executorch::aten::ScalarType::Long); - - // Run text encoder (token embeddings) - auto token_embedding_outputs = - ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, text_tensor)); +#include +#include +#include +#include +#include +#include +#include - encoder_output = token_embedding_outputs[0]; - } else { - ET_LOG(Error, "Unsupported input type"); - // For any other input types, return error - return ::executorch::runtime::Error::NotSupported; - } - - // 2. Run decoder model for prefill. - - // Get expected shape of cache position tensor, which should be the second - // argument - - int64_t seq_len = encoder_output.toTensor().size(1); - if (seq_len == 0) { - ET_LOG(Error, "The encoder returned an empty output."); - return ::executorch::runtime::Error::InvalidState; - } - std::vector cache_positions; - - auto cache_position_tensor = ET_UNWRAP(populate_start_pos_or_cache_position( - module_, start_pos, cache_positions, seq_len, kTextModelMethod)); - - auto prefill_result = module_->execute( - kTextModelMethod, {encoder_output, cache_position_tensor}); - if (prefill_result.error() != ::executorch::runtime::Error::Ok) { - return prefill_result.error(); - } - // Check if prefill_outputs is empty, if it is return error and log that the - // specified encoder returned empty results when used to prefill decoder. - auto prefill_outputs = prefill_result.get(); - if (prefill_outputs.empty()) { - ET_LOG( - Error, "Encoder returned empty results when used to prefill decoder"); - return ::executorch::runtime::Error::InvalidState; - } - auto outputs_res = prefill_outputs[0].toTensor(); - - // Update start_pos, tracking the current cache position. - start_pos += seq_len; - - return static_cast( - text_decoder_runner_->logits_to_token(outputs_res)); -} - -/** - * Load the Module for encoder prefill purpose. - * @return The error code. - */ -::executorch::runtime::Error MultimodalPrefiller::load() { - if (is_method_loaded()) { - return ::executorch::runtime::Error::Ok; - } - // token_embeddings and text_model have to show up in method names. - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTokenEmbeddingMethod)); - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod)); - - std::unordered_set methods = - ET_UNWRAP(module_->method_names(), "Failed to get method names"); - - // Load image_encoder method if exists. - if (methods.find(kImageEncoderMethod) != methods.end()) { - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod)); - } - - if (methods.find(kAudioEncoderMethod) != methods.end()) { - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kAudioEncoderMethod)); - } - - return ::executorch::runtime::Error::Ok; -} +namespace executorch::extension::llm { -/** - * Check if the required methods in the Module is loaded. - * @return True if the Module is loaded, false otherwise. - */ -bool MultimodalPrefiller::is_method_loaded() { - ::executorch::runtime::Result> methods_res = - module_->method_names(); - if (!module_->is_method_loaded(kTokenEmbeddingMethod)) { - return false; - } - if (!module_->is_method_loaded(kTextModelMethod)) { - return false; - } - if (methods_res.error() != ::executorch::runtime::Error::Ok) { - ET_CHECK_MSG(false, "Failed to get method names"); - } - std::unordered_set methods = methods_res.get(); - if (methods.find(kImageEncoderMethod) != methods.end()) { - return module_->is_method_loaded(kImageEncoderMethod); - } - return true; -} +using runtime::Error; +using runtime::Result; +using tokenizers::Tokenizer; + +// Assuming kv cache and parallel prefill are enabled. +// This prefiller supports both image and audio inputs +class ET_EXPERIMENTAL MultimodalPrefiller { + public: + explicit MultimodalPrefiller( + Module* module, + MultimodalDecoderRunner* decoder_runner, + Tokenizer* tokenizer, + IOManager* io_manager); + + /** + * Prefill an LLM Module with the given multimodal input. + * @param input The multimodal input (image or audio) to the multimodal LLM. + * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @return The next token of the LLM Module after prefill. + */ + virtual Result prefill( + const MultimodalInput& input, + int64_t& start_pos); + + virtual Error load(); + virtual bool is_method_loaded(); + + virtual ~MultimodalPrefiller() = default; + + protected: + Module* module_; + MultimodalDecoderRunner* text_decoder_runner_; + Tokenizer* tokenizer_; + IOManager* io_manager_; +}; } // namespace executorch::extension::llm From 4d6d3be13bf555684c6dc73672479cabfc875d04 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 16 Sep 2025 23:50:11 -0700 Subject: [PATCH 3/3] Fix android and ios usages of image --- extension/android/jni/jni_layer_llama.cpp | 2 +- .../Exported/ExecuTorchLLMMultimodalRunner.mm | 12 +- .../llm/runner/test/test_multimodal_input.cpp | 133 ++++++++---------- 3 files changed, 65 insertions(+), 82 deletions(-) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 23686f01ee7..cabf30c42e4 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -268,7 +268,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { for (int i = 0; i < image_size; i++) { image_data[i] = image_data_jint[i]; } - llm::Image image_runner{image_data, width, height, channels}; + llm::Image image_runner{std::move(image_data), width, height, channels}; prefill_inputs_.emplace_back( llm::MultimodalInput{std::move(image_runner)}); } diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm index dcc5dc98806..b95e480aded 100644 --- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm +++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm @@ -172,12 +172,12 @@ - (BOOL)generate:(NSArray *)inputs case ExecuTorchLLMMultimodalInputTypeImage: { ExecuTorchLLMImage *image = input.image; std::vector data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length); - nativeInputs.emplace_back(llm::MultimodalInput(llm::Image{ - .data = std::move(data), - .width = (int32_t)image.width, - .height = (int32_t)image.height, - .channels = (int32_t)image.channels - })); + nativeInputs.emplace_back(llm::MultimodalInput(llm::Image( + std::move(data), + (int32_t)image.width, + (int32_t)image.height, + (int32_t)image.channels + ))); break; } default: { diff --git a/extension/llm/runner/test/test_multimodal_input.cpp b/extension/llm/runner/test/test_multimodal_input.cpp index 97b9cc1379e..486515175e8 100644 --- a/extension/llm/runner/test/test_multimodal_input.cpp +++ b/extension/llm/runner/test/test_multimodal_input.cpp @@ -16,7 +16,6 @@ using executorch::extension::llm::make_image_input; using executorch::extension::llm::make_text_input; using executorch::extension::llm::MultimodalInput; -namespace { class MultimodalInputTest : public Test { protected: std::string createTestText() { @@ -28,21 +27,13 @@ class MultimodalInputTest : public Test { } Image createTestImage() { - Image img; - img.width = 224; - img.height = 224; - img.channels = 3; - img.data = std::vector(224 * 224 * 3, 128); // Fill with gray - return img; + std::vector data(224 * 224 * 3, 128); // Fill with gray + return Image(std::move(data), 224, 224, 3); } Image createTestImageSmall() { - Image img; - img.width = 32; - img.height = 32; - img.channels = 1; - img.data = std::vector(32 * 32, 255); // Fill with white - return img; + std::vector data(32 * 32, 255); // Fill with white + return Image(std::move(data), 32, 32, 1); } }; @@ -76,28 +67,28 @@ TEST_F(MultimodalInputTest, ImageConstructorFromImage) { EXPECT_FALSE(input.is_text()); EXPECT_TRUE(input.is_image()); EXPECT_EQ(input.get_type(), MultimodalInput::Type::IMAGE); - EXPECT_EQ(input.get_image().width, 224); - EXPECT_EQ(input.get_image().height, 224); - EXPECT_EQ(input.get_image().channels, 3); - EXPECT_EQ(input.get_image().data.size(), 224 * 224 * 3); + EXPECT_EQ(input.get_image().width(), 224); + EXPECT_EQ(input.get_image().height(), 224); + EXPECT_EQ(input.get_image().channels(), 3); + EXPECT_EQ(input.get_image().get_uint8_data().size(), 224 * 224 * 3); } TEST_F(MultimodalInputTest, ImageConstructorFromRvalueImage) { Image img = createTestImage(); - int width = img.width; - int height = img.height; - int channels = img.channels; - size_t data_size = img.data.size(); + int width = img.width(); + int height = img.height(); + int channels = img.channels(); + size_t data_size = img.get_uint8_data().size(); MultimodalInput input(std::move(img)); EXPECT_FALSE(input.is_text()); EXPECT_TRUE(input.is_image()); EXPECT_EQ(input.get_type(), MultimodalInput::Type::IMAGE); - EXPECT_EQ(input.get_image().width, width); - EXPECT_EQ(input.get_image().height, height); - EXPECT_EQ(input.get_image().channels, channels); - EXPECT_EQ(input.get_image().data.size(), data_size); + EXPECT_EQ(input.get_image().width(), width); + EXPECT_EQ(input.get_image().height(), height); + EXPECT_EQ(input.get_image().channels(), channels); + EXPECT_EQ(input.get_image().get_uint8_data().size(), data_size); } // Test copy constructor and assignment @@ -129,10 +120,10 @@ TEST_F(MultimodalInputTest, CopyConstructorImage) { MultimodalInput copy(original); EXPECT_TRUE(copy.is_image()); - EXPECT_EQ(copy.get_image().width, 224); - EXPECT_EQ(copy.get_image().height, 224); - EXPECT_EQ(copy.get_image().channels, 3); - EXPECT_EQ(original.get_image().width, 224); // Original should be unchanged + EXPECT_EQ(copy.get_image().width(), 224); + EXPECT_EQ(copy.get_image().height(), 224); + EXPECT_EQ(copy.get_image().channels(), 3); + EXPECT_EQ(original.get_image().width(), 224); // Original should be unchanged } TEST_F(MultimodalInputTest, CopyAssignmentImage) { @@ -143,10 +134,10 @@ TEST_F(MultimodalInputTest, CopyAssignmentImage) { copy = original; EXPECT_TRUE(copy.is_image()); - EXPECT_EQ(copy.get_image().width, 224); - EXPECT_EQ(copy.get_image().height, 224); - EXPECT_EQ(copy.get_image().channels, 3); - EXPECT_EQ(original.get_image().width, 224); // Original should be unchanged + EXPECT_EQ(copy.get_image().width(), 224); + EXPECT_EQ(copy.get_image().height(), 224); + EXPECT_EQ(copy.get_image().channels(), 3); + EXPECT_EQ(original.get_image().width(), 224); // Original should be unchanged } // Test move constructor and assignment @@ -174,32 +165,32 @@ TEST_F(MultimodalInputTest, MoveAssignmentText) { TEST_F(MultimodalInputTest, MoveConstructorImage) { Image img = createTestImage(); - int width = img.width; - int height = img.height; - int channels = img.channels; + int width = img.width(); + int height = img.height(); + int channels = img.channels(); MultimodalInput original(std::move(img)); MultimodalInput moved(std::move(original)); EXPECT_TRUE(moved.is_image()); - EXPECT_EQ(moved.get_image().width, width); - EXPECT_EQ(moved.get_image().height, height); - EXPECT_EQ(moved.get_image().channels, channels); + EXPECT_EQ(moved.get_image().width(), width); + EXPECT_EQ(moved.get_image().height(), height); + EXPECT_EQ(moved.get_image().channels(), channels); } TEST_F(MultimodalInputTest, MoveAssignmentImage) { Image img = createTestImage(); - int width = img.width; - int height = img.height; - int channels = img.channels; + int width = img.width(); + int height = img.height(); + int channels = img.channels(); MultimodalInput original(std::move(img)); MultimodalInput moved(createTestText()); // Start with different type moved = std::move(original); EXPECT_TRUE(moved.is_image()); - EXPECT_EQ(moved.get_image().width, width); - EXPECT_EQ(moved.get_image().height, height); - EXPECT_EQ(moved.get_image().channels, channels); + EXPECT_EQ(moved.get_image().width(), width); + EXPECT_EQ(moved.get_image().height(), height); + EXPECT_EQ(moved.get_image().channels(), channels); } // Test getter methods with correct types @@ -227,16 +218,13 @@ TEST_F(MultimodalInputTest, GetImageWithImageInput) { // Test const lvalue reference version const MultimodalInput& const_input = input; - EXPECT_EQ(const_input.get_image().width, 224); - - // Test mutable lvalue reference version - Image& mutable_image = input.get_image(); - mutable_image.width = 448; - EXPECT_EQ(input.get_image().width, 448); + EXPECT_EQ(const_input.get_image().width(), 224); + EXPECT_EQ(const_input.get_image().height(), 224); + EXPECT_EQ(const_input.get_image().channels(), 3); // Test rvalue reference version Image moved_image = std::move(input).get_image(); - EXPECT_EQ(moved_image.width, 448); + EXPECT_EQ(moved_image.width(), 224); } // Test getter methods with wrong types (should throw) @@ -296,18 +284,14 @@ TEST_F(MultimodalInputTest, TryGetImageWithImageInput) { const MultimodalInput& const_input = input; const Image* image_ptr = const_input.try_get_image(); ASSERT_NE(image_ptr, nullptr); - EXPECT_EQ(image_ptr->width, 224); - EXPECT_EQ(image_ptr->height, 224); - EXPECT_EQ(image_ptr->channels, 3); + EXPECT_EQ(image_ptr->width(), 224); + EXPECT_EQ(image_ptr->height(), 224); + EXPECT_EQ(image_ptr->channels(), 3); // Test mutable version Image* mutable_image_ptr = input.try_get_image(); ASSERT_NE(mutable_image_ptr, nullptr); - EXPECT_EQ(mutable_image_ptr->width, 224); - - // Modify through pointer - mutable_image_ptr->width = 448; - EXPECT_EQ(input.get_image().width, 448); + EXPECT_EQ(mutable_image_ptr->width(), 224); } TEST_F(MultimodalInputTest, TryGetImageWithTextInput) { @@ -344,22 +328,22 @@ TEST_F(MultimodalInputTest, MakeImageInputFromImage) { MultimodalInput input = make_image_input(img); EXPECT_TRUE(input.is_image()); - EXPECT_EQ(input.get_image().width, 224); - EXPECT_EQ(input.get_image().height, 224); - EXPECT_EQ(input.get_image().channels, 3); + EXPECT_EQ(input.get_image().width(), 224); + EXPECT_EQ(input.get_image().height(), 224); + EXPECT_EQ(input.get_image().channels(), 3); } TEST_F(MultimodalInputTest, MakeImageInputFromRvalueImage) { Image img = createTestImage(); - int width = img.width; - int height = img.height; - int channels = img.channels; + int width = img.width(); + int height = img.height(); + int channels = img.channels(); MultimodalInput input = make_image_input(std::move(img)); EXPECT_TRUE(input.is_image()); - EXPECT_EQ(input.get_image().width, width); - EXPECT_EQ(input.get_image().height, height); - EXPECT_EQ(input.get_image().channels, channels); + EXPECT_EQ(input.get_image().width(), width); + EXPECT_EQ(input.get_image().height(), height); + EXPECT_EQ(input.get_image().channels(), channels); } // Test with different image sizes @@ -368,10 +352,10 @@ TEST_F(MultimodalInputTest, DifferentImageSizes) { MultimodalInput input(small_img); EXPECT_TRUE(input.is_image()); - EXPECT_EQ(input.get_image().width, 32); - EXPECT_EQ(input.get_image().height, 32); - EXPECT_EQ(input.get_image().channels, 1); - EXPECT_EQ(input.get_image().data.size(), 32 * 32); + EXPECT_EQ(input.get_image().width(), 32); + EXPECT_EQ(input.get_image().height(), 32); + EXPECT_EQ(input.get_image().channels(), 1); + EXPECT_EQ(input.get_image().get_uint8_data().size(), 32 * 32); } // Test with empty text @@ -424,11 +408,10 @@ TEST_F(MultimodalInputTest, AssignmentBetweenTypes) { // Assign image to text input input = MultimodalInput(img); EXPECT_TRUE(input.is_image()); - EXPECT_EQ(input.get_image().width, 224); + EXPECT_EQ(input.get_image().width(), 224); // Assign text back to image input input = MultimodalInput(text); EXPECT_TRUE(input.is_text()); EXPECT_EQ(input.get_text(), text); } -} // namespace