Skip to content

Commit de023c1

Browse files
larryliu0820
authored and pytorchbot committed
[multimodal] Allow float32 image input (#14359)
Letting the `Image` class support both `uint8_t` and `float` data types, and changing the `MultimodalPrefiller` class to support text, image, and audio modalities with error checking and modularity. **Image Data Handling and Type Safety:** * Refactored the `Image` class in `image.h` from a simple struct to a class that uses a `std::variant` to support both `uint8_t` and `float` image data, providing type-safe accessors and a `toTensor` method for conversion to tensors. * Updated `load_image` in Llava `main.cpp` to construct `Image` objects using the new class interface and move semantics, ensuring correct data layout and encapsulation. * Added a runtime check in `LlavaImagePrefiller` to ensure only `uint8_t` images are processed, using the new type-checking methods. **Multimodal Prefill Logic and Flexibility:** * Updated the `MultimodalPrefiller` class in `multimodal_prefiller.h` to dynamically check input types, validate tensor types against model expectations, and handle encoder/decoder execution with improved error handling and modularity. (cherry picked from commit bc18834)
1 parent 573f30d commit de023c1

File tree

6 files changed

+171
-99
lines changed

6 files changed

+171
-99
lines changed

examples/models/llava/main.cpp

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -75,24 +75,20 @@ void load_image(const std::string& image_path, Image& image) {
7575
new_height,
7676
0,
7777
channels);
78-
// transpose to CHW
79-
image.data.resize(channels * new_width * new_height);
78+
std::vector<uint8_t> chw_data(channels * new_width * new_height);
8079
for (int i = 0; i < new_width * new_height; ++i) {
8180
for (int c = 0; c < channels; ++c) {
82-
image.data[c * new_width * new_height + i] =
83-
resized_data[i * channels + c];
81+
chw_data[c * new_width * new_height + i] = resized_data[i * channels + c];
8482
}
8583
}
86-
image.width = new_width;
87-
image.height = new_height;
88-
image.channels = channels;
84+
image = Image(std::move(chw_data), new_width, new_height, channels);
8985
// convert to tensor
9086
ET_LOG(
9187
Info,
9288
"image Channels: %" PRId32 ", Height: %" PRId32 ", Width: %" PRId32,
93-
image.channels,
94-
image.height,
95-
image.width);
89+
image.channels(),
90+
image.height(),
91+
image.width());
9692
stbi_image_free(data);
9793
}
9894

extension/android/jni/jni_layer_llama.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
268268
for (int i = 0; i < image_size; i++) {
269269
image_data[i] = image_data_jint[i];
270270
}
271-
llm::Image image_runner{image_data, width, height, channels};
271+
llm::Image image_runner{std::move(image_data), width, height, channels};
272272
prefill_inputs_.emplace_back(
273273
llm::MultimodalInput{std::move(image_runner)});
274274
}

extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -172,12 +172,12 @@ - (BOOL)generate:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
172172
case ExecuTorchLLMMultimodalInputTypeImage: {
173173
ExecuTorchLLMImage *image = input.image;
174174
std::vector<uint8_t> data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length);
175-
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image{
176-
.data = std::move(data),
177-
.width = (int32_t)image.width,
178-
.height = (int32_t)image.height,
179-
.channels = (int32_t)image.channels
180-
}));
175+
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
176+
std::move(data),
177+
(int32_t)image.width,
178+
(int32_t)image.height,
179+
(int32_t)image.channels
180+
)));
181181
break;
182182
}
183183
default: {

extension/llm/runner/image.h

Lines changed: 98 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,112 @@
1010

1111
#pragma once
1212
#include <executorch/runtime/platform/compiler.h>
13+
#include <cstddef>
1314
#include <cstdint>
15+
#include <variant>
1416
#include <vector>
1517

18+
#include <executorch/extension/tensor/tensor.h>
19+
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
20+
1621
namespace executorch {
1722
namespace extension {
1823
namespace llm {
1924

20-
struct ET_EXPERIMENTAL Image {
25+
class ET_EXPERIMENTAL Image {
26+
public:
27+
// Default constructor
28+
Image() : width_(0), height_(0), channels_(0) {}
29+
30+
// Constructor for uint8_t data
31+
Image(
32+
std::vector<uint8_t>&& data,
33+
int32_t width,
34+
int32_t height,
35+
int32_t channels)
36+
: data_(std::move(data)),
37+
width_(width),
38+
height_(height),
39+
channels_(channels) {}
40+
41+
// Constructor for float data
42+
Image(
43+
std::vector<float>&& data,
44+
int32_t width,
45+
int32_t height,
46+
int32_t channels)
47+
: data_(std::move(data)),
48+
width_(width),
49+
height_(height),
50+
channels_(channels) {}
51+
52+
// Getters
53+
int32_t width() const {
54+
return width_;
55+
}
56+
int32_t height() const {
57+
return height_;
58+
}
59+
int32_t channels() const {
60+
return channels_;
61+
}
62+
63+
// Data access
64+
bool is_uint8() const {
65+
return std::holds_alternative<std::vector<uint8_t>>(data_);
66+
}
67+
68+
bool is_float() const {
69+
return std::holds_alternative<std::vector<float>>(data_);
70+
}
71+
72+
const std::vector<uint8_t>& get_uint8_data() const& {
73+
return std::get<std::vector<uint8_t>>(data_);
74+
}
75+
76+
std::vector<uint8_t>& get_uint8_data() & {
77+
return std::get<std::vector<uint8_t>>(data_);
78+
}
79+
80+
const std::vector<float>& get_float_data() const& {
81+
return std::get<std::vector<float>>(data_);
82+
}
83+
84+
std::vector<float>& get_float_data() & {
85+
return std::get<std::vector<float>>(data_);
86+
}
87+
88+
executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
89+
bool with_batch = false) const {
90+
// Note: This creates a 3D tensor (CHW). The model might expect a 4D
91+
// tensor (NCHW). The caller should handle reshaping if needed.
92+
std::vector<executorch::aten::SizesType> sizes = {
93+
channels(), height(), width()};
94+
if (with_batch) {
95+
sizes.insert(sizes.begin(), 1);
96+
}
97+
if (is_float()) {
98+
return executorch::extension::from_blob(
99+
const_cast<float*>(get_float_data().data()),
100+
sizes,
101+
::executorch::aten::ScalarType::Float);
102+
} else if (is_uint8()) {
103+
return executorch::extension::from_blob(
104+
const_cast<uint8_t*>(get_uint8_data().data()),
105+
sizes,
106+
::executorch::aten::ScalarType::Byte);
107+
}
108+
ET_LOG(
109+
Error, "Image data is not initialized with uint8_t or float vector.");
110+
return ::executorch::runtime::Error::NotSupported;
111+
}
112+
113+
private:
21114
// Assuming NCHW format
22-
std::vector<uint8_t> data;
23-
int32_t width;
24-
int32_t height;
25-
int32_t channels;
115+
std::variant<std::vector<uint8_t>, std::vector<float>> data_;
116+
int32_t width_;
117+
int32_t height_;
118+
int32_t channels_;
26119
};
27120

28121
} // namespace llm

extension/llm/runner/multimodal_prefiller.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ Result<uint64_t> MultimodalPrefiller::prefill(
4343
Image image = input.get_image();
4444

4545
auto method_meta = ET_UNWRAP(
46-
module_->method_meta(kVisionEncoderMethod),
46+
module_->method_meta(kImageEncoderMethod),
4747
"Failed to get method_meta for %s",
48-
kVisionEncoderMethod);
48+
kImageEncoderMethod);
4949

5050
ET_CHECK_MSG(
5151
method_meta.num_inputs() > 0,

0 commit comments

Comments (0)