Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 6 additions & 10 deletions examples/models/llava/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,24 +81,20 @@ void load_image(const std::string& image_path, Image& image) {
new_height,
0,
channels);
// transpose to CHW
image.data.resize(channels * new_width * new_height);
std::vector<uint8_t> chw_data(channels * new_width * new_height);
for (int i = 0; i < new_width * new_height; ++i) {
for (int c = 0; c < channels; ++c) {
image.data[c * new_width * new_height + i] =
resized_data[i * channels + c];
chw_data[c * new_width * new_height + i] = resized_data[i * channels + c];
}
}
image.width = new_width;
image.height = new_height;
image.channels = channels;
image = Image(std::move(chw_data), new_width, new_height, channels);
// convert to tensor
ET_LOG(
Info,
"image Channels: %" PRId32 ", Height: %" PRId32 ", Width: %" PRId32,
image.channels,
image.height,
image.width);
image.channels(),
image.height(),
image.width());
stbi_image_free(data);
}

Expand Down
2 changes: 1 addition & 1 deletion extension/android/jni/jni_layer_llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
for (int i = 0; i < image_size; i++) {
image_data[i] = image_data_jint[i];
}
llm::Image image_runner{image_data, width, height, channels};
llm::Image image_runner{std::move(image_data), width, height, channels};
prefill_inputs_.emplace_back(
llm::MultimodalInput{std::move(image_runner)});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,12 +172,12 @@ - (BOOL)generate:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
case ExecuTorchLLMMultimodalInputTypeImage: {
ExecuTorchLLMImage *image = input.image;
std::vector<uint8_t> data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length);
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image{
.data = std::move(data),
.width = (int32_t)image.width,
.height = (int32_t)image.height,
.channels = (int32_t)image.channels
}));
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
std::move(data),
(int32_t)image.width,
(int32_t)image.height,
(int32_t)image.channels
)));
break;
}
default: {
Expand Down
103 changes: 98 additions & 5 deletions extension/llm/runner/image.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,112 @@

#pragma once
#include <executorch/runtime/platform/compiler.h>
#include <cstddef>
#include <cstdint>
#include <variant>
#include <vector>

#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>

namespace executorch {
namespace extension {
namespace llm {

struct ET_EXPERIMENTAL Image {
class ET_EXPERIMENTAL Image {
public:
// Creates an empty image: all three dimensions are zero and data_ is left
// default-constructed, which for std::variant means an empty vector of the
// first listed alternative.
Image() : width_{0}, height_{0}, channels_{0} {}

// Builds an image backed by uint8_t values. The vector is moved into the
// image; width, height and channels are stored exactly as given (no check
// that they agree with data.size() is made here).
Image(
    std::vector<uint8_t>&& data,
    int32_t width,
    int32_t height,
    int32_t channels)
    : data_{std::in_place_type<std::vector<uint8_t>>, std::move(data)},
      width_{width},
      height_{height},
      channels_{channels} {}

// Builds an image backed by float values. The vector is moved into the
// image; width, height and channels are stored exactly as given (no check
// that they agree with data.size() is made here).
Image(
    std::vector<float>&& data,
    int32_t width,
    int32_t height,
    int32_t channels)
    : data_{std::in_place_type<std::vector<float>>, std::move(data)},
      width_{width},
      height_{height},
      channels_{channels} {}

// Getters
// Returns the width stored at construction (0 for a default-constructed
// Image).
int32_t width() const { return width_; }
// Returns the height stored at construction (0 for a default-constructed
// Image).
int32_t height() const { return height_; }
// Returns the channel count stored at construction (0 for a
// default-constructed Image).
int32_t channels() const { return channels_; }

// Data access
// True when the stored buffer is the std::vector<uint8_t> alternative.
bool is_uint8() const {
  using ByteBuffer = std::vector<uint8_t>;
  return std::get_if<ByteBuffer>(&data_) != nullptr;
}

// True when the stored buffer is the std::vector<float> alternative.
bool is_float() const {
  using FloatBuffer = std::vector<float>;
  return std::get_if<FloatBuffer>(&data_) != nullptr;
}

// Read-only view of the uint8_t buffer. std::get throws
// std::bad_variant_access when the image holds float data instead, so check
// is_uint8() first.
const std::vector<uint8_t>& get_uint8_data() const& {
  using ByteBuffer = std::vector<uint8_t>;
  return std::get<ByteBuffer>(data_);
}

// Mutable view of the uint8_t buffer; only callable on lvalue images.
// std::get throws std::bad_variant_access when the image holds float data
// instead, so check is_uint8() first.
std::vector<uint8_t>& get_uint8_data() & {
  using ByteBuffer = std::vector<uint8_t>;
  return std::get<ByteBuffer>(data_);
}

// Read-only view of the float buffer. std::get throws
// std::bad_variant_access when the image holds uint8_t data instead, so
// check is_float() first.
const std::vector<float>& get_float_data() const& {
  using FloatBuffer = std::vector<float>;
  return std::get<FloatBuffer>(data_);
}

// Mutable view of the float buffer; only callable on lvalue images.
// std::get throws std::bad_variant_access when the image holds uint8_t data
// instead, so check is_float() first.
std::vector<float>& get_float_data() & {
  using FloatBuffer = std::vector<float>;
  return std::get<FloatBuffer>(data_);
}

// Builds a tensor over this image's buffer.
//
// The shape is {channels, height, width}. When `with_batch` is true a leading
// dimension of 1 is prepended, giving {1, channels, height, width}. The dtype
// is Float for float data and Byte for uint8_t data.
//
// NOTE(review): from_blob() is handed a pointer into this Image's storage, so
// the tensor presumably aliases the buffer rather than copying it; keep the
// Image alive (and unmodified in size) while the tensor is in use -- confirm
// against the from_blob documentation.
//
// Returns:
//   - the tensor on success;
//   - Error::NotSupported when the variant holds neither vector type;
//   - Error::InvalidArgument when a dimension is negative or the buffer holds
//     fewer than channels * height * width elements.
executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
    bool with_batch = false) const {
  const bool holds_float = is_float();
  if (!holds_float && !is_uint8()) {
    ET_LOG(
        Error, "Image data is not initialized with uint8_t or float vector.");
    return ::executorch::runtime::Error::NotSupported;
  }

  if (channels() < 0 || height() < 0 || width() < 0) {
    ET_LOG(
        Error,
        "Image has a negative dimension: channels %d, height %d, width %d",
        static_cast<int>(channels()),
        static_cast<int>(height()),
        static_cast<int>(width()));
    return ::executorch::runtime::Error::InvalidArgument;
  }

  // Widen before multiplying so realistic image sizes cannot overflow the
  // 32-bit dimension type.
  const uint64_t required_elems = static_cast<uint64_t>(channels()) *
      static_cast<uint64_t>(height()) * static_cast<uint64_t>(width());
  const size_t available_elems =
      holds_float ? get_float_data().size() : get_uint8_data().size();
  // A buffer that is too small would make the tensor's shape describe memory
  // past the end of the vector, so refuse it here instead.
  if (static_cast<uint64_t>(available_elems) < required_elems) {
    ET_LOG(
        Error,
        "Image buffer holds %zu elements, too few for channels %d x height %d x width %d",
        available_elems,
        static_cast<int>(channels()),
        static_cast<int>(height()),
        static_cast<int>(width()));
    return ::executorch::runtime::Error::InvalidArgument;
  }

  std::vector<executorch::aten::SizesType> sizes = {
      channels(), height(), width()};
  if (with_batch) {
    sizes.insert(sizes.begin(), 1);
  }

  // from_blob takes a mutable pointer; the const_cast only satisfies that
  // signature, this method itself does not write through it.
  if (holds_float) {
    return executorch::extension::from_blob(
        const_cast<float*>(get_float_data().data()),
        sizes,
        ::executorch::aten::ScalarType::Float);
  }
  return executorch::extension::from_blob(
      const_cast<uint8_t*>(get_uint8_data().data()),
      sizes,
      ::executorch::aten::ScalarType::Byte);
}

private:
// Assuming NCHW format
std::vector<uint8_t> data;
int32_t width;
int32_t height;
int32_t channels;
std::variant<std::vector<uint8_t>, std::vector<float>> data_;
int32_t width_;
int32_t height_;
int32_t channels_;
};

} // namespace llm
Expand Down
40 changes: 36 additions & 4 deletions extension/llm/runner/multimodal_prefiller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,42 @@ Result<uint64_t> MultimodalPrefiller::prefill(
::executorch::runtime::EValue encoder_output;
if (input.is_image()) {
Image image = input.get_image();
auto image_tensor = executorch::extension::from_blob(
image.data.data(),
{3, image.height, image.width},
::executorch::aten::ScalarType::Byte);

auto method_meta = ET_UNWRAP(
module_->method_meta(kImageEncoderMethod),
"Failed to get method_meta for %s",
kImageEncoderMethod);

ET_CHECK_MSG(
method_meta.num_inputs() > 0,
"Image encoder should have at least 1 input");
auto input_meta = ET_UNWRAP(
method_meta.input_tensor_meta(0),
"Cannot get input tensor meta at index 0");
auto expected_dtype = input_meta.scalar_type();

if (expected_dtype == ::executorch::aten::ScalarType::Float) {
ET_CHECK_MSG(
image.is_float(),
"Model expects float image data, but image has uint8_t data.");
} else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
ET_CHECK_MSG(
image.is_uint8(),
"Model expects uint8_t image data, but image has float data.");
} else {
ET_LOG(
Error,
"Unsupported image encoder input dtype: %s",
::executorch::runtime::toString(expected_dtype));
return ::executorch::runtime::Error::NotSupported;
}

// The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D
// tensor (CHW). Add a batch dimension of 1 if needed.
auto expected_dims = input_meta.sizes();
auto image_tensor = ET_UNWRAP(
image.toTensor(/*with_batch*/ expected_dims.size() == 4),
"Failed to convert image to tensor");

// Run image encoder
auto image_encoder_outputs =
Expand Down
Loading
Loading