Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 6 additions & 10 deletions examples/models/llava/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,24 +75,20 @@ void load_image(const std::string& image_path, Image& image) {
new_height,
0,
channels);
// transpose to CHW
image.data.resize(channels * new_width * new_height);
std::vector<uint8_t> chw_data(channels * new_width * new_height);
for (int i = 0; i < new_width * new_height; ++i) {
for (int c = 0; c < channels; ++c) {
image.data[c * new_width * new_height + i] =
resized_data[i * channels + c];
chw_data[c * new_width * new_height + i] = resized_data[i * channels + c];
}
}
image.width = new_width;
image.height = new_height;
image.channels = channels;
image = Image(std::move(chw_data), new_width, new_height, channels);
// convert to tensor
ET_LOG(
Info,
"image Channels: %" PRId32 ", Height: %" PRId32 ", Width: %" PRId32,
image.channels,
image.height,
image.width);
image.channels(),
image.height(),
image.width());
stbi_image_free(data);
}

Expand Down
16 changes: 8 additions & 8 deletions examples/models/llava/runner/llava_image_prefiller.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

namespace example {

using executorch::extension::llm::kImageEncoderMethod;
using executorch::extension::llm::kTextModelMethod;
using executorch::extension::llm::kVisionEncoderMethod;

class ET_EXPERIMENTAL LlavaImagePrefiller {
public:
Expand All @@ -34,12 +34,12 @@ class ET_EXPERIMENTAL LlavaImagePrefiller {
::executorch::extension::llm::Image& image,
int64_t& start_pos) {
auto image_tensor = executorch::extension::from_blob(
image.data.data(),
{3, image.height, image.width},
image.get_uint8_data().data(),
{3, image.height(), image.width()},
::executorch::aten::ScalarType::Byte);
// Run image encoder
auto image_encoder_outputs =
ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));
ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));

// inputs:[start_pos, embeds]
auto start_pos_tensor = executorch::extension::from_blob(
Expand Down Expand Up @@ -67,7 +67,7 @@ class ET_EXPERIMENTAL LlavaImagePrefiller {
if (is_method_loaded()) {
return ::executorch::runtime::Error::Ok;
}
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod));
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kVisionEncoderMethod));
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod));
return ::executorch::runtime::Error::Ok;
}
Expand All @@ -83,7 +83,7 @@ class ET_EXPERIMENTAL LlavaImagePrefiller {
ET_CHECK_MSG(false, "Failed to get method names");
}
std::unordered_set<std::string> methods = methods_res.get();
bool methods_exist = methods.find(kImageEncoderMethod) != methods.end() &&
bool methods_exist = methods.find(kVisionEncoderMethod) != methods.end() &&
methods.find(kTextModelMethod) != methods.end();
if (!methods_exist) {
for (const auto& method : methods) {
Expand All @@ -92,10 +92,10 @@ class ET_EXPERIMENTAL LlavaImagePrefiller {
ET_CHECK_MSG(
methods_exist,
"Missing required methods (%s, %s) in the model",
kImageEncoderMethod,
kVisionEncoderMethod,
kTextModelMethod);
}
bool methods_loaded = module_->is_method_loaded(kImageEncoderMethod) &&
bool methods_loaded = module_->is_method_loaded(kVisionEncoderMethod) &&
module_->is_method_loaded(kTextModelMethod);
return methods_loaded;
}
Expand Down
2 changes: 1 addition & 1 deletion extension/android/jni/jni_layer_llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
for (int i = 0; i < image_size; i++) {
image_data[i] = image_data_jint[i];
}
llm::Image image_runner{image_data, width, height, channels};
llm::Image image_runner{std::move(image_data), width, height, channels};
prefill_inputs_.emplace_back(
llm::MultimodalInput{std::move(image_runner)});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,12 +172,12 @@ - (BOOL)generate:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
case ExecuTorchLLMMultimodalInputTypeImage: {
ExecuTorchLLMImage *image = input.image;
std::vector<uint8_t> data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length);
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image{
.data = std::move(data),
.width = (int32_t)image.width,
.height = (int32_t)image.height,
.channels = (int32_t)image.channels
}));
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
std::move(data),
(int32_t)image.width,
(int32_t)image.height,
(int32_t)image.channels
)));
break;
}
default: {
Expand Down
103 changes: 98 additions & 5 deletions extension/llm/runner/image.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,112 @@

#pragma once
#include <executorch/runtime/platform/compiler.h>
#include <cstddef>
#include <cstdint>
#include <variant>
#include <vector>

#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>

namespace executorch {
namespace extension {
namespace llm {

struct ET_EXPERIMENTAL Image {
class ET_EXPERIMENTAL Image {
public:
// Creates an empty image: every dimension is zero and the pixel storage is
// default-constructed.
Image() : data_{}, width_{0}, height_{0}, channels_{0} {}

// Builds an image that takes ownership of a uint8_t pixel buffer.
// `data` is moved into the image, leaving the caller's vector in a moved-from
// state. `width`, `height` and `channels` are stored as given; they are not
// checked against the length of `data`.
Image(
    std::vector<uint8_t>&& data,
    int32_t width,
    int32_t height,
    int32_t channels)
    : data_{std::move(data)},
      width_{width},
      height_{height},
      channels_{channels} {}

// Builds an image that takes ownership of a float pixel buffer.
// `data` is moved into the image, leaving the caller's vector in a moved-from
// state. `width`, `height` and `channels` are stored as given; they are not
// checked against the length of `data`.
Image(
    std::vector<float>&& data,
    int32_t width,
    int32_t height,
    int32_t channels)
    : data_{std::move(data)},
      width_{width},
      height_{height},
      channels_{channels} {}

// Getters
// Each accessor returns the dimension exactly as it was passed to the
// constructor (0 for a default-constructed image). They only read a member,
// so they are marked noexcept.
int32_t width() const noexcept {
  return width_;
}
int32_t height() const noexcept {
  return height_;
}
int32_t channels() const noexcept {
  return channels_;
}

// Data access
// Returns true when the pixel buffer currently stored is a
// std::vector<uint8_t>. Never throws.
bool is_uint8() const noexcept {
  return std::holds_alternative<std::vector<uint8_t>>(data_);
}

// Returns true when the pixel buffer currently stored is a
// std::vector<float>. Never throws.
bool is_float() const noexcept {
  return std::holds_alternative<std::vector<float>>(data_);
}

// Read-only access to the uint8_t pixel buffer.
// Check is_uint8() first: std::get throws std::bad_variant_access when the
// image holds float data instead.
const std::vector<uint8_t>& get_uint8_data() const& {
  using ByteBuffer = std::vector<uint8_t>;
  const ByteBuffer& bytes = std::get<ByteBuffer>(data_);
  return bytes;
}

// Mutable access to the uint8_t pixel buffer; callable on lvalues only.
// Same precondition as the const overload.
std::vector<uint8_t>& get_uint8_data() & {
  using ByteBuffer = std::vector<uint8_t>;
  ByteBuffer& bytes = std::get<ByteBuffer>(data_);
  return bytes;
}

// Read-only access to the float pixel buffer.
// Check is_float() first: std::get throws std::bad_variant_access when the
// image holds uint8_t data instead.
const std::vector<float>& get_float_data() const& {
  using FloatBuffer = std::vector<float>;
  const FloatBuffer& values = std::get<FloatBuffer>(data_);
  return values;
}

// Mutable access to the float pixel buffer; callable on lvalues only.
// Same precondition as the const overload.
std::vector<float>& get_float_data() & {
  using FloatBuffer = std::vector<float>;
  FloatBuffer& values = std::get<FloatBuffer>(data_);
  return values;
}

// Wraps the image's pixel buffer in a tensor without copying it.
//
// The tensor has sizes {channels, height, width}; when `with_batch` is true a
// leading batch dimension of 1 is prepended, giving
// {1, channels, height, width}. The dtype is Float for float data and Byte
// for uint8_t data.
//
// from_blob() only references the vector's storage, so this Image must
// outlive the returned tensor. The const_cast is required because from_blob()
// takes a mutable pointer; callers must not write through a tensor obtained
// from a const Image.
//
// Returns Error::InvalidArgument when a dimension is negative or the buffer
// holds fewer elements than channels * height * width (the tensor would
// otherwise read past the end of the vector), and Error::NotSupported when
// the image holds neither buffer type.
executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
    bool with_batch = false) const {
  if (channels() < 0 || height() < 0 || width() < 0) {
    ET_LOG(
        Error,
        "Image has a negative dimension (channels %d, height %d, width %d).",
        static_cast<int>(channels()),
        static_cast<int>(height()),
        static_cast<int>(width()));
    return ::executorch::runtime::Error::InvalidArgument;
  }

  // True when `available` elements cover channels * height * width. The
  // comparison divides instead of multiplying all three so it cannot overflow.
  const auto covers_shape = [this](size_t available) {
    const uint64_t plane =
        static_cast<uint64_t>(height()) * static_cast<uint64_t>(width());
    const uint64_t depth = static_cast<uint64_t>(channels());
    return depth == 0 || plane <= static_cast<uint64_t>(available) / depth;
  };

  std::vector<executorch::aten::SizesType> sizes = {
      channels(), height(), width()};
  if (with_batch) {
    sizes.insert(sizes.begin(), 1);
  }

  if (is_float()) {
    const std::vector<float>& pixels = get_float_data();
    if (!covers_shape(pixels.size())) {
      ET_LOG(
          Error,
          "Image float data is smaller than channels * height * width.");
      return ::executorch::runtime::Error::InvalidArgument;
    }
    return executorch::extension::from_blob(
        const_cast<float*>(pixels.data()),
        sizes,
        ::executorch::aten::ScalarType::Float);
  }
  if (is_uint8()) {
    const std::vector<uint8_t>& pixels = get_uint8_data();
    if (!covers_shape(pixels.size())) {
      ET_LOG(
          Error,
          "Image uint8_t data is smaller than channels * height * width.");
      return ::executorch::runtime::Error::InvalidArgument;
    }
    return executorch::extension::from_blob(
        const_cast<uint8_t*>(pixels.data()),
        sizes,
        ::executorch::aten::ScalarType::Byte);
  }
  ET_LOG(
      Error, "Image data is not initialized with uint8_t or float vector.");
  return ::executorch::runtime::Error::NotSupported;
}

private:
// Pixel data in CHW order; toTensor() can add the leading batch (N) dimension.
std::vector<uint8_t> data;
int32_t width;
int32_t height;
int32_t channels;
std::variant<std::vector<uint8_t>, std::vector<float>> data_;
int32_t width_;
int32_t height_;
int32_t channels_;
};

} // namespace llm
Expand Down
Loading
Loading