Commit 0ad3c71

More changes
1 parent c381116 commit 0ad3c71

4 files changed: +47 -22 lines changed

examples/models/llava/main.cpp

Lines changed: 1 addition & 2 deletions
@@ -131,8 +131,7 @@ int32_t main(int32_t argc, char** argv) {
 #endif
   // Load tokenizer
   std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
-      std::make_unique<tokenizers::Llama2cTokenizer>();
-  tokenizer->load(tokenizer_path);
+      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
   if (tokenizer == nullptr) {
     ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
     return 1;
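
Note on the hunk above: main.cpp no longer constructs a Llama2cTokenizer directly; it delegates to the shared LLM-runner helper and keeps the existing nullptr check as the failure path. A minimal caller sketch of that contract follows; the header path and the wrapper name tokenizer_loads are assumptions for illustration, only the nullptr-on-failure behavior is taken from the diff.

// Sketch of the contract the new call site relies on. The header path is an
// assumption; only load_tokenizer() returning nullptr on failure is from the diff.
#include <executorch/extension/llm/runner/llm_runner_helper.h>  // assumed location

#include <memory>
#include <string>

// Hypothetical helper: returns true only if the tokenizer file can be loaded.
bool tokenizer_loads(const std::string& tokenizer_path) {
  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
  // main.cpp logs an error and exits with code 1 when this is nullptr.
  return tokenizer != nullptr;
}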

extension/llm/runner/multimodal_prefiller.cpp

Lines changed: 11 additions & 4 deletions
@@ -47,21 +47,24 @@ Result<uint64_t> MultimodalPrefiller::prefill(
       "Failed to get method_meta for %s",
       kImageEncoderMethod);
 
-  ET_CHECK_MSG(
+  ET_CHECK_OR_RETURN_ERROR(
       method_meta.num_inputs() > 0,
+      InvalidArgument,
       "Image encoder should have at least 1 input");
   auto input_meta = ET_UNWRAP(
       method_meta.input_tensor_meta(0),
       "Cannot get input tensor meta at index 0");
   auto expected_dtype = input_meta.scalar_type();
 
   if (expected_dtype == ::executorch::aten::ScalarType::Float) {
-    ET_CHECK_MSG(
+    ET_CHECK_OR_RETURN_ERROR(
         image.is_float(),
+        InvalidArgument,
         "Model expects float image data, but image has uint8_t data.");
   } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
-    ET_CHECK_MSG(
+    ET_CHECK_OR_RETURN_ERROR(
         image.is_uint8(),
+        InvalidArgument,
         "Model expects uint8_t image data, but image has float data.");
   } else {
     ET_LOG(
@@ -77,7 +80,11 @@ Result<uint64_t> MultimodalPrefiller::prefill(
   auto image_tensor = ET_UNWRAP(
       image.toTensor(/*with_batch*/ expected_dims.size() == 4),
       "Failed to convert image to tensor");
-
+  ET_LOG(
+      Info,
+      "Image tensor dim: %zu, dtype: %s",
+      image_tensor->dim(),
+      ::executorch::runtime::toString(image_tensor->scalar_type()));
   // Run image encoder
   auto image_encoder_outputs =
       ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));
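
The validation changes in this file swap ET_CHECK_MSG, which aborts the process on failure, for ET_CHECK_OR_RETURN_ERROR, which logs and returns the given error code (InvalidArgument here) from the enclosing function. A minimal sketch of that pattern, assuming only the macro and the Result/Error types from the ExecuTorch runtime core headers:

// Sketch of the return-instead-of-abort pattern used in prefill(): when the
// condition fails, the macro returns Error::InvalidArgument from this function
// rather than terminating the process.
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/result.h>

using ::executorch::runtime::Result;

Result<int> checked_divide(int numerator, int denominator) {
  ET_CHECK_OR_RETURN_ERROR(
      denominator != 0,
      InvalidArgument,
      "Expected a non-zero denominator, got %d",
      denominator);
  return numerator / denominator;
}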

extension/llm/runner/multimodal_runner.cpp

Lines changed: 7 additions & 0 deletions
@@ -106,6 +106,13 @@ Error MultimodalRunner::generate(
   // Process multimodal inputs in order
   for (size_t i = 0; i < inputs.size(); ++i) {
     const MultimodalInput& input = inputs[i];
+    ET_LOG(
+        Info,
+        "Prefilling input %zu/%zu, type: %s",
+        i,
+        inputs.size(),
+        input.is_text() ? "text" : (input.is_image() ? "image" : "unknown"));
+    stats_->token_encode_end_ms = time_in_ms();
     if (config.echo && i == inputs.size() - 1 && input.is_text()) {
       wrapped_callback(input.get_text());
     }
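
The new log line dispatches on MultimodalInput's type accessors. A small sketch of the same dispatch factored into a helper; the header path and helper name are assumptions, the accessors are the ones used in the hunk above.

// Hypothetical helper mirroring the type dispatch in the log statement above;
// uses only the is_text()/is_image() accessors visible in this commit.
#include <executorch/extension/llm/runner/multimodal_input.h>  // assumed location

#include <string>

std::string input_type_name(
    const ::executorch::extension::llm::MultimodalInput& input) {
  if (input.is_text()) {
    return "text";
  }
  if (input.is_image()) {
    return "image";
  }
  return "unknown";  // any modality not covered by the two checks above
}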

extension/llm/runner/pybindings.cpp

Lines changed: 28 additions & 16 deletions
@@ -277,6 +277,14 @@ PYBIND11_MODULE(_llm_runner, m) {
             }
             return py::none();
           })
+      .def(
+          "get_image",
+          [](const MultimodalInput& input) -> py::object {
+            if (input.is_image()) {
+              return py::cast(input.get_image());
+            }
+            return py::none();
+          })
       .def("__repr__", [](const MultimodalInput& input) -> std::string {
         if (input.is_text()) {
           return "<MultimodalInput type=text content=\"" +
@@ -336,23 +344,27 @@ PYBIND11_MODULE(_llm_runner, m) {
           "Image must have 3 (RGB) or 4 (RGBA) channels");
     }
 
-    if (image_tensor.scalar_type() != torch::kUInt8) {
-      if (image_tensor.max().item<double>() <= 1.0) {
-        image_tensor = (image_tensor * 255).to(torch::kUInt8);
-      } else {
-        image_tensor = image_tensor.to(torch::kUInt8);
-      }
-    }
-
     image_tensor = image_tensor.contiguous();
-    uint8_t* data = image_tensor.data_ptr<uint8_t>();
-    std::vector<uint8_t> image_data(data, data + image_tensor.numel());
-
-    return MultimodalInput(Image(
-        std::move(image_data),
-        static_cast<int32_t>(width),
-        static_cast<int32_t>(height),
-        static_cast<int32_t>(channels)));
+    if (image_tensor.scalar_type() == torch::kUInt8) {
+      uint8_t* data = image_tensor.data_ptr<uint8_t>();
+      std::vector<uint8_t> image_data(data, data + image_tensor.numel());
+      return MultimodalInput(Image(
+          std::move(image_data),
+          static_cast<int32_t>(width),
+          static_cast<int32_t>(height),
+          static_cast<int32_t>(channels)));
+    } else if (image_tensor.scalar_type() == torch::kFloat) {
+      float* data = image_tensor.data_ptr<float>();
+      std::vector<float> image_data(data, data + image_tensor.numel());
+      return MultimodalInput(Image(
+          std::move(image_data),
+          static_cast<int32_t>(width),
+          static_cast<int32_t>(height),
+          static_cast<int32_t>(channels)));
+    } else {
+      throw std::runtime_error(
+          "Unsupported image tensor dtype. Only uint8 and float32 are supported.");
+    }
   },
   "Create an image input from a torch tensor (H, W, C), (1, H, W, C), (C, H, W), or (1, C, H, W)",
   py::arg("image_tensor"));
