Commit 0ad3c71

More changes
1 parent c381116 commit 0ad3c71

4 files changed: +47 -22 lines changed

examples/models/llava/main.cpp

Lines changed: 1 addition & 2 deletions
@@ -131,8 +131,7 @@ int32_t main(int32_t argc, char** argv) {
 #endif
   // Load tokenizer
   std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
-      std::make_unique<tokenizers::Llama2cTokenizer>();
-  tokenizer->load(tokenizer_path);
+      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
   if (tokenizer == nullptr) {
     ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
     return 1;
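
Note on the hunk above: main.cpp no longer constructs a Llama2cTokenizer directly; it delegates to the shared LLM-runner helper and keeps the existing nullptr check as the failure path. A minimal caller sketch of that contract follows; the header path and the wrapper name tokenizer_loads are assumptions for illustration, only the nullptr-on-failure behavior is taken from the diff.

// Sketch of the contract the new call site relies on. The header path is an
// assumption; only load_tokenizer() returning nullptr on failure is from the diff.
#include <executorch/extension/llm/runner/llm_runner_helper.h>  // assumed location

#include <memory>
#include <string>

// Hypothetical helper: returns true only if the tokenizer file can be loaded.
bool tokenizer_loads(const std::string& tokenizer_path) {
  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
  // main.cpp logs an error and exits with code 1 when this is nullptr.
  return tokenizer != nullptr;
}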

extension/llm/runner/multimodal_prefiller.cpp

Lines changed: 11 additions & 4 deletions
@@ -47,21 +47,24 @@ Result<uint64_t> MultimodalPrefiller::prefill(
       "Failed to get method_meta for %s",
       kImageEncoderMethod);
 
-  ET_CHECK_MSG(
+  ET_CHECK_OR_RETURN_ERROR(
       method_meta.num_inputs() > 0,
+      InvalidArgument,
       "Image encoder should have at least 1 input");
   auto input_meta = ET_UNWRAP(
       method_meta.input_tensor_meta(0),
       "Cannot get input tensor meta at index 0");
   auto expected_dtype = input_meta.scalar_type();
 
   if (expected_dtype == ::executorch::aten::ScalarType::Float) {
-    ET_CHECK_MSG(
+    ET_CHECK_OR_RETURN_ERROR(
         image.is_float(),
+        InvalidArgument,
         "Model expects float image data, but image has uint8_t data.");
   } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
-    ET_CHECK_MSG(
+    ET_CHECK_OR_RETURN_ERROR(
         image.is_uint8(),
+        InvalidArgument,
         "Model expects uint8_t image data, but image has float data.");
   } else {
     ET_LOG(
@@ -77,7 +80,11 @@ Result<uint64_t> MultimodalPrefiller::prefill(
   auto image_tensor = ET_UNWRAP(
       image.toTensor(/*with_batch*/ expected_dims.size() == 4),
       "Failed to convert image to tensor");
-
+  ET_LOG(
+      Info,
+      "Image tensor dim: %zu, dtype: %s",
+      image_tensor->dim(),
+      ::executorch::runtime::toString(image_tensor->scalar_type()));
   // Run image encoder
   auto image_encoder_outputs =
       ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));
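
The validation changes in this file swap ET_CHECK_MSG, which aborts the process on failure, for ET_CHECK_OR_RETURN_ERROR, which logs and returns the given error code (InvalidArgument here) from the enclosing function. A minimal sketch of that pattern, assuming only the macro and the Result/Error types from the ExecuTorch runtime core headers:

// Sketch of the return-instead-of-abort pattern used in prefill(): when the
// condition fails, the macro returns Error::InvalidArgument from this function
// rather than terminating the process.
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/result.h>

using ::executorch::runtime::Result;

Result<int> checked_divide(int numerator, int denominator) {
  ET_CHECK_OR_RETURN_ERROR(
      denominator != 0,
      InvalidArgument,
      "Expected a non-zero denominator, got %d",
      denominator);
  return numerator / denominator;
}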

extension/llm/runner/multimodal_runner.cpp

Lines changed: 7 additions & 0 deletions
@@ -106,6 +106,13 @@ Error MultimodalRunner::generate(
   // Process multimodal inputs in order
   for (size_t i = 0; i < inputs.size(); ++i) {
     const MultimodalInput& input = inputs[i];
+    ET_LOG(
+        Info,
+        "Prefilling input %zu/%zu, type: %s",
+        i,
+        inputs.size(),
+        input.is_text() ? "text" : (input.is_image() ? "image" : "unknown"));
+    stats_->token_encode_end_ms = time_in_ms();
     if (config.echo && i == inputs.size() - 1 && input.is_text()) {
       wrapped_callback(input.get_text());
     }
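
The new log line dispatches on MultimodalInput's type accessors. A small sketch of the same dispatch factored into a helper; the header path and helper name are assumptions, the accessors are the ones used in the hunk above.

// Hypothetical helper mirroring the type dispatch in the log statement above;
// uses only the is_text()/is_image() accessors visible in this commit.
#include <executorch/extension/llm/runner/multimodal_input.h>  // assumed location

#include <string>

std::string input_type_name(
    const ::executorch::extension::llm::MultimodalInput& input) {
  if (input.is_text()) {
    return "text";
  }
  if (input.is_image()) {
    return "image";
  }
  return "unknown";  // any modality not covered by the two checks above
}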

extension/llm/runner/pybindings.cpp

Lines changed: 28 additions & 16 deletions
@@ -277,6 +277,14 @@ PYBIND11_MODULE(_llm_runner, m) {
             }
             return py::none();
           })
+      .def(
+          "get_image",
+          [](const MultimodalInput& input) -> py::object {
+            if (input.is_image()) {
+              return py::cast(input.get_image());
+            }
+            return py::none();
+          })
       .def("__repr__", [](const MultimodalInput& input) -> std::string {
         if (input.is_text()) {
           return "<MultimodalInput type=text content=\"" +
@@ -336,23 +344,27 @@ PYBIND11_MODULE(_llm_runner, m) {
           "Image must have 3 (RGB) or 4 (RGBA) channels");
     }
 
-    if (image_tensor.scalar_type() != torch::kUInt8) {
-      if (image_tensor.max().item<double>() <= 1.0) {
-        image_tensor = (image_tensor * 255).to(torch::kUInt8);
-      } else {
-        image_tensor = image_tensor.to(torch::kUInt8);
-      }
-    }
-
     image_tensor = image_tensor.contiguous();
-    uint8_t* data = image_tensor.data_ptr<uint8_t>();
-    std::vector<uint8_t> image_data(data, data + image_tensor.numel());
-
-    return MultimodalInput(Image(
-        std::move(image_data),
-        static_cast<int32_t>(width),
-        static_cast<int32_t>(height),
-        static_cast<int32_t>(channels)));
+    if (image_tensor.scalar_type() == torch::kUInt8) {
+      uint8_t* data = image_tensor.data_ptr<uint8_t>();
+      std::vector<uint8_t> image_data(data, data + image_tensor.numel());
+      return MultimodalInput(Image(
+          std::move(image_data),
+          static_cast<int32_t>(width),
+          static_cast<int32_t>(height),
+          static_cast<int32_t>(channels)));
+    } else if (image_tensor.scalar_type() == torch::kFloat) {
+      float* data = image_tensor.data_ptr<float>();
+      std::vector<float> image_data(data, data + image_tensor.numel());
+      return MultimodalInput(Image(
+          std::move(image_data),
+          static_cast<int32_t>(width),
+          static_cast<int32_t>(height),
+          static_cast<int32_t>(channels)));
+    } else {
+      throw std::runtime_error(
+          "Unsupported image tensor dtype. Only uint8 and float32 are supported.");
+    }
   },
   "Create an image input from a torch tensor (H, W, C), (1, H, W, C), (C, H, W), or (1, C, H, W)",
   py::arg("image_tensor"));
