make_image_input: accept a torch.Tensor instead of a numpy array

larryliu0820 · larryliu0820 · commit 071a7b3f0df7 · 2025-09-17T15:29:04.000-07:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -650,10 +650,5 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM)
   list(APPEND _executorch_extensions tokenizers)
 endif()
 
-if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
-  list(APPEND _executorch_extensions extension_llm_runner)
-endif()
-
 if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple)
@@ -889,6 +884,11 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
   list(APPEND _executorch_extensions extension_training)
 endif()
 
+if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
+  list(APPEND _executorch_extensions extension_llm_runner)
+endif()
+
 if(EXECUTORCH_BUILD_KERNELS_LLM)
   # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops)
diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
@@ -87,10 +87,16 @@ if(EXECUTORCH_BUILD_PYBIND)
     _llm_runner SHARED ${CMAKE_CURRENT_SOURCE_DIR}/pybindings.cpp
   )
 
+  # pybindings.cpp takes torch::Tensor arguments via <torch/python.h>, so
+  # locate libtorch_python and link it explicitly instead of relying on it
+  # being pulled in transitively.
+  find_package_torch()
+  find_library(
+    TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib"
+  )
   # Link with the extension_llm_runner library and its dependencies
   target_link_libraries(
-    _llm_runner PRIVATE extension_llm_runner executorch_core extension_module
-                        extension_tensor tokenizers::tokenizers
+    _llm_runner PRIVATE extension_llm_runner tokenizers::tokenizers portable_lib
+                        ${TORCH_PYTHON_LIBRARY}
   )
 
   # Set properties for the Python extension
@@ -102,7 +108,9 @@ if(EXECUTORCH_BUILD_PYBIND)
   )
 
   # Add include directories
-  target_include_directories(_llm_runner PRIVATE ${_common_include_directories})
+  target_include_directories(
+    _llm_runner PRIVATE ${_common_include_directories} ${TORCH_INCLUDE_DIRS}
+  )
 
   install(TARGETS _llm_runner
           LIBRARY DESTINATION executorch/extension/llm/runner
diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp
@@ -10,6 +10,7 @@
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <torch/python.h>
 
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
 #include <executorch/extension/llm/runner/multimodal_input.h>
@@ -271,27 +272,59 @@ PYBIND11_MODULE(_llm_runner, m) {
 
   m.def(
       "make_image_input",
-      [](py::array_t<uint8_t> image_array) -> MultimodalInput {
-        // Get image dimensions
-        py::buffer_info buf = image_array.request();
+      [](torch::Tensor image_tensor) -> MultimodalInput {
+        // Accept a single-image batch by dropping the leading batch axis.
+        if (image_tensor.dim() == 4) {
+          if (image_tensor.size(0) != 1) {
+            throw std::runtime_error(
+                "Batch size for 4D image tensor must be 1");
+          }
+          image_tensor = image_tensor.squeeze(0);
+        }
 
-        if (buf.ndim != 3) {
+        if (image_tensor.dim() != 3) {
           throw std::runtime_error(
-              "Image array must be 3-dimensional (H, W, C)");
+              "Image tensor must be 3-dimensional (H, W, C) or 4-dimensional (1, H, W, C)");
         }
 
-        size_t height = buf.shape[0];
-        size_t width = buf.shape[1];
-        size_t channels = buf.shape[2];
+        // Infer the layout from the shape rather than the memory format:
+        // at::MemoryFormat::ChannelsLast describes 4-D (NCHW-shaped) tensors
+        // and is never reported for a 3-D tensor, so it cannot detect HWC
+        // input. A leading dimension of 3 or 4 is treated as CHW (this also
+        // settles ambiguous shapes such as 3 x W x 3); otherwise a trailing
+        // dimension of 3 or 4 marks the tensor as HWC.
+        const auto is_channel_count = [](int64_t n) {
+          return n == 3 || n == 4;
+        };
+        if (!is_channel_count(image_tensor.size(0)) &&
+            is_channel_count(image_tensor.size(2))) {
+          // Input is HWC, permute to CHW
+          image_tensor = image_tensor.permute({2, 0, 1});
+        }
+        const int64_t channels = image_tensor.size(0);
+        const int64_t height = image_tensor.size(1);
+        const int64_t width = image_tensor.size(2);
 
         if (channels != 3 && channels != 4) {
           throw std::runtime_error(
               "Image must have 3 (RGB) or 4 (RGBA) channels");
         }
 
-        // Create Image object from numpy array
-        uint8_t* data = static_cast<uint8_t*>(buf.ptr);
-        std::vector<uint8_t> image_data(data, data + height * width * channels);
+        if (image_tensor.scalar_type() != torch::kUInt8) {
+          // Only floating-point images normalized to [0, 1] are rescaled to
+          // [0, 255]; integer tensors are converted as-is.
+          if (image_tensor.is_floating_point() &&
+              image_tensor.max().item<double>() <= 1.0) {
+            image_tensor = (image_tensor * 255).to(torch::kUInt8);
+          } else {
+            image_tensor = image_tensor.to(torch::kUInt8);
+          }
+        }
+
+        // permute() only rewrites strides; materialize the bytes in CHW order.
+        image_tensor = image_tensor.contiguous();
+        uint8_t* data = image_tensor.data_ptr<uint8_t>();
+        std::vector<uint8_t> image_data(data, data + image_tensor.numel());
 
         Image image;
         image.data = std::move(image_data);
@@ -300,8 +333,8 @@ PYBIND11_MODULE(_llm_runner, m) {
         image.channels = static_cast<int32_t>(channels);
         return MultimodalInput(std::move(image));
       },
-      "Create an image input from a numpy array (H, W, C)",
-      py::arg("image_array"));
+      "Create an image input from a torch tensor (H, W, C), (1, H, W, C), (C, H, W), or (1, C, H, W)",
+      py::arg("image_tensor"));
 
   // Bind PyMultimodalRunner
   py::class_<PyMultimodalRunner>(m, "MultimodalRunner")