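More changes: rename the `image_encoder` method to `vision_encoder`, bind `Image` in the LLM runner pybindings through constructors and read-only accessors, and reformat CMake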

larryliu0820 · larryliu0820 · commit c38111693b8a · 2025-09-17T15:29:04.000-07:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -650,7 +650,6 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM)
   list(APPEND _executorch_extensions tokenizers)
 endif()
 
-
 if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple)
 endif()
diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py
@@ -224,12 +224,12 @@ def export_all(llava_model: LlavaModel):
 
     lowered_and_edge = to_edge_transform_and_lower(
         {
-            "image_encoder": image_encoder_ep,
+            "vision_encoder": image_encoder_ep,
             "token_embedding": token_embedding_ep,
             "text_decoder": text_model_ep,
         },
         partitioner={
-            "image_encoder": [XnnpackPartitioner()],
+            "vision_encoder": [XnnpackPartitioner()],
             "text_decoder": [
                 # First partition the DQLinear nodes, then partition the rest of the nodes,
                 # to avoid multiple DQLinear nodes in the same partition,
@@ -254,7 +254,7 @@ def export_all(llava_model: LlavaModel):
             ],
             memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
             sym_shape_eval_pass={
-                "image_encoder": ConstraintBasedSymShapeEvalPass(),
+                "vision_encoder": ConstraintBasedSymShapeEvalPass(),
                 "text_decoder": ConstraintBasedSymShapeEvalPass(),
                 "token_embedding": HintBasedSymShapeEvalPass(),
             },
diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
@@ -93,7 +93,8 @@ if(EXECUTORCH_BUILD_PYBIND)
   )
   # Link with the extension_llm_runner library and its dependencies
   target_link_libraries(
-    _llm_runner PRIVATE extension_llm_runner tokenizers::tokenizers portable_lib
+    _llm_runner PRIVATE extension_llm_runner tokenizers::tokenizers
+                        portable_lib
   )
 
   # Set properties for the Python extension
@@ -105,7 +106,9 @@ if(EXECUTORCH_BUILD_PYBIND)
   )
 
   # Add include directories
-  target_include_directories(_llm_runner PRIVATE ${_common_include_directories} ${TORCH_INCLUDE_DIRS})
+  target_include_directories(
+    _llm_runner PRIVATE ${_common_include_directories} ${TORCH_INCLUDE_DIRS}
+  )
 
   install(TARGETS _llm_runner
           LIBRARY DESTINATION executorch/extension/llm/runner
diff --git a/extension/llm/runner/constants.h b/extension/llm/runner/constants.h
@@ -20,7 +20,7 @@ inline constexpr auto kUseKVCache = "use_kv_cache";
 inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
 
 // Multimodal method name conventions
-inline constexpr auto kImageEncoderMethod = "image_encoder";
+inline constexpr auto kImageEncoderMethod = "vision_encoder";
 inline constexpr auto kAudioEncoderMethod = "audio_encoder";
 inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
 inline constexpr auto kTextModelMethod = "text_decoder";
diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp
@@ -219,15 +219,42 @@ PYBIND11_MODULE(_llm_runner, m) {
 
   // Bind Image class
   py::class_<Image>(m, "Image")
-      .def(py::init<>())
-      .def_readwrite("data", &Image::data)
-      .def_readwrite("width", &Image::width)
-      .def_readwrite("height", &Image::height)
-      .def_readwrite("channels", &Image::channels)
+      .def(
+          py::init<std::vector<uint8_t>&&, int32_t, int32_t, int32_t>(),
+          py::arg("data"),
+          py::arg("width"),
+          py::arg("height"),
+          py::arg("channels"))
+      .def(
+          py::init<std::vector<float>&&, int32_t, int32_t, int32_t>(),
+          py::arg("data"),
+          py::arg("width"),
+          py::arg("height"),
+          py::arg("channels"))
+      .def("is_uint8", &Image::is_uint8)
+      .def("is_float", &Image::is_float)
+      .def_property_readonly("width", &Image::width)
+      .def_property_readonly("height", &Image::height)
+      .def_property_readonly("channels", &Image::channels)
+      .def_property_readonly(
+          "uint8_data",
+          static_cast<const std::vector<uint8_t>& (Image::*)() const&>(
+              &Image::get_uint8_data))
+      .def_property_readonly(
+          "float_data",
+          static_cast<const std::vector<float>& (Image::*)() const&>(
+              &Image::get_float_data))
       .def("__repr__", [](const Image& img) {
-        return "<Image height=" + std::to_string(img.height) +
-            " width=" + std::to_string(img.width) +
-            " channels=" + std::to_string(img.channels) + ">";
+        std::string dtype = "unknown";
+        if (img.is_uint8()) {
+          dtype = "uint8";
+        } else if (img.is_float()) {
+          dtype = "float32";
+        }
+        return "<Image height=" + std::to_string(img.height()) +
+            " width=" + std::to_string(img.width()) +
+            " channels=" + std::to_string(img.channels()) + " dtype=" + dtype +
+            ">";
       });
 
   // Bind MultimodalInput
@@ -281,7 +308,6 @@ PYBIND11_MODULE(_llm_runner, m) {
           image_tensor = image_tensor.squeeze(0);
         }
 
-        
         if (image_tensor.dim() != 3) {
           throw std::runtime_error(
               "Image tensor must be 3-dimensional (H, W, C) or 4-dimensional (1, H, W, C)");
@@ -322,12 +348,11 @@ PYBIND11_MODULE(_llm_runner, m) {
         uint8_t* data = image_tensor.data_ptr<uint8_t>();
         std::vector<uint8_t> image_data(data, data + image_tensor.numel());
 
-        Image image;
-        image.data = std::move(image_data);
-        image.width = static_cast<int32_t>(width);
-        image.height = static_cast<int32_t>(height);
-        image.channels = static_cast<int32_t>(channels);
-        return MultimodalInput(std::move(image));
+        return MultimodalInput(Image(
+            std::move(image_data),
+            static_cast<int32_t>(width),
+            static_cast<int32_t>(height),
+            static_cast<int32_t>(channels)));
       },
       "Create an image input from a torch tensor (H, W, C), (1, H, W, C), (C, H, W), or (1, C, H, W)",
       py::arg("image_tensor"));

Original file line number	Diff line number	Diff line change
`@@ -93,7 +93,8 @@ if(EXECUTORCH_BUILD_PYBIND)`
`93`	`93`	`)`
`94`	`94`	`# Link with the extension_llm_runner library and its dependencies`
`95`	`95`	`target_link_libraries(`
`96`		`- _llm_runner PRIVATE extension_llm_runner tokenizers::tokenizers portable_lib`
	`96`	`+ _llm_runner PRIVATE extension_llm_runner tokenizers::tokenizers`
	`97`	`+ portable_lib`
`97`	`98`	`)`
`98`	`99`
`99`	`100`	`# Set properties for the Python extension`
`@@ -105,7 +106,9 @@ if(EXECUTORCH_BUILD_PYBIND)`
`105`	`106`	`)`
`106`	`107`
`107`	`108`	`# Add include directories`
`108`		`- target_include_directories(_llm_runner PRIVATE ${_common_include_directories} ${TORCH_INCLUDE_DIRS})`
	`109`	`+ target_include_directories(`
	`110`	`+ _llm_runner PRIVATE ${_common_include_directories} ${TORCH_INCLUDE_DIRS}`
	`111`	`+ )`
`109`	`112`
`110`	`113`	`install(TARGETS _llm_runner`
`111`	`114`	`LIBRARY DESTINATION executorch/extension/llm/runner`