|
31 | 31 | ) |
32 | 32 |
|
33 | 33 |
|
| 34 | +import logging |
| 35 | +from typing import Any, Callable, Dict, List, Optional, Union |
| 36 | + |
| 37 | +import torch |
| 38 | + |
| 39 | + |
| 40 | +def _find_image_token_runs( |
| 41 | + input_ids: torch.Tensor, image_token_id: Optional[int] |
| 42 | +) -> List[tuple[int, int, int]]: |
| 43 | + """Return contiguous runs (start, end, length) of image_token_id in input_ids. |
| 44 | +
|
| 45 | + input_ids must be a 1D torch.Tensor. If image_token_id is None, returns an empty list. |
| 46 | + """ |
| 47 | + if image_token_id is None: |
| 48 | + return [] |
| 49 | + |
| 50 | + ids_list = input_ids.tolist() |
| 51 | + runs: List[tuple[int, int, int]] = [] |
| 52 | + i = 0 |
| 53 | + L = len(ids_list) |
| 54 | + while i < L: |
| 55 | + if ids_list[i] == image_token_id: |
| 56 | + j = i |
| 57 | + while j < L and ids_list[j] == image_token_id: |
| 58 | + j += 1 |
| 59 | + runs.append((i, j - 1, j - i)) |
| 60 | + i = j |
| 61 | + else: |
| 62 | + i += 1 |
| 63 | + |
| 64 | + return runs |
| 65 | + |
| 66 | + |
def _hf_to_multimodal_inputs(  # noqa: C901
    inputs: Dict[str, Any], image_token_id: Optional[int] = None
) -> List[MultimodalInput]:
    """Convert a HuggingFace AutoProcessor dict to ExecuTorch MultimodalInputs.

    Currently only 1 image inside the input is supported.

    Args:
        inputs: A dictionary containing the input data.
        image_token_id: The token ID for the image, if present.

    `inputs` expected keys:
        - 'input_ids': torch.Tensor of shape (L,) or (1, L)
        - Optional 'pixel_values': torch.Tensor; if present, must also provide
          'image_token_id' and there must be exactly one contiguous run of
          image tokens in input_ids.

    Returns:
        A list of MultimodalInput objects: [tokens-before?, image, tokens-after?]
        when an image is present, or a single token input otherwise.

    Raises:
        RuntimeError: missing keys, invalid shapes/dtypes, or unsupported cases.
    """
    if "input_ids" not in inputs:
        raise RuntimeError("HF inputs dict must contain 'input_ids' (torch.Tensor)")

    input_ids = inputs["input_ids"]
    if not isinstance(input_ids, torch.Tensor):
        raise RuntimeError("'input_ids' must be a torch.Tensor")

    # Accept (1, L) and flatten it to (L,); reject any other batched shape.
    if input_ids.dim() == 2:
        if input_ids.size(0) != 1:
            raise RuntimeError(
                "Expected 'input_ids' with batch size 1 when 2D (shape (1, L))"
            )
        input_ids = input_ids.squeeze(0)
    if input_ids.dim() != 1:
        raise RuntimeError("'input_ids' must be 1D (L) or 2D with batch size 1")

    has_pixel_values = "pixel_values" in inputs

    # If pixel_values in dict, require image_token_id so the image's position
    # in the token stream can be located.
    if has_pixel_values and image_token_id is None:
        raise RuntimeError("'pixel_values' provided but missing 'image_token_id'")

    # If there are image token ids but no pixel_values, it's an error.
    # (Check the cheap flag first so the tensor scan only runs when needed.)
    if (
        image_token_id is not None
        and not has_pixel_values
        and (input_ids == image_token_id).any().item()
    ):
        raise RuntimeError(
            "Found image token(s) in input_ids but 'pixel_values' not provided"
        )

    # No images: return a single tokens input.
    if not has_pixel_values:
        return [make_token_input(input_ids.to(torch.long).tolist())]

    # Determine number of images from pixel_values shape.
    pv = inputs["pixel_values"]
    if not isinstance(pv, torch.Tensor):
        raise RuntimeError(
            "'pixel_values' must be a torch.Tensor, run with `return_tensors='pt'` in HF processor"
        )
    if pv.dim() == 4:
        num_images = int(pv.size(0))
    elif pv.dim() == 3:
        num_images = 1
    else:
        raise RuntimeError(
            f"'pixel_values' must be 3D (C,H,W) or 4D (N,C,H,W)/(N,H,W,C), got shape {pv.shape}"
        )

    # Only support batch size 1 for now:
    if num_images != 1:
        raise RuntimeError("Only 1 image is supported for now")

    # Find contiguous runs of image_token_id in input_ids
    runs = _find_image_token_runs(input_ids, image_token_id)

    if len(runs) == 0:
        raise RuntimeError(
            "'pixel_values' provided but no occurrence of 'image_token_id' in input_ids"
        )

    # Support only one image/run for now; num_images is known to be 1 at this
    # point (checked above), so only the run count can still mismatch.
    if len(runs) != 1:
        raise RuntimeError(
            f"Mismatch between images and image token runs: images={num_images}, runs={len(runs)} (only batch=1 and a single contiguous run are supported)"
        )

    first, last, _ = runs[0]

    combined: List[MultimodalInput] = []
    # Text preceding the image placeholder run, if any.
    if first > 0:
        combined.append(make_token_input(input_ids[:first].to(torch.long).tolist()))

    # Use C++ checked creator for images (handles 3D/4D, CHW/HWC, uint8/float32)
    combined.append(make_image_input(inputs["pixel_values"]))

    # Text following the image placeholder run, if any.
    if (last + 1) < input_ids.numel():
        combined.append(make_token_input(input_ids[last + 1 :].to(torch.long).tolist()))

    return combined
| 167 | + |
| 168 | + |
# Capture the native implementations BEFORE patching the class below.
# The wrappers must delegate to these saved references: after
# `setattr(MultimodalRunner, "generate", generate)`, an internal call to
# `runner.generate(...)` would resolve to the wrapper itself and recurse
# forever instead of reaching the underlying runner implementation.
_native_generate = MultimodalRunner.generate
_native_generate_text = MultimodalRunner.generate_text


def _coerce_to_multimodal_inputs(
    inputs: Union[Dict[str, Any], List[MultimodalInput]],
    image_token_id: Optional[int],
) -> List[MultimodalInput]:
    """Pass a list of MultimodalInput through unchanged; convert an HF dict."""
    if isinstance(inputs, dict):
        logging.info(
            "Input is a dict, assuming it's coming from HF AutoProcessor.apply_chat_template(). Converting to multimodal inputs."
        )
        return _hf_to_multimodal_inputs(inputs, image_token_id=image_token_id)
    return inputs


def generate(
    runner: MultimodalRunner,
    inputs: Union[Dict[str, Any], List[MultimodalInput]],
    config: GenerationConfig,
    image_token_id: Optional[int] = None,
    token_callback: Optional[Callable[[str], None]] = None,
    stats_callback: Optional[Callable[[Stats], None]] = None,
) -> None:
    """Generate using an HF dict by converting to multimodal inputs internally, or using a list of MultimodalInput."""
    converted = _coerce_to_multimodal_inputs(inputs, image_token_id)
    # Call the saved native method, not runner.generate (patched to this wrapper).
    _native_generate(runner, converted, config, token_callback, stats_callback)


def generate_text(
    runner: MultimodalRunner,
    inputs: Union[Dict[str, Any], List[MultimodalInput]],
    config: GenerationConfig,
    image_token_id: Optional[int] = None,
) -> str:
    """Generate using an HF dict by converting to multimodal inputs internally, or using a list of MultimodalInput."""
    converted = _coerce_to_multimodal_inputs(inputs, image_token_id)
    # Call the saved native method, not runner.generate_text (patched below).
    return _native_generate_text(runner, converted, config)


setattr(MultimodalRunner, "generate", generate)  # noqa B010
setattr(MultimodalRunner, "generate_text", generate_text)  # noqa B010
| 209 | + |
| 210 | + |
34 | 211 | __all__ = [ |
35 | 212 | "GenerationConfig", |
36 | 213 | "Image", |
|
0 commit comments