[llava] Enable dynamic shape for image preprocessor

larryliu0820 · web-flow · commit 2c7b7e8388c8 · 2024-08-26T15:41:30.000-07:00
Differential Revision: D61818152 Pull Request resolved: #4821
diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh
@@ -54,6 +54,13 @@ export_llava() {
     $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts
 }
 
+# Download an image of a different size and serialize it to image.pt, to test that the model can handle different image sizes
+prepare_image_tensor() {
+    echo "Downloading image"
+    curl -o basketball.jpg https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg 
+    $PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path image.pt
+}
+
 run_and_verify() {
     NOW=$(date +"%H:%M:%S")
     echo "Starting to run llava runner at ${NOW}"
@@ -79,7 +86,12 @@ run_and_verify() {
     # verify result.txt
     RESULT=$(cat result.txt)
     # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
-    EXPECTED_PREFIX="ASSISTANT:"
+    if [[ "$(uname)" == "Darwin" ]]; then
+        EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress on a basketball court. There are several players on the court, with one player in the foreground holding a basketball, and"
+    else
+        # on non-Darwin platforms only expect the prompt itself as the prefix, because a bug in sdpa_with_kv_cache causes <unk> tokens.
+        EXPECTED_PREFIX="ASSISTANT:"
+    fi
     if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then
         echo "Expected result prefix: ${EXPECTED_PREFIX}"
         echo "Actual result: ${RESULT}"
@@ -96,4 +108,5 @@ run_and_verify() {
 cmake_install_executorch_libraries
 cmake_build_llava_runner
 export_llava
+prepare_image_tensor
 run_and_verify
diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py
@@ -22,6 +22,7 @@
 from executorch.examples.models.llama2.source_transformation.sdpa import (
     replace_sdpa_with_custom_op,
 )
+from executorch.examples.models.llava.image_util import serialize_image
 from executorch.examples.models.llava.model import LlavaModel
 from executorch.exir import (
     EdgeCompileConfig,
@@ -35,7 +36,6 @@
 
 from executorch.extension.llm.export.builder import DType, LLMEdgeManager
 from executorch.extension.llm.tokenizer.tokenizer import Tokenizer
-from torch import nn
 from torch.ao.quantization.quantizer.xnnpack_quantizer import (
     get_symmetric_quantization_config,
     XNNPACKQuantizer,
@@ -231,14 +231,7 @@ def get_image_tensor_for_llava_runner(llava_model):
     # llava runner doesn't have image reader so an image tensor is needed.
     (resized,) = llava_model.get_example_inputs()
 
-    copy = torch.tensor(resized)
-    m = nn.Module()
-    par = nn.Parameter(copy, requires_grad=False)
-    m.register_parameter("0", par)
-    tensors = torch.jit.script(m)
-    tensors.save("image.pt")
-
-    logging.info("Saved image tensor to image.pt")
+    serialize_image(resized, "image.pt")
 
 
 def get_tokenizer_for_llava_runner(llava_model):
diff --git a/examples/models/llava/image_util.py b/examples/models/llava/image_util.py
@@ -0,0 +1,78 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Utility functions for image processing. To resize and serialize your own image, run:
+
+# python image_util.py --image-path <path_to_image> [--output-path <path_to_output>]  (output defaults to image.pt)
+
+import logging
+from argparse import ArgumentParser
+
+import torch
+import torchvision
+from PIL import Image
+from torch import nn
+
+
+FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
+logging.basicConfig(level=logging.INFO, format=FORMAT)
+
+
+def prepare_image(image: Image, target_h: int, target_w: int) -> torch.Tensor:
+    """Convert a PIL image into a tensor and resize it, scaling height and width by
+    the same ratio, so that it fits in a target_h x target_w canvas.
+
+    Args:
+        image (Image): An Image object.
+        target_h (int): Target height.
+        target_w (int): Target width.
+
+    Returns:
+        torch.Tensor: resized image tensor.
+    """
+    img = torchvision.transforms.functional.pil_to_tensor(image)
+    # height ratio
+    ratio_h = img.shape[1] / target_h
+    # width ratio
+    ratio_w = img.shape[2] / target_w
+    # resize the image so that it fits in a target_h x target_w canvas
+    ratio = max(ratio_h, ratio_w)
+    output_size = (int(img.shape[1] / ratio), int(img.shape[2] / ratio))
+    img = torchvision.transforms.Resize(size=output_size)(img)
+    return img
+
+
+def serialize_image(image: torch.Tensor, path: str) -> None:
+    copy = torch.tensor(image)
+    m = nn.Module()
+    par = nn.Parameter(copy, requires_grad=False)
+    m.register_parameter("0", par)
+    tensors = torch.jit.script(m)
+    tensors.save(path)
+
+    logging.info(f"Saved image tensor to {path}")
+
+
+def main():
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--image-path",
+        required=True,
+        help="Path to the image.",
+    )
+    parser.add_argument(
+        "--output-path",
+        default="image.pt",
+    )
+    args = parser.parse_args()
+
+    image = Image.open(args.image_path)
+    image_tensor = prepare_image(image, target_h=336, target_w=336)
+    serialize_image(image_tensor, args.output_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py
@@ -6,20 +6,18 @@
 
 # An ExecuTorch friendly implementation of Llava-1.5.
 
-import math
-
 import re
 
 from typing import Any, Dict, Optional
 
 import requests
 import torch
-import torchvision
 from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer
 
 from executorch.examples.models.llama2.source_transformation.sdpa import (
     replace_sdpa_with_custom_op,
 )
+from executorch.examples.models.llava.image_util import prepare_image
 from executorch.examples.models.model_base import EagerModelBase
 from PIL import Image
 
@@ -156,19 +154,32 @@ def encode_images(self, images: torch.Tensor) -> torch.Tensor:
         return image_features
 
     def image_preprocess(self, img: torch.Tensor) -> torch.Tensor:
-        w = max(img.shape[1], img.shape[2])
+        target_h = self.image_processor.crop_size["height"]
+        target_w = self.image_processor.crop_size["width"]
         # pad the image with median rgb value, to make a square
-        v_padding = (w - img.shape[1]) / 2
-        h_padding = (w - img.shape[2]) / 2
-        l_pad = int(math.ceil(h_padding))
-        t_pad = int(math.ceil(v_padding))
-        r_pad = int(math.floor(h_padding))
-        b_pad = int(math.floor(v_padding))
-        resized = F.pad(
+        l_pad = (target_w - img.shape[2]) // 2
+        t_pad = (target_h - img.shape[1]) // 2
+        # ceil division
+        r_pad = -((target_w - img.shape[2]) // -2)
+        b_pad = -((target_h - img.shape[1]) // -2)
+
+        torch._check(l_pad >= 0)
+        torch._check(t_pad >= 0)
+        torch._check(r_pad >= 0)
+        torch._check(b_pad >= 0)
+
+        # Unlike the original implementation, this pads with zeros rather than the image-mean fill, due to export limitations.
+        resized = torch.nn.functional.pad(
             img,
-            padding=(l_pad, t_pad, r_pad, b_pad),
-            fill=tuple(int(x * 255) for x in self.image_processor.image_mean),
+            (l_pad, r_pad, t_pad, b_pad),
         )
+        # originally:
+        # resized = F.pad(
+        #     img,
+        #     padding=(l_pad, t_pad, r_pad, b_pad),
+        #     fill=tuple(int(x * 255) for x in self.image_processor.image_mean),
+        # )
+
         # TODO: implement _upsample_bicubic_aa.out in portable kernel library.
         # here padded shape should be max(h, w) x max(h, w)
         # skipping resize for now due to missing _upsample_bicubic_aa kernel in portable
@@ -287,13 +298,12 @@ def get_example_inputs(self):
         """Returns a resized image as input to model.forward()."""
         if self.resized_image:
             return self.resized_image
-        imagr = torchvision.transforms.functional.pil_to_tensor(self.image)
-        ratio = (
-            max(imagr.shape[1], imagr.shape[2])
-            / self.image_processor.crop_size["height"]
+        resized = prepare_image(
+            self.image,
+            self.image_processor.crop_size["height"],
+            self.image_processor.crop_size["width"],
         )
-        output_size = (int(imagr.shape[1] / ratio), int(imagr.shape[2] / ratio))
-        self.resized_image = (torchvision.transforms.Resize(size=output_size)(imagr),)
+        self.resized_image = (resized,)
         return self.resized_image
 
     def get_inputs_for_prefill(self):
@@ -317,8 +327,13 @@ def get_dynamic_shapes(self):
         return self._get_image_dynamic_shapes()
 
     def _get_image_dynamic_shapes(self):
-        height = Dim("height", min=8, max=336)
-        width = Dim("width", min=28, max=336)
+        # only even heights and widths are supported for now
+        _height = Dim(
+            "_height", min=1, max=self.image_processor.crop_size["height"] // 2
+        )
+        _width = Dim("_width", min=1, max=self.image_processor.crop_size["width"] // 2)
+        height = 2 * _height
+        width = 2 * _width
         dynamic_shapes = [{1: height, 2: width}]
         return dynamic_shapes
 
diff --git a/examples/models/llava/test/test_pte.py b/examples/models/llava/test/test_pte.py
@@ -8,9 +8,10 @@
 import sys
 
 import torch
-
+from executorch.examples.models.llava.image_util import prepare_image
 from executorch.examples.models.llava.model import LlavaModel
 from executorch.extension.pybindings.portable_lib import _load_for_executorch
+from PIL import Image
 
 # Custom ops has to be loaded after portable_lib.
 # I don't know how to stop UFMT so I'm just using if True: to avoid lint error
@@ -24,13 +25,23 @@
 
 def main():
     args = sys.argv[1:]
+    if len(args) == 0:
+        print(
+            "Usage: python test_pte.py <model_path> <image_path?>. If no image, will use default image."
+        )
+        sys.exit(1)
+
     llava_module = _load_for_executorch(args[0])
 
     llava_model = LlavaModel()
 
     prompt_before_image, resized, prompt_after_image = (
         llava_model.get_inputs_for_prefill()
     )
+    if len(args) == 2:
+        image_path = args[1]
+        image = Image.open(image_path)
+        resized = prepare_image(image, target_h=336, target_w=336)
 
     start_pos = 0
     # pte prefill prompt before img