From dbe4df47a3adf7c04d2b4789f0662ddeb3df6e0b Mon Sep 17 00:00:00 2001
From: lucylq
Date: Mon, 28 Oct 2024 15:11:00 -0700
Subject: [PATCH] refactor preprocess to use EagerModelBase

[ghstack-poisoned]
---
 .github/workflows/pull.yml                    | 25 ++++++
 .../preprocess/export_preprocess.py           | 51 +++++++----
 .../preprocess/export_preprocess_lib.py       | 85 -------------------
 .../llama3_2_vision/preprocess/model.py       | 72 ++++++++++++++++
 .../preprocess/test_preprocess.py             | 78 +++++++----------
 5 files changed, 162 insertions(+), 149 deletions(-)
 delete mode 100644 examples/models/llama3_2_vision/preprocess/export_preprocess_lib.py
 create mode 100644 examples/models/llama3_2_vision/preprocess/model.py

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 144c2be0e87..3a3d11d344b 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -231,6 +231,31 @@ jobs:
         # run e2e (export, tokenizer and runner)
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh
 
+  test-preprocess-linux:
+    name: test-preprocess-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      fail-fast: false
+    with:
+      runner: linux.24xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
+
+        # install pybind
+        bash install_requirements.sh --pybind xnnpack
+
+        # run python unittest
+        python -m unittest examples.models.llama3_2_vision.preprocess.test_preprocess
+
+
   test-quantized-aot-lib-linux:
     name: test-quantized-aot-lib-linux
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
diff --git a/examples/models/llama3_2_vision/preprocess/export_preprocess.py b/examples/models/llama3_2_vision/preprocess/export_preprocess.py
index 58c79095074..d82f79c2f35 100644
--- a/examples/models/llama3_2_vision/preprocess/export_preprocess.py
+++ b/examples/models/llama3_2_vision/preprocess/export_preprocess.py
@@ -5,28 +5,47 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
-from executorch.examples.models.llama3_2_vision.preprocess.export_preprocess_lib import (
-    export_preprocess,
-    get_example_inputs,
-    lower_to_executorch_preprocess,
+from executorch.examples.models.llama3_2_vision.preprocess.model import (
+    CLIPImageTransformModel,
+    PreprocessConfig,
 )
+from executorch.exir import EdgeCompileConfig, to_edge
 
 
 def main():
+    # Eager model.
+    model = CLIPImageTransformModel(PreprocessConfig())
 
-    # ExecuTorch
-    ep_et = export_preprocess()
-    et = lower_to_executorch_preprocess(ep_et)
-    with open("preprocess_et.pte", "wb") as file:
-        et.write_to_file(file)
-
-    # AOTInductor
-    ep_aoti = export_preprocess()
-    torch._inductor.aot_compile(
-        ep_aoti.module(),
-        get_example_inputs(),
-        options={"aot_inductor.output_path": "preprocess_aoti.so"},
+    # Export.
+    ep = torch.export.export(
+        model.get_eager_model(),
+        model.get_example_inputs(),
+        dynamic_shapes=model.get_dynamic_shapes(),
+        strict=False,
+    )
+
+    # Executorch
+    edge_program = to_edge(
+        ep, compile_config=EdgeCompileConfig(_check_ir_validity=False)
     )
+    et_program = edge_program.to_executorch()
+    with open("preprocess_et.pte", "wb") as file:
+        et_program.write_to_file(file)
+
+    # Export.
+    # ep = torch.export.export(
+    #     model.get_eager_model(),
+    #     model.get_example_inputs(),
+    #     dynamic_shapes=model.get_dynamic_shapes(),
+    #     strict=False,
+    # )
+    #
+    # # AOTInductor
+    # torch._inductor.aot_compile(
+    #     ep.module(),
+    #     model.get_example_inputs(),
+    #     options={"aot_inductor.output_path": "preprocess_aoti.so"},
+    # )
 
 
 if __name__ == "__main__":
diff --git a/examples/models/llama3_2_vision/preprocess/export_preprocess_lib.py b/examples/models/llama3_2_vision/preprocess/export_preprocess_lib.py
deleted file mode 100644
index f3fe8188c04..00000000000
--- a/examples/models/llama3_2_vision/preprocess/export_preprocess_lib.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-from typing import Dict, List, Optional, Tuple
-
-import torch
-from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge
-from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
-from executorch.exir.program._program import ExecutorchProgramManager
-
-from executorch.extension.llm.custom_ops import op_tile_crop_aot  # noqa
-
-from torch.export import Dim, ExportedProgram
-from torchtune.models.clip.inference._transform import _CLIPImageTransform
-
-
-def get_example_inputs() -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    image = torch.ones(3, 800, 600)
-    target_size = torch.tensor([448, 336])
-    canvas_size = torch.tensor([448, 448])
-    return (image, target_size, canvas_size)
-
-
-def get_dynamic_shapes() -> Dict[str, Dict[int, Dim]]:
-    img_h = Dim("img_h", min=1, max=4000)
-    img_w = Dim("img_w", min=1, max=4000)
-
-    dynamic_shapes = {
-        "image": {1: img_h, 2: img_w},
-        "target_size": None,
-        "canvas_size": None,
-    }
-    return dynamic_shapes
-
-
-def export_preprocess(
-    resample: str = "bilinear",
-    image_mean: Optional[List[float]] = None,
-    image_std: Optional[List[float]] = None,
-    max_num_tiles: int = 4,
-    tile_size: int = 224,
-    antialias: bool = False,
-) -> ExportedProgram:
-
-    # Instantiate eager model.
-    image_transform_model = _CLIPImageTransform(
-        resample=resample,
-        image_mean=image_mean,
-        image_std=image_std,
-        max_num_tiles=max_num_tiles,
-        tile_size=tile_size,
-        antialias=antialias,
-    )
-
-    # Replace non-exportable ops with custom ops.
-    image_transform_model.tile_crop = torch.ops.preprocess.tile_crop.default
-
-    # Export.
-    example_inputs = get_example_inputs()
-    dynamic_shapes = get_dynamic_shapes()
-    ep = torch.export.export(
-        image_transform_model,
-        example_inputs,
-        dynamic_shapes=dynamic_shapes,
-        strict=False,
-    )
-    return ep
-
-
-def lower_to_executorch_preprocess(
-    exported_program: ExportedProgram,
-) -> ExecutorchProgramManager:
-    edge_program = to_edge(
-        exported_program, compile_config=EdgeCompileConfig(_check_ir_validity=False)
-    )
-
-    et_program = edge_program.to_executorch(
-        ExecutorchBackendConfig(
-            sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
-        )
-    )
-    return et_program
diff --git a/examples/models/llama3_2_vision/preprocess/model.py b/examples/models/llama3_2_vision/preprocess/model.py
new file mode 100644
index 00000000000..ec170a6cd7c
--- /dev/null
+++ b/examples/models/llama3_2_vision/preprocess/model.py
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import torch
+
+from executorch.extension.llm.custom_ops import op_tile_crop_aot  # noqa
+from torch.export import Dim
+from torchtune.models.clip.inference._transform import _CLIPImageTransform
+
+from ...model_base import EagerModelBase
+
+
+@dataclass
+class PreprocessConfig:
+    image_mean: Optional[List[float]] = None
+    image_std: Optional[List[float]] = None
+    resample: str = "bilinear"
+    max_num_tiles: int = 4
+    tile_size: int = 224
+    antialias: bool = False
+    # Used for eager.
+    resize_to_max_canvas: bool = True
+    possible_resolutions: Optional[List[Tuple[int, int]]] = None
+
+
+class CLIPImageTransformModel(EagerModelBase):
+    def __init__(
+        self,
+        config: PreprocessConfig,
+    ):
+        super().__init__()
+
+        # Eager model.
+        self.model = _CLIPImageTransform(
+            image_mean=config.image_mean,
+            image_std=config.image_std,
+            resample=config.resample,
+            max_num_tiles=config.max_num_tiles,
+            tile_size=config.tile_size,
+            antialias=config.antialias,
+        )
+
+        # Replace non-exportable ops with custom ops.
+        self.model.tile_crop = torch.ops.preprocess.tile_crop.default
+
+    def get_eager_model(self) -> torch.nn.Module:
+        return self.model
+
+    def get_example_inputs(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        image = torch.ones(3, 800, 600)
+        target_size = torch.tensor([448, 336])
+        canvas_size = torch.tensor([448, 448])
+        return (image, target_size, canvas_size)
+
+    def get_dynamic_shapes(self) -> Dict[str, Dict[int, Dim]]:
+        img_h = Dim("img_h", min=1, max=4000)
+        img_w = Dim("img_w", min=1, max=4000)
+
+        dynamic_shapes = {
+            "image": {1: img_h, 2: img_w},
+            "target_size": None,
+            "canvas_size": None,
+        }
+        return dynamic_shapes
diff --git a/examples/models/llama3_2_vision/preprocess/test_preprocess.py b/examples/models/llama3_2_vision/preprocess/test_preprocess.py
index 73a3fd29607..225226ee9e7 100644
--- a/examples/models/llama3_2_vision/preprocess/test_preprocess.py
+++ b/examples/models/llama3_2_vision/preprocess/test_preprocess.py
@@ -6,20 +6,22 @@
 
 import unittest
 
-from dataclasses import dataclass
-from typing import List, Optional, Tuple
+from typing import List, Tuple
 
 import numpy as np
 import PIL
 import torch
 
-from executorch.extension.pybindings import portable_lib  # noqa # usort: skip
-from executorch.extension.llm.custom_ops import sdpa_with_kv_cache  # noqa # usort: skip
-from executorch.examples.models.llama3_2_vision.preprocess.export_preprocess_lib import (
-    export_preprocess,
-    get_example_inputs,
-    lower_to_executorch_preprocess,
+from executorch.examples.models.llama3_2_vision.preprocess.model import (
+    CLIPImageTransformModel,
+    PreprocessConfig,
 )
+
+from executorch.exir import EdgeCompileConfig, to_edge
+
+from executorch.extension.pybindings import portable_lib  # noqa # usort: skip
+from executorch.extension.llm.custom_ops import op_tile_crop_aot  # noqa # usort: skip
+
 from executorch.extension.pybindings.portable_lib import (
     _load_for_executorch_from_buffer,
 )
@@ -27,10 +29,7 @@
 from parameterized import parameterized
 from PIL import Image
 
-from torchtune.models.clip.inference._transform import (
-    _CLIPImageTransform,
-    CLIPImageTransform,
-)
+from torchtune.models.clip.inference._transform import CLIPImageTransform
 
 from torchtune.modules.transforms.vision_utils.get_canvas_best_fit import (
     find_supported_resolutions,
@@ -43,18 +42,6 @@
 from torchvision.transforms.v2 import functional as F
 
 
-@dataclass
-class PreprocessConfig:
-    image_mean: Optional[List[float]] = None
-    image_std: Optional[List[float]] = None
-    resize_to_max_canvas: bool = True
-    resample: str = "bilinear"
-    antialias: bool = False
-    tile_size: int = 224
-    max_num_tiles: int = 4
-    possible_resolutions = None
-
-
 class TestImageTransform(unittest.TestCase):
     """
     This unittest checks that the exported image transform model produces the
@@ -188,31 +175,26 @@ def test_preprocess(
             possible_resolutions=None,
         )
 
-        eager_model = _CLIPImageTransform(
-            image_mean=config.image_mean,
-            image_std=config.image_std,
-            resample=config.resample,
-            antialias=config.antialias,
-            tile_size=config.tile_size,
-            max_num_tiles=config.max_num_tiles,
-        )
+        model = CLIPImageTransformModel(config)
+        eager_model = model.get_eager_model()
 
-        exported_model = export_preprocess(
-            image_mean=config.image_mean,
-            image_std=config.image_std,
-            resample=config.resample,
-            antialias=config.antialias,
-            tile_size=config.tile_size,
-            max_num_tiles=config.max_num_tiles,
+        exported_model = torch.export.export(
+            eager_model,
+            model.get_example_inputs(),
+            dynamic_shapes=model.get_dynamic_shapes(),
+            strict=False,
         )
 
-        executorch_model = lower_to_executorch_preprocess(exported_model)
+        edge_program = to_edge(
+            exported_model, compile_config=EdgeCompileConfig(_check_ir_validity=False)
+        )
+        executorch_model = edge_program.to_executorch()
         executorch_module = _load_for_executorch_from_buffer(executorch_model.buffer)
 
-        aoti_path = torch._inductor.aot_compile(
-            exported_model.module(),
-            get_example_inputs(),
-        )
+        # aoti_path = torch._inductor.aot_compile(
+        #     exported_model.module(),
+        #     get_example_inputs(),
+        # )
 
         # Prepare image input.
         image = (
@@ -276,7 +258,7 @@ def test_preprocess(
         self.assertEqual(reference_ar, et_ar.tolist())
 
         # Run aoti model and check it matches reference model.
-        aoti_model = torch._export.aot_load(aoti_path, "cpu")
-        aoti_image, aoti_ar = aoti_model(image_tensor, inscribed_size, best_resolution)
-        self.assertTrue(torch.allclose(reference_image, aoti_image))
-        self.assertEqual(reference_ar, aoti_ar.tolist())
+        # aoti_model = torch._export.aot_load(aoti_path, "cpu")
+        # aoti_image, aoti_ar = aoti_model(image_tensor, inscribed_size, best_resolution)
+        # self.assertTrue(torch.allclose(reference_image, aoti_image))
+        # self.assertEqual(reference_ar, aoti_ar.tolist())
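
For reviewers who want to try the refactored flow locally, the sketch below is not part of the patch: it loads the preprocess_et.pte written by export_preprocess.py with the ExecuTorch pybindings and runs it on the example inputs, mirroring what test_preprocess.py does with _load_for_executorch_from_buffer. The forward() calling convention and the two-output unpacking are assumed to match the unit test above.

# Hypothetical smoke test, assuming `python export_preprocess.py` has already
# written preprocess_et.pte and the pybindings plus the tile_crop custom op
# are installed (see `install_requirements.sh --pybind xnnpack` in the CI job
# added by this patch).
import torch

from executorch.extension.llm.custom_ops import op_tile_crop_aot  # noqa  # custom op, as imported in test_preprocess.py
from executorch.extension.pybindings.portable_lib import (
    _load_for_executorch_from_buffer,
)

with open("preprocess_et.pte", "rb") as f:
    module = _load_for_executorch_from_buffer(f.read())

# Same example inputs as CLIPImageTransformModel.get_example_inputs().
image = torch.ones(3, 800, 600)
target_size = torch.tensor([448, 336])
canvas_size = torch.tensor([448, 448])

# Assumed to return (tiled_image, aspect_ratio), as unpacked in the unit test.
tiles, aspect_ratio = module.forward((image, target_size, canvas_size))
print(tiles.shape, aspect_ratio)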