@@ -54,6 +54,7 @@ class PreprocessConfig:
5454    tile_size : int  =  224 
5555    max_num_tiles : int  =  4 
5656    possible_resolutions  =  None 
57+     pad_max_tiles : bool  =  True 
5758
5859
5960class  TestImageTransform (unittest .TestCase ):
@@ -136,6 +137,17 @@ def prepare_inputs(
136137                [1.0 , 1.0 ],  # expected_tile_max  
137138                [0.0 , 0.0 ],  # expected_tile_min  
138139                [1 , 2 ],  # expected_aspect_ratio  
140+                 False ,  # pad_max_tiles  
141+             ), 
142+             ( 
143+                 (100 , 400 , 3 ),  # image_size  
144+                 torch .Size ([4 , 3 , 224 , 224 ]),  # expected shape  
145+                 False ,  # resize_to_max_canvas  
146+                 [0.2230 , 0.1763 , 0.0 , 0.0 ],  # expected_tile_means  
147+                 [1.0 , 1.0 , 0.0 , 0.0 ],  # expected_tile_max  
148+                 [0.0 , 0.0 , 0.0 , 0.0 ],  # expected_tile_min  
149+                 [1 , 2 ],  # expected_aspect_ratio  
150+                 True ,  # pad_max_tiles  
139151            ), 
140152            ( 
141153                (1000 , 300 , 3 ),  # image_size  
@@ -145,6 +157,7 @@ def prepare_inputs(
145157                [0.9976 , 0.9940 , 0.9936 , 0.9906 ],  # expected_tile_max  
146158                [0.0037 , 0.0047 , 0.0039 , 0.0 ],  # expected_tile_min  
147159                [4 , 1 ],  # expected_aspect_ratio  
160+                 False ,  # pad_max_tiles  
148161            ), 
149162            ( 
150163                (200 , 200 , 3 ),  # image_size  
@@ -154,6 +167,7 @@ def prepare_inputs(
154167                [0.9921 , 0.9925 , 0.9969 , 0.9908 ],  # expected_tile_max  
155168                [0.0056 , 0.0069 , 0.0059 , 0.0032 ],  # expected_tile_min  
156169                [2 , 2 ],  # expected_aspect_ratio  
170+                 False ,  # pad_max_tiles  
157171            ), 
158172            ( 
159173                (600 , 200 , 3 ),  # image_size  
@@ -163,6 +177,17 @@ def prepare_inputs(
163177                [1.0 , 1.0 , 1.0 ],  # expected_tile_max  
164178                [0.0 , 0.0 , 0.0 ],  # expected_tile_min  
165179                [3 , 1 ],  # expected_aspect_ratio  
180+                 False ,  # pad_max_tiles  
181+             ), 
182+             ( 
183+                 (600 , 200 , 3 ),  # image_size  
184+                 torch .Size ([4 , 3 , 224 , 224 ]),  # expected shape  
185+                 False ,  # resize_to_max_canvas  
186+                 [0.4472 , 0.4468 , 0.3031 , 0.0 ],  # expected_tile_means  
187+                 [1.0 , 1.0 , 1.0 , 0.0 ],  # expected_tile_max  
188+                 [0.0 , 0.0 , 0.0 , 0.0 ],  # expected_tile_min  
189+                 [3 , 1 ],  # expected_aspect_ratio  
190+                 True ,  # pad_max_tiles  
166191            ), 
167192        ] 
168193    ) 
@@ -175,8 +200,11 @@ def test_preprocess(
175200        expected_tile_max : List [float ],
176201        expected_tile_min : List [float ],
177202        expected_ar : List [int ],
203+         pad_max_tiles : bool ,
178204    ) ->  None :
179-         config  =  PreprocessConfig (resize_to_max_canvas = resize_to_max_canvas )
205+         config  =  PreprocessConfig (
206+             resize_to_max_canvas = resize_to_max_canvas , pad_max_tiles = pad_max_tiles 
207+         )
180208
181209        reference_model  =  CLIPImageTransform (
182210            image_mean = config .image_mean ,
@@ -187,6 +215,7 @@ def test_preprocess(
187215            tile_size = config .tile_size ,
188216            max_num_tiles = config .max_num_tiles ,
189217            possible_resolutions = None ,
218+             pad_max_tiles = config .pad_max_tiles ,
190219        )
191220
192221        eager_model  =  _CLIPImageTransform (
@@ -196,6 +225,7 @@ def test_preprocess(
196225            antialias = config .antialias ,
197226            tile_size = config .tile_size ,
198227            max_num_tiles = config .max_num_tiles ,
228+             pad_max_tiles = config .pad_max_tiles ,
199229        )
200230
201231        exported_model  =  export_preprocess (
@@ -205,6 +235,7 @@ def test_preprocess(
205235            antialias = config .antialias ,
206236            tile_size = config .tile_size ,
207237            max_num_tiles = config .max_num_tiles ,
238+             pad_max_tiles = config .pad_max_tiles ,
208239        )
209240
210241        executorch_model  =  lower_to_executorch_preprocess (exported_model )
@@ -244,8 +275,11 @@ def test_preprocess(
244275            self .assertAlmostEqual (tile .min ().item (), expected_tile_min [i ], delta = 1e-4 )
245276
246277        # Check num tiles matches the product of the aspect ratio. 
247-         expected_num_tiles  =  reference_ar [0 ] *  reference_ar [1 ]
248-         self .assertEqual (expected_num_tiles , reference_image .shape [0 ])
278+         if  pad_max_tiles :
279+             self .assertEqual (config .max_num_tiles , reference_image .shape [0 ])
280+         else :
281+             expected_num_tiles  =  reference_ar [0 ] *  reference_ar [1 ]
282+             self .assertEqual (expected_num_tiles , reference_image .shape [0 ])
249283
250284        # Pre-work for eager and exported models. The reference model performs these 
251285        # calculations and passes the result to _CLIPImageTransform, the exportable model. 