Commit 191a804

make vit compatible with non square images (#2255)
* make vit compatible with non square images
* fix converter issue
* update presets
* use std for scale looping not mean
* patch size can also be int dtype
1 parent c314f88 commit 191a804
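
Sketch of the usage this commit enables: a ViT backbone built on a rectangular input with a rectangular patch grid. The import path matches the files touched in this diff; the hyperparameter values (including `mlp_dim`) are illustrative, not taken from the commit.

```python
# Minimal sketch of the new capability: a non-square input and a patch
# size given as a pair. Hyperparameter values are illustrative.
import numpy as np

from keras_hub.src.models.vit.vit_backbone import ViTBackbone

backbone = ViTBackbone(
    image_shape=(224, 448, 3),  # height and width no longer need to match
    patch_size=(16, 16),        # int or (int, int)
    num_layers=3,
    num_heads=6,
    hidden_dim=48,
    mlp_dim=96,                 # assumed constructor argument, illustrative value
)
images = np.random.rand(2, 224, 448, 3).astype("float32")
features = backbone(images)
# (224 // 16) * (448 // 16) = 14 * 28 = 392 patches, +1 class token
print(features.shape)  # expected: (2, 393, 48)
```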

7 files changed, +89 −117 lines changed

keras_hub/src/models/vit/vit_backbone.py

Lines changed: 31 additions & 11 deletions
```diff
@@ -18,10 +18,10 @@ class ViTBackbone(Backbone):
 
     Args:
         image_shape: A tuple or list of 3 integers representing the shape of the
-            input image `(height, width, channels)`, `height` and `width` must
-            be equal.
-        patch_size: int. The size of each image patch, the input image will be
-            divided into patches of shape `(patch_size, patch_size)`.
+            input image `(height, width, channels)`.
+        patch_size: int or (int, int). The size of each image patch, the input
+            image will be divided into patches of shape
+            `(patch_size_h, patch_size_w)`.
         num_layers: int. The number of transformer encoder layers.
         num_heads: int. specifying the number of attention heads in each
             Transformer encoder layer.
@@ -37,6 +37,10 @@ class ViTBackbone(Backbone):
         use_mha_bias: bool. Whether to use bias in the multi-head
             attention layers.
         use_mlp_bias: bool. Whether to use bias in the MLP layers.
+        use_class_token: bool. Whether to use class token to be part of
+            patch embedding. Defaults to `True`.
+        use_patch_bias: bool. Whether to use bias in Conv2d of patch embedding
+            layer. Defaults to `True`.
         data_format: str. `"channels_last"` or `"channels_first"`, specifying
             the data format for the input image. If `None`, defaults to
             `"channels_last"`.
@@ -58,6 +62,8 @@ def __init__(
         layer_norm_epsilon=1e-6,
         use_mha_bias=True,
         use_mlp_bias=True,
+        use_class_token=True,
+        use_patch_bias=True,
         data_format=None,
         dtype=None,
         **kwargs,
@@ -74,24 +80,34 @@ def __init__(
                 f"at index {h_axis} (height) or {w_axis} (width). "
                 f"Image shape: {image_shape}"
             )
-        if image_shape[h_axis] != image_shape[w_axis]:
+
+        if isinstance(patch_size, int):
+            patch_size = (patch_size, patch_size)
+
+        if image_shape[h_axis] % patch_size[0] != 0:
+            raise ValueError(
+                f"Input height {image_shape[h_axis]} should be divisible by "
+                f"patch size {patch_size[0]}."
+            )
+
+        if image_shape[w_axis] % patch_size[1] != 0:
             raise ValueError(
-                f"Image height and width must be equal. Found height: "
-                f"{image_shape[h_axis]}, width: {image_shape[w_axis]} at "
-                f"indices {h_axis} and {w_axis} respectively. Image shape: "
-                f"{image_shape}"
+                f"Input width {image_shape[h_axis]} should be divisible by "
+                f"patch size {patch_size[1]}."
             )
 
         num_channels = image_shape[channels_axis]
 
         # === Functional Model ===
-        inputs = keras.layers.Input(shape=image_shape)
+        inputs = keras.layers.Input(shape=image_shape, name="images")
 
         x = ViTPatchingAndEmbedding(
-            image_size=image_shape[h_axis],
+            image_size=(image_shape[h_axis], image_shape[w_axis]),
             patch_size=patch_size,
             hidden_dim=hidden_dim,
             num_channels=num_channels,
+            use_class_token=use_class_token,
+            use_patch_bias=use_patch_bias,
             data_format=data_format,
             dtype=dtype,
             name="vit_patching_and_embedding",
@@ -130,6 +146,8 @@ def __init__(
         self.layer_norm_epsilon = layer_norm_epsilon
         self.use_mha_bias = use_mha_bias
         self.use_mlp_bias = use_mlp_bias
+        self.use_class_token = use_class_token
+        self.use_patch_bias = use_patch_bias
         self.data_format = data_format
 
     def get_config(self):
@@ -147,6 +165,8 @@ def get_config(self):
                 "layer_norm_epsilon": self.layer_norm_epsilon,
                 "use_mha_bias": self.use_mha_bias,
                 "use_mlp_bias": self.use_mlp_bias,
+                "use_class_token": self.use_class_token,
+                "use_patch_bias": self.use_patch_bias,
             }
         )
         return config
```
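
A standalone sketch of the validation added above, restated outside the Keras class so the behavior is easy to see; `normalize_patch_size` is a hypothetical helper, not part of the library.

```python
# Hypothetical helper mirroring the checks added in __init__ above: an int
# patch size is expanded to a pair, and each image dimension must be
# divisible by the matching patch dimension.
def normalize_patch_size(image_shape, patch_size):
    height, width, _ = image_shape
    if isinstance(patch_size, int):
        patch_size = (patch_size, patch_size)
    if height % patch_size[0] != 0:
        raise ValueError(
            f"Input height {height} should be divisible by "
            f"patch size {patch_size[0]}."
        )
    if width % patch_size[1] != 0:
        raise ValueError(
            f"Input width {width} should be divisible by "
            f"patch size {patch_size[1]}."
        )
    return patch_size


print(normalize_patch_size((224, 448, 3), 16))  # (16, 16)
# normalize_patch_size((224, 450, 3), 16)       # raises: 450 % 16 != 0
```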

keras_hub/src/models/vit/vit_backbone_test.py

Lines changed: 10 additions & 2 deletions
```diff
@@ -9,7 +9,7 @@ class ViTBackboneTest(TestCase):
     def setUp(self):
         self.init_kwargs = {
             "image_shape": (28, 28, 3),
-            "patch_size": 4,
+            "patch_size": (4, 4),
             "num_layers": 3,
             "hidden_dim": 48,
             "num_heads": 6,
@@ -25,7 +25,15 @@ def test_backbone_basics(self):
             init_kwargs={**self.init_kwargs},
             input_data=self.input_data,
             expected_output_shape=(2, 50, 48),
-            run_quantization_check=False,
+        )
+
+    def test_backbone_basics_without_class_token(self):
+        self.init_kwargs["use_class_token"] = False
+        self.run_backbone_test(
+            cls=ViTBackbone,
+            init_kwargs={**self.init_kwargs},
+            input_data=self.input_data,
+            expected_output_shape=(2, 49, 48),
         )
 
     @pytest.mark.large
```
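
Where the expected output shapes come from: with a 28×28 input and 4×4 patches there are 49 patch positions, and the class token, when enabled, adds one more. A quick check:

```python
# 28x28 input, 4x4 patches: 7 * 7 = 49 patch positions.
num_patches = (28 // 4) * (28 // 4)
print(num_patches)      # 49 -> expected_output_shape (2, 49, 48) without class token
print(num_patches + 1)  # 50 -> expected_output_shape (2, 50, 48) with class token
```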

keras_hub/src/models/vit/vit_image_classifier_test.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -16,7 +16,7 @@ def setUp(self):
         self.labels = [0, 1]
         self.backbone = ViTBackbone(
             image_shape=(28, 28, 3),
-            patch_size=4,
+            patch_size=(4, 4),
             num_layers=3,
             num_heads=6,
             hidden_dim=48,
```
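
Per the commit note "patch size can also be int dtype", an int is still accepted and expanded by the backbone, so the two constructions below should build the same 7×7 patch grid; `mlp_dim=96` is an illustrative value, not taken from this test file.

```python
from keras_hub.src.models.vit.vit_backbone import ViTBackbone

# Both forms are accepted after this commit; the backbone expands an int
# into (patch_size, patch_size) before building the patch grid.
common = dict(
    image_shape=(28, 28, 3),
    num_layers=3,
    num_heads=6,
    hidden_dim=48,
    mlp_dim=96,  # illustrative value
)
vit_from_int = ViTBackbone(patch_size=4, **common)
vit_from_pair = ViTBackbone(patch_size=(4, 4), **common)
```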
keras_hub/src/models/vit/vit_image_converter.py

Lines changed: 0 additions & 70 deletions
```diff
@@ -1,78 +1,8 @@
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.preprocessing.image_converter import ImageConverter
 from keras_hub.src.models.vit.vit_backbone import ViTBackbone
-from keras_hub.src.utils.tensor_utils import preprocessing_function
 
 
 @keras_hub_export("keras_hub.layers.ViTImageConverter")
 class ViTImageConverter(ImageConverter):
-    """Converts images to the format expected by a ViT model.
-
-    This layer performs image normalization using mean and standard deviation
-    values. By default, it uses the same normalization as the
-    "google/vit-large-patch16-224" model on Hugging Face:
-    `norm_mean=[0.5, 0.5, 0.5]` and `norm_std=[0.5, 0.5, 0.5]`
-    ([reference](https://huggingface.co/google/vit-large-patch16-224/blob/main/preprocessor_config.json)).
-    These defaults are suitable for models pretrained using this normalization.
-
-    Args:
-        norm_mean: list or tuple of floats. Mean values for image normalization.
-            Defaults to `[0.5, 0.5, 0.5]`.
-        norm_std: list or tuple of floats. Standard deviation values for
-            image normalization. Defaults to `[0.5, 0.5, 0.5]`.
-        **kwargs: Additional keyword arguments passed to
-            `keras_hub.layers.preprocessing.ImageConverter`.
-
-    Examples:
-    ```python
-    import keras
-    import numpy as np
-    from keras_hub.src.layers import ViTImageConverter
-
-    # Example image (replace with your actual image data)
-    image = np.random.rand(1, 224, 224, 3) # Example: (B, H, W, C)
-
-    # Create a ViTImageConverter instance
-    converter = ViTImageConverter(
-        image_size=(28,28),
-        scale=1/255.
-    )
-    # Preprocess the image
-    preprocessed_image = converter(image)
-    ```
-    """
-
     backbone_cls = ViTBackbone
-
-    def __init__(
-        self, norm_mean=[0.5, 0.5, 0.5], norm_std=[0.5, 0.5, 0.5], **kwargs
-    ):
-        super().__init__(**kwargs)
-        self.norm_mean = norm_mean
-        self.norm_std = norm_std
-
-    @preprocessing_function
-    def call(self, inputs):
-        # TODO: Remove this whole function. Why can just use scale and offset
-        # in the base class.
-        x = super().call(inputs)
-        if self.norm_mean:
-            norm_mean = self._expand_non_channel_dims(self.norm_mean, x)
-            x, norm_mean = self._convert_types(x, norm_mean, self.compute_dtype)
-            x = x - norm_mean
-        if self.norm_std:
-            norm_std = self._expand_non_channel_dims(self.norm_std, x)
-            x, norm_std = self._convert_types(x, norm_std, x.dtype)
-            x = x / norm_std
-
-        return x
-
-    def get_config(self):
-        config = super().get_config()
-        config.update(
-            {
-                "norm_mean": self.norm_mean,
-                "norm_std": self.norm_std,
-            }
-        )
-        return config
```
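
The deleted `call()` only re-implemented mean/std normalization that the base `ImageConverter`'s `scale` and `offset` can express, as the removed TODO notes. A sketch of the arithmetic equivalence, using the old defaults `norm_mean = norm_std = [0.5, 0.5, 0.5]` as illustrative values; this also explains the commit note "use std for scale looping not mean":

```python
# (x / 255 - mean) / std  ==  x * (1 / (255 * std)) + (-mean / std)
# so looping over std (not mean) gives the per-channel scale, and
# -mean / std gives the per-channel offset.
norm_mean = [0.5, 0.5, 0.5]
norm_std = [0.5, 0.5, 0.5]
scale = [1.0 / (255.0 * s) for s in norm_std]           # ~0.00784 per channel
offset = [-m / s for m, s in zip(norm_mean, norm_std)]  # -1.0 per channel
print(scale, offset)
```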

keras_hub/src/models/vit/vit_layers.py

Lines changed: 33 additions & 18 deletions
```diff
@@ -75,12 +75,13 @@ class ViTPatchingAndEmbedding(keras.layers.Layer):
     """Patches the image and embeds the patches.
 
     Args:
-        image_size: int. Size of the input image (height or width).
-            Assumed to be square.
-        patch_size: int. Size of each image patch.
+        image_size: (int, int). Size of the input image.
+        patch_size: (int, int). Size of each image patch.
         hidden_dim: int. Dimensionality of the patch embeddings.
         num_channels: int. Number of channels in the input image. Defaults to
             `3`.
+        use_class_token: bool. Whether to use class token to be part of
+            patch embedding. Defaults to `True`.
         data_format: str. `"channels_last"` or `"channels_first"`. Defaults to
             `None` (which uses `"channels_last"`).
         **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
@@ -92,12 +93,15 @@ def __init__(
         patch_size,
         hidden_dim,
         num_channels=3,
+        use_class_token=True,
+        use_patch_bias=True,
         data_format=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
-        num_patches = (image_size // patch_size) ** 2
-        num_positions = num_patches + 1
+        grid_size = tuple([s // p for s, p in zip(image_size, patch_size)])
+        num_patches = grid_size[0] * grid_size[1]
+        num_positions = num_patches + 1 if use_class_token else num_patches
 
         # === Config ===
         self.image_size = image_size
@@ -106,19 +110,22 @@ def __init__(
         self.num_channels = num_channels
         self.num_patches = num_patches
         self.num_positions = num_positions
+        self.use_class_token = use_class_token
+        self.use_patch_bias = use_patch_bias
         self.data_format = standardize_data_format(data_format)
 
     def build(self, input_shape):
-        self.class_token = self.add_weight(
-            shape=(
-                1,
-                1,
-                self.hidden_dim,
-            ),
-            initializer="random_normal",
-            dtype=self.variable_dtype,
-            name="class_token",
-        )
+        if self.use_class_token:
+            self.class_token = self.add_weight(
+                shape=(
+                    1,
+                    1,
+                    self.hidden_dim,
+                ),
+                initializer="random_normal",
+                dtype=self.variable_dtype,
+                name="class_token",
+            )
         self.patch_embedding = keras.layers.Conv2D(
             filters=self.hidden_dim,
             kernel_size=self.patch_size,
@@ -127,6 +134,7 @@ def build(self, input_shape):
             activation=None,
             dtype=self.dtype_policy,
             data_format=self.data_format,
+            use_bias=self.use_patch_bias,
             name="patch_embedding",
         )
         self.patch_embedding.build(input_shape)
@@ -153,10 +161,16 @@ def call(self, inputs):
         patch_embeddings = ops.reshape(
             patch_embeddings, [embeddings_shape[0], -1, embeddings_shape[-1]]
         )
-        class_token = ops.tile(self.class_token, (embeddings_shape[0], 1, 1))
         position_embeddings = self.position_embedding(self.position_ids)
-        embeddings = ops.concatenate([class_token, patch_embeddings], axis=1)
-        return ops.add(embeddings, position_embeddings)
+
+        if self.use_class_token:
+            class_token = ops.tile(
+                self.class_token, (embeddings_shape[0], 1, 1)
+            )
+            patch_embeddings = ops.concatenate(
+                [class_token, patch_embeddings], axis=1
+            )
+        return ops.add(patch_embeddings, position_embeddings)
 
     def compute_output_shape(self, input_shape):
         return (
@@ -175,6 +189,7 @@ def get_config(self):
                 "num_channels": self.num_channels,
                 "num_patches": self.num_patches,
                 "num_positions": self.num_positions,
+                "use_class_token": self.use_class_token,
             }
         )
         return config
```
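
The position count the layer now computes, restated as a tiny standalone function; `count_positions` is hypothetical, for illustration only.

```python
# Mirrors the grid_size / num_positions logic added to __init__ above.
def count_positions(image_size, patch_size, use_class_token=True):
    grid_size = tuple(s // p for s, p in zip(image_size, patch_size))
    num_patches = grid_size[0] * grid_size[1]
    return num_patches + 1 if use_class_token else num_patches


print(count_positions((224, 448), (16, 16)))     # 14 * 28 + 1 = 393
print(count_positions((28, 28), (4, 4), False))  # 7 * 7 = 49
```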
