[SAM] make saving to TF saved_model work (#81)

freedomtan · web-flow · commit e534520a4c9f · 2023-05-01T23:40:52.000+01:00
Ensure the models' forward passes use only TF operations, so that the models can be saved to and loaded from the TF SavedModel format.
diff --git a/tests/models/test_segment_anything.py b/tests/models/test_segment_anything.py
@@ -1,3 +1,4 @@
+import tempfile
 from typing import Tuple, cast
 
 import numpy as np
@@ -43,7 +44,7 @@
     TwoWayTransformer as PTTwoWayTransformer,
 )
 from tfimm.architectures.segment_anything.transformer import (
-    Attention as TFAttention,
+    DownsampleAttention as TFAttention,
     TwoWayAttentionBlock as TFTwoWayAttentionBlock,
     TwoWayTransformer as TFTwoWayTransformer,
 )
@@ -468,3 +469,23 @@ def test_predictor(fixed_input_size):
     masks, scores, logits = predictor(points=[[10, 10]], multimask_output=False)
 
     assert masks.shape == (1, *img.shape[:2])
+
+
+# This test is slow because the model is quite complex, so it gets a 120 s timeout.
+@pytest.mark.timeout(120)
+def test_save_load_model():
+    """Checks that the model round-trips through `save()` and `load_model()`."""
+    model = create_model("sam_vit_test_model")
+    with tempfile.TemporaryDirectory() as tmpdir:
+        model.save(tmpdir)
+        loaded_model = tf.keras.models.load_model(tmpdir, compile=False)
+
+    assert type(model) is type(loaded_model)
+
+    inputs = model.dummy_inputs
+    m_1, s_1, l_1 = model(inputs)  # presumably masks, scores, logits (confirm)
+    m_2, s_2, l_2 = loaded_model(inputs)
+
+    assert np.sum(m_1.numpy() != m_2.numpy()) == 0  # no element may differ
+    assert (np.max(np.abs(s_1.numpy() - s_2.numpy()))) < 1e-6
+    assert (np.max(np.abs(l_1.numpy() - l_2.numpy()))) < 1e-6
diff --git a/tfimm/architectures/segment_anything/image_encoder.py b/tfimm/architectures/segment_anything/image_encoder.py
@@ -58,7 +58,7 @@ def window_unpartition(
         x: Unpartitioned tensor of shape (B, H, W, C).
     """
     hp, wp = pad_hw
-    h, w = hw
+    h, w = hw[0], hw[1]
     window_size = tf.shape(windows)[1]
     nb_windows = (hp // window_size) * (wp // window_size)
     n = tf.shape(windows)[0] // nb_windows
@@ -93,7 +93,7 @@ def get_rel_pos(
         Extracted positional embeddings according to relative positions.
     """
     m = tf.shape(rel_pos)[0]
-    max_rel_dist = int(2 * max(q_size, k_size) - 1)
+    max_rel_dist = tf.cast(2 * tf.math.maximum(q_size, k_size) - 1, tf.int32)
 
     if interpolate_pos:
         # Interpolate positional embeddings if needed.
@@ -108,10 +108,10 @@ def get_rel_pos(
     q_coords = tf.expand_dims(tf.range(q_size, dtype=tf.float32), axis=-1)
     k_coords = tf.expand_dims(tf.range(k_size, dtype=tf.float32), axis=0)
     # Scale the coords with short length if shapes for q and k are different.
-    q_coords = q_coords * tf.cast(max(k_size / q_size, 1.0), tf.float32)
-    k_coords = k_coords * tf.cast(max(q_size / k_size, 1.0), tf.float32)
+    q_coords = q_coords * tf.cast(tf.math.maximum(k_size / q_size, 1.0), tf.float32)
+    k_coords = k_coords * tf.cast(tf.math.maximum(q_size / k_size, 1.0), tf.float32)
 
-    lambda_ = tf.cast(max(q_size / k_size, 1.0), tf.float32)
+    lambda_ = tf.cast(tf.math.maximum(q_size / k_size, 1.0), tf.float32)
     offset = tf.cast(k_size - 1, tf.float32) * lambda_
     relative_coords = (q_coords - k_coords) + offset
     relative_coords = tf.cast(relative_coords, tf.int32)
@@ -168,7 +168,7 @@ def add_decomposed_rel_pos(
     return attn
 
 
-class Attention(tf.keras.layers.Layer):
+class RelPosAttention(tf.keras.layers.Layer):
     """Multi-head Attention block with relative position embeddings."""
 
     def __init__(
@@ -263,7 +263,7 @@ def call(self, x, training=False):
         return x
 
 
-class Block(tf.keras.layers.Layer):
+class ImageEncoderBlock(tf.keras.layers.Layer):
     """
     Transformer blocks with support for window attention and residual propagation.
     """
@@ -316,7 +316,7 @@ def __init__(
         norm_layer = norm_layer_factory(norm_layer)
 
         self.norm1 = norm_layer(name="norm1")
-        self.attn = Attention(
+        self.attn = RelPosAttention(
             fixed_input_size=self.fixed_input_size,
             embed_dim=self.embed_dim,
             nb_heads=self.nb_heads,
@@ -438,7 +438,7 @@ def __init__(
         self.pos_embed = None
 
         self.blocks = [
-            Block(
+            ImageEncoderBlock(
                 fixed_input_size=self.fixed_input_size,
                 embed_dim=self.embed_dim,
                 nb_heads=self.nb_heads,
diff --git a/tfimm/architectures/segment_anything/transformer.py b/tfimm/architectures/segment_anything/transformer.py
@@ -38,7 +38,7 @@ def __init__(
             )
             for j in range(self.nb_blocks)
         ]
-        self.final_attn_token_to_image = Attention(
+        self.final_attn_token_to_image = DownsampleAttention(
             embed_dim=self.embed_dim,
             nb_heads=self.nb_heads,
             downsample_rate=self.attention_downsample_rate,
@@ -131,12 +131,12 @@ def __init__(
 
         norm_layer = norm_layer_factory("layer_norm")
 
-        self.self_attn = Attention(
+        self.self_attn = DownsampleAttention(
             embed_dim=embed_dim, nb_heads=nb_heads, downsample_rate=1, name="self_attn"
         )
         self.norm1 = norm_layer(name="norm1")
 
-        self.cross_attn_token_to_image = Attention(
+        self.cross_attn_token_to_image = DownsampleAttention(
             embed_dim=embed_dim,
             nb_heads=nb_heads,
             downsample_rate=attention_downsample_rate,
@@ -153,7 +153,7 @@ def __init__(
         )
         self.norm3 = norm_layer(name="norm3")
 
-        self.cross_attn_image_to_token = Attention(
+        self.cross_attn_image_to_token = DownsampleAttention(
             embed_dim=embed_dim,
             nb_heads=nb_heads,
             downsample_rate=attention_downsample_rate,
@@ -194,7 +194,7 @@ def call(self, inputs, training=False):
         return q, k
 
 
-class Attention(tf.keras.layers.Layer):
+class DownsampleAttention(tf.keras.layers.Layer):
     """
     An attention layer that allows for downscaling the size of the embedding after
     projection to queries, keys, and values.
@@ -221,7 +221,7 @@ def __init__(
         )
 
     def _separate_heads(self, x: tf.Tensor):
-        b, m, c = tf.shape(x)  # (B, M, C)
+        b, m, c = tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2]  # (B, M, C)
         x = tf.reshape(x, (b, m, self.nb_heads, c // self.nb_heads))  # (B, M, Hd, C/Hd)
         x = tf.transpose(x, (0, 2, 1, 3))  # (B, Hd, M, C/Hd)
         return x