
Commit a60de07

Use jagged layout NT through the MaskDecoder (#45)
1 parent 92efc1d commit a60de07

3 files changed: +18 -30 lines changed
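
For context: a jagged-layout NestedTensor (NT) batches tensors that agree on every
dimension except one ragged dimension, which here is the per-image number of prompts.
The sketch below is not code from this repo; it only illustrates what such a tensor
looks like, assuming a recent PyTorch build with torch.nested jagged-layout support
and invented shapes.

    import torch

    # Two images with 3 and 5 prompt embeddings respectively (256-dim each).
    per_image = [torch.randn(3, 256), torch.randn(5, 256)]
    nt = torch.nested.nested_tensor(per_image, layout=torch.jagged)

    # dim 0 is the image batch, dim 1 is ragged (3 vs. 5), dim 2 is the embedding.
    print(nt.shape)  # prints a symbolic size along the lines of torch.Size([2, j1, 256])

The changes below let a tensor of this kind flow through the MaskDecoder end to end,
instead of being unpacked into its dense buffer and re-wrapped by hand at the end.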

segment_anything_fast/modeling/common.py

Lines changed: 2 additions & 2 deletions
@@ -36,8 +36,8 @@ def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
         self.eps = eps
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        u = x.mean(1, keepdim=True)
-        s = (x - u).pow(2).mean(1, keepdim=True)
+        u = x.mean(-3, keepdim=True)
+        s = (x - u).pow(2).mean(-3, keepdim=True)
         x = (x - u) / torch.sqrt(s + self.eps)
         x = self.weight[:, None, None] * x + self.bias[:, None, None]
         return x
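
The only change in LayerNorm2d is moving the channel reduction from dim 1 to dim -3,
i.e. counting the channel dimension from the end. A short sketch (not repo code; plain
dense tensors with invented shapes) of why that is equivalent for the usual
B x C x H x W input and stays correct once an extra leading, possibly ragged,
prompt dimension is present:

    import torch

    # 4-D case: dim 1 and dim -3 name the same (channel) dimension.
    x4 = torch.randn(2, 8, 16, 16)  # B x C x H x W
    assert torch.allclose(x4.mean(1, keepdim=True), x4.mean(-3, keepdim=True))

    # 5-D case (extra leading prompt dim): dim -3 is still the channel dim,
    # whereas dim 1 would now wrongly average over prompts.
    x5 = torch.randn(2, 3, 8, 16, 16)  # B x N_prompts x C x H x W
    u = x5.mean(-3, keepdim=True)
    assert u.shape == (2, 3, 1, 16, 16)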

segment_anything_fast/modeling/mask_decoder.py

Lines changed: 10 additions & 19 deletions
@@ -180,38 +180,29 @@ def predict_masks_nested(
             torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0))
         tokens = torch.cat([output_tokens, sparse_prompt_embeddings], dim=2)
 
-        # TODO: remove this and make sure offsets are propagated
-        offsets = tokens.offsets()
-
         src = dense_prompt_embeddings + image_embeddings.unsqueeze(1)
         pos_src = torch.zeros_like(src) + image_pe
-        b, c, h, w = src.values().shape
+        h, w = src.shape[-2:]
 
         # Run the transformer
-        # TODO: Run the full NTs through instead of just the buffers
-        hs, src = self.transformer(src.values(), pos_src.values(), tokens.values())
-        iou_token_out = hs[:, 0, :]
-        mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]
+        hs, src = self.transformer(src, pos_src, tokens)
+        iou_token_out = hs[..., 0, :]
+        mask_tokens_out = hs[..., 1 : (1 + self.num_mask_tokens), :]
 
         # Upscale mask embeddings and predict masks using the mask tokens
-        src = src.transpose(1, 2).view(b, c, h, w)
+        src = src.transpose(-2, -1).unflatten(-1, (h, w))
         upscaled_embedding = self.output_upscaling(src)
         hyper_in_list: List[torch.Tensor] = []
         for i in range(self.num_mask_tokens):
-            hyper_in_list.append(self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]))
-        hyper_in = torch.stack(hyper_in_list, dim=1)
-        b, c, h, w = upscaled_embedding.shape
-        masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)
+            hyper_in_list.append(self.output_hypernetworks_mlps[i](mask_tokens_out[..., i, :]))
+        hyper_in = torch.stack(hyper_in_list, dim=-2)
+        h, w = upscaled_embedding.shape[-2:]
+        masks = (hyper_in @ upscaled_embedding.flatten(-2)).unflatten(-1, (h, w))
 
         # Generate mask quality predictions
         iou_pred = self.iou_prediction_head(iou_token_out)
 
-        # TODO: No need to create NT by hand once we propagate it properly through Transformer
-        from torch.nested._internal.nested_tensor import NestedTensor
-        num_tensors = offsets.shape[0] - 1
-        masks_nt = NestedTensor(masks, offsets)
-        iou_pred_nt = NestedTensor(iou_pred, offsets)
-        return masks_nt, iou_pred_nt
+        return masks, iou_pred
 
 
 # Lightly adapted from
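
This is the core of the change: tokens, src and pos_src now stay nested through the
transformer, the manual offsets round-trip via .values() and a hand-built NestedTensor
is gone, and each shape-sensitive op is rewritten with ellipsis or negative dims so it
does not care how many leading batch dims precede it. A sketch of that indexing pattern
on dense tensors (not repo code; shapes invented for illustration):

    import torch

    # Ellipsis indexing selects the same slice with or without extra leading dims.
    hs = torch.randn(2, 7, 256)      # B x N_tokens x C
    assert torch.equal(hs[:, 0, :], hs[..., 0, :])

    hs5 = torch.randn(2, 3, 7, 256)  # B x N_prompts x N_tokens x C
    iou_token_out = hs5[..., 0, :]   # the same expression handles the extra dim
    mask_tokens_out = hs5[..., 1:5, :]
    assert iou_token_out.shape == (2, 3, 256)

    # transpose(-2, -1) + unflatten(-1, (h, w)) replaces the old
    # transpose(1, 2).view(b, c, h, w) and also tolerates leading dims.
    src = torch.randn(2, 3, 64 * 64, 256)                # ... x HW x C
    src = src.transpose(-2, -1).unflatten(-1, (64, 64))  # ... x C x H x W
    assert src.shape == (2, 3, 256, 64, 64)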

segment_anything_fast/modeling/transformer.py

Lines changed: 6 additions & 9 deletions
@@ -79,9 +79,8 @@ def forward(
           torch.Tensor: the processed image_embedding
         """
         # BxCxHxW -> BxHWxC == B x N_image_tokens x C
-        bs, c, h, w = image_embedding.shape
-        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
-        image_pe = image_pe.flatten(2).permute(0, 2, 1)
+        image_embedding = image_embedding.flatten(-2).transpose(-1, -2)
+        image_pe = image_pe.flatten(-2).transpose(-1, -2)
 
         # Prepare queries
         queries = point_embedding
@@ -206,14 +205,12 @@ def __init__(
         self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
 
     def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
-        b, n, c = x.shape
-        x = x.reshape(b, n, num_heads, c // num_heads)
-        return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head
+        x = x.unflatten(-1, (num_heads, -1))
+        return x.transpose(-3, -2)  # B... x N_heads x N_tokens x C_per_head
 
     def _recombine_heads(self, x: Tensor) -> Tensor:
-        b, n_heads, n_tokens, c_per_head = x.shape
-        x = x.transpose(1, 2)
-        return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C
+        x = x.transpose(-3, -2)
+        return x.flatten(-2)
 
     def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
         # Input projections
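
With the head split and merge written via unflatten/flatten and negative dims, the same
two helpers serve both the dense path and inputs carrying extra leading batch dims.
A small round-trip sketch of the new formulation on a dense tensor (not repo code;
shapes invented):

    import torch

    def separate_heads(x, num_heads):
        # ... x N_tokens x C -> ... x N_heads x N_tokens x C_per_head
        return x.unflatten(-1, (num_heads, -1)).transpose(-3, -2)

    def recombine_heads(x):
        # ... x N_heads x N_tokens x C_per_head -> ... x N_tokens x C
        return x.transpose(-3, -2).flatten(-2)

    x = torch.randn(2, 7, 256)             # B x N_tokens x C
    heads = separate_heads(x, num_heads=8)
    assert heads.shape == (2, 8, 7, 32)
    assert torch.equal(recombine_heads(heads), x)  # exact round trip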
