@@ -1184,6 +1184,7 @@ def forward(
             return attn_output, attn_weights, past_key_value
         return attn_output, attn_weights
 
+
 class patched_SamMaskDecoder(torch.nn.Module):
     _PATCHES_ = ["forward"]
     _PATCHED_CLASS_ = transformers.models.sam.modeling_sam.SamMaskDecoder
@@ -1223,10 +1224,11 @@ def forward(
         output_tokens = output_tokens.repeat(batch_size, point_batch_size, 1, 1)
 
         # torch.cond rewrites the if-else logic to handle empty sparse_prompt_embeddings
-        # torch.any is needed to avoid data-dependent control flow
+        # torch.any is needed to avoid data-dependent control flow
         # with sparse_prompt_embeddings.sum().item() != 0
         def sparse_prompt_embeddings_is_not_empty(output_tokens, sparse_prompt_embeddings):
             return torch.cat((output_tokens, sparse_prompt_embeddings), dim=2)
+
         def sparse_prompt_embeddings_is_empty(output_tokens, sparse_prompt_embeddings):
             return output_tokens.clone()
 
@@ -1242,7 +1244,9 @@ def sparse_prompt_embeddings_is_empty(output_tokens, sparse_prompt_embeddings):
         # Expand per-image data in batch direction to be per-point
         image_embeddings = image_embeddings + dense_prompt_embeddings
         image_embeddings = image_embeddings.repeat_interleave(point_batch_size, 0)
-        image_positional_embeddings = image_positional_embeddings.repeat_interleave(point_batch_size, 0)
+        image_positional_embeddings = image_positional_embeddings.repeat_interleave(
+            point_batch_size, 0
+        )
 
         # Run the transformer, image_positional_embedding are consumed
         point_embedding, image_embeddings, attentions = self.transformer(
@@ -1272,8 +1276,12 @@ def sparse_prompt_embeddings_is_empty(output_tokens, sparse_prompt_embeddings):
         hyper_in = torch.stack(hyper_in_list, dim=2)
 
         _, num_channels, height, width = upscaled_embedding.shape
-        upscaled_embedding = upscaled_embedding.reshape(batch_size, point_batch_size, num_channels, height * width)
-        masks = (hyper_in @ upscaled_embedding).reshape(batch_size, point_batch_size, -1, height, width)
+        upscaled_embedding = upscaled_embedding.reshape(
+            batch_size, point_batch_size, num_channels, height * width
+        )
+        masks = (hyper_in @ upscaled_embedding).reshape(
+            batch_size, point_batch_size, -1, height, width
+        )
 
         # Generate mask quality predictions
         iou_pred = self.iou_prediction_head(iou_token_out)
@@ -1289,8 +1297,8 @@ def sparse_prompt_embeddings_is_empty(output_tokens, sparse_prompt_embeddings):
         outputs = (masks, iou_pred)
 
         if output_attentions:
-            outputs = outputs + (attentions,)
+            outputs = outputs + (attentions,)  # noqa: RUF005
         else:
-            outputs = outputs + (None,)
+            outputs = outputs + (None,)  # noqa: RUF005
 
-        return outputs
+        return outputs
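
For readers unfamiliar with the pattern described by the comment in the second hunk: the upstream SamMaskDecoder guards the concatenation with `if sparse_prompt_embeddings.sum().item() != 0:`, which is data-dependent Python control flow and cannot be traced by torch.export. The patch moves both outcomes into small branch functions and selects between them with torch.cond, using torch.any(...) so the predicate stays a tensor. Below is a minimal, self-contained sketch of that pattern, not the patched decoder itself: the module name is invented, the non-empty branch uses a shape-preserving addition instead of the real concatenation so both branches return identically shaped tensors, and it assumes a recent PyTorch (2.4+) where torch.cond runs eagerly and under torch.export.

import torch


# Hypothetical illustration of the torch.cond pattern used in the patch above
# (names and the addition in the non-empty branch are invented for the sketch).
class FoldPromptsIfAny(torch.nn.Module):
    def forward(self, output_tokens, sparse_prompt_embeddings):
        def is_not_empty(output_tokens, sparse_prompt_embeddings):
            # prompts present: fold them into the tokens
            # (shape-preserving stand-in for the real torch.cat along dim=2)
            return output_tokens + sparse_prompt_embeddings.sum()

        def is_empty(output_tokens, sparse_prompt_embeddings):
            # no prompts: pass the tokens through unchanged
            return output_tokens.clone()

        # torch.any keeps the predicate a tensor, so no .item() call
        # (data-dependent Python branching) is needed and both branches
        # can be traced by torch.export.
        return torch.cond(
            torch.any(sparse_prompt_embeddings != 0),
            is_not_empty,
            is_empty,
            (output_tokens, sparse_prompt_embeddings),
        )


tokens = torch.randn(2, 3, 5, 8)
prompts = torch.zeros(2, 3, 2, 8)
module = FoldPromptsIfAny()
print(module(tokens, prompts).shape)  # torch.Size([2, 3, 5, 8]), empty branch taken
ep = torch.export.export(module, (tokens, prompts))  # both branches are captured

The key point mirrored from the diff is that the branch bodies take all tensors they need as explicit arguments, and the predicate never leaves tensor-land, which is what lets the exported graph keep both paths instead of baking in whichever branch the example inputs happened to hit.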