Skip to content

Commit 9bd202e

Browse files
authored
Add stride check for attn_mask on non-cpu device (pytorch#158618)
Add stride check for attn_mask on non-cpu device (pytorch#158424) Fixes pytorch#158374 Pull Request resolved: pytorch#158424 Approved by: https://github.com/Valentine233, https://github.com/drisspg, https://github.com/atalman
1 parent f2b69a0 commit 9bd202e

File tree

3 files changed

+60
-11
lines changed

3 files changed

+60
-11
lines changed

aten/src/ATen/native/transformers/sdp_utils_cpp.h

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <ATen/core/Tensor.h>
77
#include <ATen/core/grad_mode.h>
88
#include <ATen/native/DispatchStub.h>
9+
#include <c10/core/DeviceType.h>
910
#include <c10/core/ScalarType.h>
1011

1112
#include <c10/util/Exception.h>
@@ -503,17 +504,27 @@ inline bool check_last_dim_stride_equals_1_dense(sdp_params const& params, bool
503504
if (ignore_singleton_dim){
504505
qkv_strides_equal_1 = qkv_strides_equal_1 || params.query.sym_size(-1) == 1;
505506
}
506-
if (!qkv_strides_equal_1) {
507+
bool is_cpu = params.query.device().type() == c10::DeviceType::CPU;
508+
bool mask_stride_equal_1 = params.attn_mask.has_value()
509+
? params.attn_mask.value().sym_stride(-1) == 1
510+
: true;
511+
bool mask_stride_valid = is_cpu ? true : mask_stride_equal_1;
512+
if (!(qkv_strides_equal_1 && mask_stride_valid)) {
507513
if (debug) {
508-
TORCH_WARN(
509-
"All fused kernels require the last dimension of the input to have stride 1. ",
510-
"Got Query.stride(-1): ",
511-
params.query.sym_stride(-1),
512-
", Key.stride(-1): ",
513-
params.key.sym_stride(-1),
514-
", Value.stride(-1): ",
515-
params.value.sym_stride(-1),
516-
" instead.");
514+
std::ostringstream message;
515+
message
516+
<< "All fused kernels require the last dimension of the input to have stride 1. ";
517+
message << "Got Query.stride(-1): " << params.query.sym_stride(-1)
518+
<< ", Key.stride(-1): " << params.key.sym_stride(-1)
519+
<< ", Value.stride(-1): " << params.value.sym_stride(-1);
520+
521+
if (params.attn_mask.has_value()) {
522+
message
523+
<< ", Attn_mask.stride(-1): "
524+
<< params.attn_mask.value().sym_stride(-1)
525+
<< " (GPU backends require attn_mask's last dimension to have stride 1 while the CPU does not).";
526+
}
527+
TORCH_WARN(message.str());
517528
}
518529

519530
return false;

test/inductor/test_fused_attention.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1023,7 +1023,7 @@ def dot_prod_attention(
10231023
return attn_weights.matmul(value), key, value
10241024

10251025
tensor_shape = (4, 2, 16, 32)
1026-
attn_mask = torch.randn((1, 1, 1, 2), dtype=torch.float, device=self.device)
1026+
attn_mask = torch.randn((1, 1, 2, 2), dtype=torch.float, device=self.device)
10271027
args = [
10281028
torch.randn(tensor_shape, device=self.device),
10291029
torch.randn(tensor_shape, device=self.device),
@@ -1036,6 +1036,16 @@ def dot_prod_attention(
10361036
has_dropout=False,
10371037
check_train=False,
10381038
)
1039+
# test attn_mask with stride of last dim != 1
1040+
attn_mask_ = attn_mask.transpose(2, 3)
1041+
args[3] = attn_mask_
1042+
self._check_common(
1043+
dot_prod_attention,
1044+
args1=args,
1045+
has_dropout=False,
1046+
check_train=False,
1047+
contains=self.device == "cpu",
1048+
)
10391049

10401050
def _test_sdpa_rewriter_23(self):
10411051
def dot_prod_attention(

test/test_transformers.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1618,6 +1618,34 @@ def test_invalid_last_dim_stride(self, device, kernel: SDPBackend):
16181618
self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention(
16191619
q, k, v, None, 0.0, False))
16201620

1621+
@onlyCUDA
1622+
@unittest.skipIf(
1623+
not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION
1624+
or not PLATFORM_SUPPORTS_CUDNN_ATTENTION,
1625+
"Efficient or cuDNN Attention was not built for this system",
1626+
)
1627+
@parametrize("kernel", [SDPBackend.EFFICIENT_ATTENTION, SDPBackend.CUDNN_ATTENTION])
1628+
def test_mask_invalid_last_dim_stride(self, device, kernel):
1629+
with sdpa_kernel(backends=[kernel]):
1630+
dtype = torch.float16
1631+
make_tensor = partial(torch.rand, device=device, dtype=dtype)
1632+
size = SdpaShape(2, 2, 8, 8)
1633+
q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
1634+
attn_mask = make_tensor((2, 2, 8, 8))
1635+
# Passing in an attn_mask with last dim stride not equal to 1 will error
1636+
attn_mask.as_strided_(size, [2, 2, 2, 2])
1637+
1638+
with self.assertWarnsRegex(
1639+
UserWarning,
1640+
"GPU backends require attn_mask's last dimension to have stride 1 while the CPU does not",
1641+
):
1642+
self.assertRaises(
1643+
RuntimeError,
1644+
lambda: torch.nn.functional.scaled_dot_product_attention(
1645+
q, k, v, attn_mask, 0.0, False
1646+
),
1647+
)
1648+
16211649
@onlyCUDA
16221650
@unittest.skipIf(not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Does not support SDPA or pre-SM80 hardware")
16231651
@parametrize("fused_kernel", [SDPBackend.EFFICIENT_ATTENTION])

0 commit comments

Comments
 (0)