Commit eb655a2

alexsamardzic authored and pytorchmergebot committed
Fix CUTLASS 2.x kernels for auto-tuning (pytorch#146755)
Pull Request resolved: pytorch#146755
Approved by: https://github.com/henrylhtsang
1 parent 683bb12 commit eb655a2

File tree

3 files changed: +10 −34 lines changed


test/inductor/test_cutlass_backend.py

Lines changed: 6 additions & 21 deletions

@@ -45,7 +45,6 @@
 HAS_CUDA = HAS_CUDA and not torch.version.hip
 SM80OrLater = SM80OrLater and not torch.version.hip
 SM90OrLater = SM90OrLater and not torch.version.hip
-SM80 = SM80OrLater and torch.cuda.get_device_capability() == (8, 0)


 def _get_path_without_sccache() -> str:
@@ -737,22 +736,14 @@ def forward(self, x, w):
         torch.testing.assert_close(expected, actual, atol=0.01, rtol=0.01)

     # TODO: Enable dynamic test cases when dynamic support is added.
-    @unittest.skipIf(True, "disabled due to broken on A100")
-    # error: TypeError: can't multiply sequence by non-int of type 'str'
-    @unittest.skipIf(not SM80, "need sm_80 exactly")
+    @unittest.skipIf(not SM80OrLater or SM90OrLater, "need sm_8x exactly")
     @parametrize("dynamic", (False,))
-    @parametrize("max_autotune_gemm_backends", ("CUTLASS", "CUTLASS,Triton,ATen"))
     @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
-    def test_max_autotune_cutlass_backend_mixed_mm(
-        self, dynamic: bool, max_autotune_gemm_backends: str
-    ):
+    def test_max_autotune_cutlass_backend_mixed_mm(self, dynamic: bool):
         """
         Make sure autotuning mm in sub processes work without crashes.
         """

-        if max_autotune_gemm_backends == "CUTLASS" and torch.version.hip:
-            return
-
         def mm(a, b):
             return torch.mm(a, b.to(torch.half))

@@ -768,7 +759,7 @@ def mm(a, b):
             {
                 "max_autotune": True,
                 "autotune_in_subproc": True,
-                "max_autotune_gemm_backends": max_autotune_gemm_backends,
+                "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_dir": _CUTLASS_DIR,
                 "cuda.cutlass_max_profiling_configs": 2,
                 "use_mixed_mm": True,
@@ -792,22 +783,16 @@ def mm(a, b):
         assert cutlass_kernels_count > 0

     # TODO: Enable dynamic test cases when dynamic support is added.
-    @unittest.skipIf(True, "disabled due to broken on A100")
-    # error: TypeError: can't multiply sequence by non-int of type 'str'
-    @unittest.skipIf(not SM80, "need sm_80 exactly")
+    @unittest.skipIf(not SM80OrLater or SM90OrLater, "need sm_8x exactly")
     @parametrize("dynamic", (False,))
-    @parametrize("max_autotune_gemm_backends", ("CUTLASS", "CUTLASS,Triton,ATen"))
     @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
     def test_max_autotune_cutlass_backend_sparse_semi_structured_mm(
-        self, dynamic: bool, max_autotune_gemm_backends: str
+        self, dynamic: bool
     ):
         """
         Make sure autotuning mm in sub processes work without crashes.
         """

-        if max_autotune_gemm_backends == "CUTLASS" and torch.version.hip:
-            return
-
         SparseSemiStructuredTensor._FORCE_CUTLASS = True

         def mm(a, b):
@@ -823,7 +808,7 @@ def mm(a, b):
             {
                 "max_autotune": True,
                 "autotune_in_subproc": True,
-                "max_autotune_gemm_backends": max_autotune_gemm_backends,
+                "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_dir": _CUTLASS_DIR,
                 "cuda.cutlass_max_profiling_configs": 2,
                 "autotune_local_cache": True,

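The new skipIf guard reads a little oddly at first. Below is a rough sketch of what it expresses; the flag definitions are illustrative reconstructions of what the test imports (assumed to be capability-tuple comparisons, as in torch.testing._internal.common_cuda), not the test's own code:

import torch

# Illustrative re-derivation of the flags (assumed semantics):
major, minor = torch.cuda.get_device_capability()
SM80OrLater = (major, minor) >= (8, 0)
SM90OrLater = (major, minor) >= (9, 0)

# skipIf(not SM80OrLater or SM90OrLater, "need sm_8x exactly") skips unless
# the device is Ampere-class (sm_80 through sm_89, e.g. the A100's sm_80):
runs_here = SM80OrLater and not SM90OrLater  # De Morgan of the skip condition

This replaces the old exact `== (8, 0)` comparison, widening the test to all sm_8x parts while still excluding Hopper.
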
torch/_inductor/codegen/cuda/cuda_kernel.py

Lines changed: 2 additions & 1 deletion

@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 from typing import Any, Callable, Literal, Optional, TYPE_CHECKING, Union

-from sympy import Expr
+from sympy import Expr, symbols

 from torch import dtype as torch_dtype
 from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu
@@ -404,6 +404,7 @@ def size(
         if len(sizes) == 0:
             return str(default_value)

+        sizes = [symbols(v) if isinstance(v, str) else v for v in sizes]
         val = sympy_product(sizes)
         return val

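This one-liner is what the deleted test comment (TypeError: can't multiply sequence by non-int of type 'str') traces back to: with dynamic shapes, some entries of sizes arrive as plain strings, and multiplying strings together fails. A minimal sketch of the failure mode, not the inductor implementation, assuming sympy_product reduces the list with * in the usual way:

import functools
import operator

from sympy import symbols

sizes = ["s0", "s1"]  # dynamic dims can arrive as plain strings
try:
    functools.reduce(operator.mul, sizes)
except TypeError as e:
    print(e)  # can't multiply sequence by non-int of type 'str'

# The fix lifts strings to sympy symbols first, so the product is symbolic:
sizes = [symbols(v) if isinstance(v, str) else v for v in sizes]
print(functools.reduce(operator.mul, sizes))  # s0*s1
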
torch/_inductor/codegen/cuda/gemm_template.py

Lines changed: 2 additions & 12 deletions

@@ -155,9 +155,6 @@
 PT_EXPORT {{kernel_call_signature}} {
   try {
   int64_t B = {{kernel.size(Y, 0, -3, default_value=1)}};
-  int64_t M = {{kernel.size(X, -2)}};
-  int64_t K = {{kernel.size(W, -2)}};
-  int64_t N = {{kernel.size(W, -1)}};
   using ElementComputeEpilogue = {{instance_type}}::ElementAccumulator;
   using coord_t = cutlass::gemm::GemmCoord::Index;
   static cutlass::KernelHardwareInfo hw_info;
@@ -176,13 +173,6 @@

   // check for null pointers after workspace size, since querying workspace size doesn't require valid data pointers
 #ifndef CUTLASS_BACKEND_DISABLE_CHECKS
-  {{kernel.check_not_null(X)}}
-  {{kernel.check_not_null(W)}}
-  {{kernel.check_not_null(Bias)}}
-  {{kernel.check_not_null(Meta)}}
-  {{kernel.check_not_null(Y)}}
-
-
   {
     auto status = gemm_op.can_implement(arguments);
     CUTLASS_CHECK(status);
@@ -278,7 +268,7 @@
     {
       static_cast<coord_t>({{M}}),
       static_cast<coord_t>({{N}}),
-      static_cast<coord_t>(K),
+      static_cast<coord_t>(2 * K),
     },  // GemmCoord problem_size
     X_ref,  // TensorRef<ElementA const, LayoutA> ref_A
     W_ref,  // TensorRef<ElementB const, LayoutB> ref_B
@@ -1382,7 +1372,7 @@ def _are_inputs_layout_compatible(self, layouts: list[Layout]) -> bool:
         A_size = [int(i) for i in A_layout.size]
         B_size = [int(i) for i in B_layout.size]
         K = max(A_size[1], B_size[0])
-        return (K == A_size[1] or K == 2 * A_size[0]) and K == B_size[0]
+        return (K == A_size[1] or K == 2 * A_size[1]) and K == B_size[0]

     def _shape_match(
         self,

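The _are_inputs_layout_compatible fix and the 2 * K problem-size change appear to follow the same 2:4 semi-structured convention: the sparse operand A is stored compressed along K, so its second dimension holds only K/2 elements, while B keeps all K rows, and the logical K must be reconstructed by doubling. A worked example with hypothetical shapes (not taken from the test suite):

# Dense mm:       A is (M, K),      B is (K, N)  ->  K == A_size[1]
# 2:4 sparse mm:  A is (M, K // 2), B is (K, N)  ->  K == 2 * A_size[1]
M, K, N = 48, 64, 16
A_size = [M, K // 2]  # compressed sparse operand: (48, 32)
B_size = [K, N]       # dense operand: (64, 16)

k = max(A_size[1], B_size[0])  # 64, recovered from B's leading dim
assert k == 2 * A_size[1] and k == B_size[0]  # fixed check accepts this layout
assert k != 2 * A_size[0]  # the old check compared K against 2*M by mistake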