Commit ce4a097

Revert "Added swizzle searching, disabled fp16 accum, and enabled ping-pong for cutlass (pytorch#144829)"
This reverts commit 5508444. Reverted pytorch#144829 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](pytorch#144829 (comment)))
1 parent 527101f commit ce4a097

3 files changed: +8 -12 lines


torch/_inductor/codegen/cuda/cutlass_utils.py

Lines changed: 1 addition & 1 deletion
@@ -245,7 +245,7 @@ def get_accumulator_dtype(
             return torch_dtype
         else:
             return torch.float
-    if torch_dtype in (torch.float16, torch.bfloat16, torch.float):
+    if torch_dtype in (torch.bfloat16, torch.float):
         return torch.float
     if torch_dtype == torch.int8:
         return torch.int32
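
For context, a minimal Python sketch of the dtype-selection order this hunk restores. Only the lines visible in the hunk come from the source; the shape of the fp16 branch above it and the trailing error are assumptions based on the visible context lines.

import torch

def get_accumulator_dtype_sketch(torch_dtype, allow_fp16_accum=False):
    # Sketch only: with the revert, float16 no longer falls through to the
    # float32-accumulation check below; the half-precision branch above the
    # hunk (assumed here) decides between fp16 and fp32 accumulation.
    if torch_dtype == torch.float16:
        return torch_dtype if allow_fp16_accum else torch.float
    if torch_dtype in (torch.bfloat16, torch.float):  # line 248 after the revert
        return torch.float
    if torch_dtype == torch.int8:
        return torch.int32
    raise NotImplementedError(f"Unsupported dtype: {torch_dtype}")

assert get_accumulator_dtype_sketch(torch.bfloat16) is torch.float
assert get_accumulator_dtype_sketch(torch.float16, allow_fp16_accum=True) is torch.float16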

torch/_inductor/codegen/cuda/gemm_template.py

Lines changed: 6 additions & 10 deletions
@@ -46,7 +46,7 @@
     CUTLASS_TRACE_HOST("Query result for SM count per device: " << hw_info.sm_count);
   }
   {{instance_type}}::Arguments arguments;
-  {{template.render_gemm_arguments(argument_template, epilogue_template, should_swap_xw, swizzle,
+  {{template.render_gemm_arguments(argument_template, epilogue_template, should_swap_xw,
       X, W, Bias, Y, alpha, beta, kernel, epilogue_args)}}
   {{instance_type}} gemm_op;
   if (workspace_size) {
@@ -118,7 +118,6 @@
     {{epilogue_arguments}},
     hw_info
   };
-  arguments.scheduler.max_swizzle_size = {{swizzle}};
 """

 # Jinja template for Cutlass 3.x GEMM Kernel arguments if epilogue fusion is applied,
@@ -502,11 +501,11 @@ def _add_cutlass_gemm_choices(

         ops = self.gen_ops()
         for name, op in ops:
-            for swizzle in (1, 2, 4, 8):
-                description = f"{name} swizzle={swizzle}"
-                self.maybe_append_choice(
-                    choices, description=description, op=op, swizzle=swizzle
-                )
+            self.maybe_append_choice(
+                choices,
+                description=name,
+                op=op,
+            )
         if len(ops) == 0:
             input_layouts = [node.get_layout() for node in input_nodes]
             input_strides = [node.get_stride() for node in input_nodes]
@@ -953,7 +952,6 @@ def render( # type: ignore[override]
             Bias=Bias,
             epilogue_template=epilogue_template,
             argument_template=argument_template,
-            swizzle=kwargs["swizzle"],
             should_swap_xw=should_swap_xw,
             template=self,
             kernel=kernel,
@@ -1218,7 +1216,6 @@ def render_gemm_arguments(
         argument_template: str,
         epilogue_template: str,
         should_swap_xw: bool,
-        swizzle: int,
         X: IRNode,
         W: IRNode,
         Bias: IRNode,
@@ -1264,7 +1261,6 @@ def render_gemm_arguments(
                 M="M",
                 N="N",
                 epilogue_args=epilogue_args,
-                swizzle=swizzle,
             )
         assert epilogue_template is not None

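
To make the effect of the third hunk concrete, here is an illustrative sketch (not the Inductor API itself) of how the autotuning candidate count changes: the deleted loop emitted four choices per generated op, one per swizzle size in (1, 2, 4, 8), while the reverted code appends a single choice per op. The op name below is hypothetical.

# Illustrative only; the op name is made up and maybe_append_choice is not called here.
ops = [("cutlass3x_sm90_gemm_f16", object())]  # hypothetical (name, op) pairs from gen_ops()

choices_before = [
    (f"{name} swizzle={swizzle}", op, swizzle)
    for name, op in ops
    for swizzle in (1, 2, 4, 8)  # the deleted sweep: 4 autotune candidates per op
]
choices_after = [(name, op) for name, op in ops]  # after the revert: 1 candidate per op

assert len(choices_before) == 4 * len(choices_after)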
torch/_inductor/config.py

Lines changed: 1 addition & 1 deletion
@@ -1253,7 +1253,7 @@ class cuda:
     # Set this to "pingpong" to avoid numerical issues
     # caused by the op ordering of the "pingpong" memory access
     # pattern used by some Cutlass Kernels.
-    cutlass_op_denylist_regex: Optional[str] = None
+    cutlass_op_denylist_regex: Optional[str] = "pingpong"


 class rocm:
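
A minimal usage sketch of the restored default, assuming the torch._inductor.config layout shown in the diff; the override at the end is only an example of opting back in.

import torch._inductor.config as inductor_config

# After this commit, CUTLASS ops whose names match "pingpong" are denied by default.
print(inductor_config.cuda.cutlass_op_denylist_regex)  # expected: "pingpong"

# Opting back in to ping-pong kernels (accepting the numerical-ordering caveat
# from the comment above) means clearing the denylist before compiling:
inductor_config.cuda.cutlass_op_denylist_regex = None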
