Commit 5a1c7c4

alexsamardzic authored and pytorchmergebot committed
Fix standalone runner for CUTLASS auto-tuning backend (pytorch#146764)
Pull Request resolved: pytorch#146764
Approved by: https://github.com/henrylhtsang
ghstack dependencies: pytorch#146755
1 parent eb655a2 commit 5a1c7c4
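For context, the standalone runner is opt-in: it is only emitted when `cuda.generate_test_runner` is enabled together with the CUTLASS autotuning backend. A minimal sketch of such a setup, not part of this commit — the CUTLASS checkout path is a placeholder and the config keys mirror the new test added below:

```python
# Hypothetical usage sketch: compile a matmul with the CUTLASS backend and ask
# Inductor to embed the standalone run_standalone() entry point in the
# generated .cu source. Requires a CUDA GPU and a CUTLASS checkout.
import torch
import torch._inductor.config as config

def mm(a, b):
    return torch.mm(a, b)

a = torch.randn(128, 64, device="cuda", dtype=torch.half)
b = torch.randn(64, 32, device="cuda", dtype=torch.half)

with config.patch(
    {
        "max_autotune": True,
        "max_autotune_gemm_backends": "CUTLASS",
        "cuda.cutlass_dir": "/path/to/cutlass",  # placeholder path
        "cuda.generate_test_runner": True,  # emit run_standalone() in the generated code
    }
):
    torch.compile(mm)(a, b)
```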

3 files changed (+144, -11 lines)

test/inductor/test_cutlass_backend.py

Lines changed: 77 additions & 1 deletion
@@ -762,9 +762,10 @@ def mm(a, b):
                 "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_dir": _CUTLASS_DIR,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "use_mixed_mm": True,
                 "autotune_local_cache": True,
                 "autotune_fallback_to_aten": False,
+                "use_mixed_mm": True,
+                "mixed_mm_choice": "aten",  # to disable Triton
             }
         ):
             Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b)
@@ -972,6 +973,81 @@ def test_get_max_alignment(self):
             m4, 4, "Wrong max alignment. Should have been 4 (due to float32 dtype )."
         )
 
+    @unittest.skipIf(not SM80OrLater, "need sm_80")
+    @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
+    def test_standalone_runner(self):
+        max_autotune_gemm_backends = "CUTLASS"
+        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
+
+        def mm(a, b):
+            return torch.mm(a, b.to(torch.half))
+
+        m, n, k = 128, 16, 128
+        a = torch.randn(m, k).cuda().half()
+        b = torch.randint(0, 5, (n, k), dtype=torch.int8).cuda().T
+
+        with config.patch(
+            {
+                "max_autotune": True,
+                "autotune_in_subproc": True,
+                "max_autotune_gemm_backends": max_autotune_gemm_backends,
+                "cuda.cutlass_dir": _CUTLASS_DIR,
+                "cuda.cutlass_max_profiling_configs": 1,
+                "autotune_local_cache": True,
+                "autotune_fallback_to_aten": False,
+                "cuda.generate_test_runner": True,  # put standalone runner in the generated code
+                "use_mixed_mm": True,
+                "mixed_mm_choice": "aten",
+            }
+        ):
+            import os
+            from tempfile import NamedTemporaryFile
+
+            from torch._inductor.codegen.cuda.cutlass_utils import (
+                cuda_standalone_runner_compile_command,
+                CUDACompileSourceCapturingContext,
+            )
+
+            # Run compilation, check results just in case, and save
+            # CUTLASS-based generated code.
+            with CUDACompileSourceCapturingContext() as ctx:
+                compiled = torch.compile(mm, dynamic=False)
+
+                expected = mm(a, b)
+                actual = compiled(a, b)
+
+                torch.testing.assert_close(actual, expected)
+
+            sources = ctx.sources
+
+            assert len(sources) >= 1
+
+            # Get names for temporary source and executable files.
+            cu_file = NamedTemporaryFile("w", suffix=".cu", delete=False)
+            cu_file.close()
+            exe_file = NamedTemporaryFile("w", suffix="", delete=False)
+            exe_file.close()
+
+            # Save the generated code into the .cu file.
+            with open(cu_file.name, "w") as file:
+                file.write(sources[0])
+
+            # Get command to compile .cu file, and run the
+            # compilation.
+            command = cuda_standalone_runner_compile_command(
+                cu_file.name, exe_file.name
+            )
+            retcode = os.system(command)
+            assert retcode == 0
+
+            # Run the executable generated.
+            retcode = os.system(exe_file.name)
+            assert retcode == 0
+
+            # Remove temporary files.
+            os.remove(cu_file.name)
+            os.remove(exe_file.name)
+
 
 if __name__ == "__main__":
     from torch._inductor.utils import is_big_gpu

torch/_inductor/codegen/cuda/cutlass_utils.py

Lines changed: 3 additions & 2 deletions
@@ -383,10 +383,11 @@ def my_compile(source_code, dst_file_ext):
         self._compile_patch = mock.patch(
             "torch._inductor.codecache.CUDACodeCache.compile", my_compile
         )
-        return self._compile_patch.__enter__(*args, **kwargs)  # type: ignore[union-attr]
+        self._compile_patch.__enter__(*args, **kwargs)  # type: ignore[union-attr]
+        return self
 
     def __exit__(self, *args, **kwargs):
-        return self._compile_patch.__exit__(*args, **kwargs)  # type: ignore[union-attr]
+        self._compile_patch.__exit__(*args, **kwargs)  # type: ignore[union-attr]
 
 
 def cuda_standalone_runner_compile_command(srcpath: Path, exepath: Path):
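The point of this hunk: `__enter__` previously returned the value of the wrapped `mock.patch.__enter__`, so `with CUDACompileSourceCapturingContext() as ctx:` bound `ctx` to the mock object rather than to the capturing context, and `ctx.sources` was unreachable. Returning `self` restores the expected behavior. A minimal illustration of the pattern with a stand-in class (not the real one):

```python
# Toy sketch: a context manager that wraps another context manager must return
# *self* from __enter__ if callers are expected to read attributes off the
# "as" target, e.g. ctx.sources.
from unittest import mock

class CapturingContext:
    def __init__(self):
        self.sources = []
        self._patch = None

    def __enter__(self):
        # Replace print() with a mock whose side effect records each argument.
        self._patch = mock.patch("builtins.print", side_effect=self.sources.append)
        self._patch.__enter__()
        return self  # returning the patch's value instead would hide .sources

    def __exit__(self, *args):
        self._patch.__exit__(*args)

with CapturingContext() as ctx:
    print("captured")  # recorded in ctx.sources instead of being printed

assert ctx.sources == ["captured"]
```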

torch/_inductor/codegen/cuda/gemm_template.py

Lines changed: 64 additions & 8 deletions
@@ -37,7 +37,7 @@
 extern "C" {
 PT_EXPORT {{kernel_call_signature}} {
   try {
-  int64_t B = {{kernel.size(Y, 0, -3, default_value=1)}};
+  int B = {{kernel.size(Y, 0, -3, default_value=1)}};
   using ElementComputeEpilogue = {{instance_type}}::ElementAccumulator;
   using coord_t = cutlass::gemm::GemmCoord::Index;
   static cutlass::KernelHardwareInfo hw_info;
@@ -154,7 +154,7 @@
 extern "C" {
 PT_EXPORT {{kernel_call_signature}} {
   try {
-  int64_t B = {{kernel.size(Y, 0, -3, default_value=1)}};
+  int B = {{kernel.size(Y, 0, -3, default_value=1)}};
   using ElementComputeEpilogue = {{instance_type}}::ElementAccumulator;
   using coord_t = cutlass::gemm::GemmCoord::Index;
   static cutlass::KernelHardwareInfo hw_info;
@@ -266,8 +266,8 @@
   // Initialize GemmSparse arguments.
   arguments = {
     {
-      static_cast<coord_t>({{M}}),
-      static_cast<coord_t>({{N}}),
+      static_cast<coord_t>(M),
+      static_cast<coord_t>(N),
       static_cast<coord_t>(2 * K),
     }, // GemmCoord problem_size
     X_ref, // TensorRef<ElementA const, LayoutA> ref_A
@@ -304,20 +304,43 @@
   if (block.size()<=0) return false;
   Element scope_max(static_cast<Element>(max)), scope_min(static_cast<Element>(min));
   cutlass::reference::device::BlockFillRandomUniform(
-    block.get(), block.size(), seed, scope_max, scope_min, 0);
+    (Element*)block.get(), block.size(), seed, scope_max, scope_min, 0);
 
   return true;
 }
 
+{% if Meta is defined and Meta is not none %}
+template <class Element>
+bool initialize_block_meta(
+  cutlass::DeviceAllocation<Element>& block,
+  uint64_t seed) {
+  if (block.size()<=0) return false;
+  cutlass::reference::device::BlockFillRandomSparseMeta(
+    (Element*)block.get(), block.size(), seed, {{instance_type}}::kMetaSizeInBits);
+  return true;
+}
+{% endif %}
+
 extern "C" int run_standalone(uint64_t seed, int repetitions) {
   std::cout << "Starting GEMM Standalone test run with seed " << seed << std::endl;
   size_t workspace_size = 0;
   size_t* workspace_size_ptr = &workspace_size;
 
+  int M = {{kernel.get_layout_args()[0]}};
+  int N = {{kernel.get_layout_args()[1]}};
+  int K = {{kernel.get_layout_args()[2]}};
+  int lda = {{kernel.get_layout_args()[3]}};
+  int ldb = {{kernel.get_layout_args()[4]}};
+  int ldc = {{kernel.get_layout_args()[5]}};
+  int ldd = {{kernel.get_layout_args()[6]}};
+
   using ElementA = {{kernel.cutlass_dtype(X)}};
   using ElementB = {{kernel.cutlass_dtype(W)}};
   using ElementC = {{kernel.cutlass_dtype(Bias, default_dtype='uint8_t')}}; // may not be void
   using ElementD = {{kernel.cutlass_dtype(Y)}};
+{% if Meta is defined and Meta is not none %}
+  using ElementE = {{kernel.cutlass_dtype(Meta)}};
+{% endif %}
 
   cutlass::DeviceAllocation<ElementA> X_data({{kernel.max_valid_index(X)+1}});
   initialize_block(X_data, seed++);
@@ -326,6 +349,10 @@
   cutlass::DeviceAllocation<ElementC> Bias_data({{kernel.max_valid_index(Bias)+1}});
   initialize_block(Bias_data, seed++);
   cutlass::DeviceAllocation<ElementD> Y_data({{kernel.max_valid_index(Y)+1}});
+{% if Meta is defined and Meta is not none %}
+  cutlass::DeviceAllocation<ElementE> Meta_data({{kernel.max_valid_index(Meta)+1}});
+  initialize_block_meta(Meta_data, seed++);
+{% endif %}
 
   cutlass::DeviceAllocation<uint8_t> workspace_data;
   // Call once with workspace_size_ptr set to get workspace size
@@ -466,6 +493,14 @@ def _get_extra_inputs_and_names(
     ) -> tuple[Optional[Buffer], list[Optional[Buffer]], list[str]]:
         raise NotImplementedError
 
+    @abstractmethod
+    def _update_arg_names_for_test_call_statement(
+        self,
+        arg_names: list[str],
+        input_nodes: list[Buffer],
+    ) -> list[str]:
+        raise NotImplementedError
+
     def _add_cutlass_gemm_choices(
         self,
         choices: list[ChoiceCaller],
@@ -980,13 +1015,14 @@ def test_call_statement(
         """
         _, __, arg_types = kernel.args.cpp_argdefs()
         arg_names = [name.strip() for name in names_str.strip().split(",")]
-        if input_nodes[2] is None:
-            del arg_names[2]
+        arg_names = self._update_arg_names_for_test_call_statement(
+            arg_names, input_nodes
+        )
         arguments = [
             f"(({arg_type}){arg_name}_data.get())"
             for arg_type, arg_name in zip(arg_types, arg_names)
         ]
-        return f"{kernel.kernel_name}({', '.join(arguments)}, workspace_size_ptr, (uint8_t*)workspace_data.get(), 0);"
+        return f"{kernel.kernel_name}({', '.join(arguments)}, M, N, K, lda, ldb, ldc, ldd, workspace_size_ptr, (uint8_t*)workspace_data.get(), 0);"  # noqa: B950
 
 
 class CUTLASS3xGemmTemplate(CUTLASSGemmTemplate):
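Taken together with the subclass overrides in the following hunks, this refactor turns the argument pruning into a per-template hook, while the rendered call now always forwards the runtime problem size and leading dimensions. A hedged sketch of the pattern with simplified stand-in classes (not the real templates):

```python
# Simplified sketch of the hook pattern introduced above; class and argument
# names here are illustrative only.
from abc import ABC, abstractmethod
from typing import Optional

class GemmTemplateBase(ABC):
    kernel_name = "cutlass_gemm_kernel"  # placeholder name

    @abstractmethod
    def _update_arg_names_for_test_call_statement(
        self, arg_names: list[str], input_nodes: list[Optional[object]]
    ) -> list[str]:
        raise NotImplementedError

    def test_call_statement(self, arg_names, input_nodes) -> str:
        # Let each template decide which optional inputs (Bias, Meta) to drop.
        arg_names = self._update_arg_names_for_test_call_statement(arg_names, input_nodes)
        args = [f"{name}_data.get()" for name in arg_names]
        # Runtime sizes and strides are now passed explicitly to the kernel call.
        return (
            f"{self.kernel_name}({', '.join(args)}, M, N, K, lda, ldb, ldc, ldd, "
            "workspace_size_ptr, (uint8_t*)workspace_data.get(), 0);"
        )

class DenseGemmTemplate(GemmTemplateBase):
    def _update_arg_names_for_test_call_statement(self, arg_names, input_nodes):
        if input_nodes[2] is None:  # no Bias
            del arg_names[2]
        return arg_names

class SparseGemmTemplate(GemmTemplateBase):
    def _update_arg_names_for_test_call_statement(self, arg_names, input_nodes):
        if input_nodes[3] is None:  # no Meta
            del arg_names[3]
        if input_nodes[2] is None:  # no Bias
            del arg_names[2]
        return arg_names

print(DenseGemmTemplate().test_call_statement(["X", "W", "Bias", "Y"], [1, 1, None, 1]))
```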
@@ -1206,6 +1242,15 @@ def _get_extra_inputs_and_names(
         names: list[str] = []
         return (Bias, inputs, names)
 
+    def _update_arg_names_for_test_call_statement(
+        self,
+        arg_names: list[str],
+        input_nodes: list[Buffer],
+    ) -> list[str]:
+        if input_nodes[2] is None:
+            del arg_names[2]
+        return arg_names
+
     def render_gemm_arguments(
         self,
         argument_template: str,
@@ -1482,6 +1527,17 @@ def _get_extra_inputs_and_names(
         names = ["Meta"]
         return (Bias, inputs, names)
 
+    def _update_arg_names_for_test_call_statement(
+        self,
+        arg_names: list[str],
+        input_nodes: list[Buffer],
+    ) -> list[str]:
+        if input_nodes[3] is None:
+            del arg_names[3]
+        if input_nodes[2] is None:
+            del arg_names[2]
+        return arg_names
+
     def render_gemm_arguments(
         self,
         instance_type: str,
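A side note on the deletion order in the sparse override above: index 3 (Meta) is removed before index 2 (Bias) so that the second deletion still points at the intended element. A toy illustration:

```python
# Deleting the higher index first keeps the lower index valid.
arg_names = ["X", "W", "Bias", "Meta"]
del arg_names[3]  # drop Meta first
del arg_names[2]  # index 2 still refers to Bias
assert arg_names == ["X", "W"]
```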

0 commit comments
