@@ -34,7 +34,9 @@
 from torch.testing import FileCheck
 from torch.testing._internal.common_cuda import SM80OrLater, SM90OrLater
 from torch.testing._internal.common_utils import (
+    IN_RE_WORKER,
     instantiate_parametrized_tests,
+    IS_FBCODE,
     parametrize,
 )
 from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
@@ -1030,29 +1032,21 @@ def test_get_max_alignment(self):
             m4, 4, "Wrong max alignment. Should have been 4 (due to float32 dtype)."
         )
 
-    @unittest.skipIf(not SM80OrLater, "need sm_80")
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
     @mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
     def test_standalone_runner(self):
         max_autotune_gemm_backends = "CUTLASS"
 
-        def mm(a, b):
-            return torch.mm(a, b.to(torch.half))
-
-        m, n, k = 128, 16, 128
-        a = torch.randn(m, k).cuda().half()
-        b = torch.randint(0, 5, (n, k), dtype=torch.int8).cuda().T
+        a = torch.randn(128, 16).cuda().half()
+        b = torch.randn(16, 128).cuda().half()
 
         with config.patch(
             {
                 "max_autotune": True,
-                "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 1,
-                "autotune_local_cache": True,
+                "cuda.cutlass_max_profiling_configs": 2,
                 "autotune_fallback_to_aten": False,
                 "cuda.generate_test_runner": True,  # put standalone runner in the generated code
-                "use_mixed_mm": True,
-                "mixed_mm_choice": "aten",
             }
         ):
             from tempfile import NamedTemporaryFile
@@ -1065,9 +1059,9 @@ def mm(a, b):
             # Run compilation, check results just in case, and save
             # CUTLASS-based generated code.
             with CUDACompileSourceCapturingContext() as ctx:
-                compiled = torch.compile(mm, dynamic=False)
+                compiled = torch.compile(torch.mm, dynamic=False)
 
-                expected = mm(a, b)
+                expected = torch.mm(a, b)
                 actual = compiled(a, b)
 
                 torch.testing.assert_close(actual, expected)
@@ -1092,14 +1086,12 @@ def mm(a, b):
                 Path(cu_file.name), Path(exe_file.name)
             )
 
-            if config.is_fbcode():
+            if IS_FBCODE:
                 # hack to bypass the following error:
                 # error while loading shared libraries: IX}: invalid mode for dlopen(): Invalid argument
                 platform_path = sysconfig.get_config_var("LIBDIR")
-                link_str = " ".join(
-                    [f"-L{platform_path}", "-Xlinker", f"-rpath={platform_path}"]
-                )
-                command = command.replace(link_str, " ")
+                cuda_path = os.path.realpath(os.path.join(platform_path, "libcuda.so"))
+                command = command.replace("-lcuda ", f"-L{cuda_path} ")
 
             repro_message = (
                 f"Reproduce with: {command}\n"
@@ -1108,11 +1100,12 @@ def mm(a, b):
             )
 
             retcode = os.system(command)
-            assert retcode == 0, repro_message
+            self.assertEqual(retcode, 0, repro_message)
 
             # Run the executable generated.
-            retcode = os.system(exe_file.name)
-            assert retcode == 0, repro_message
+            if not IS_FBCODE or not IN_RE_WORKER:
+                retcode = os.system(exe_file.name)
+                self.assertEqual(retcode, 0, repro_message)
 
             # Remove temporary files.
             os.remove(cu_file.name)
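
Note: for anyone who wants to try the patched configuration outside the test harness, here is a minimal sketch of the autotune flow this diff exercises. It uses only the config keys visible in the hunks above, assumes a CUDA build with the CUTLASS backend available, and omits the standalone-runner compile-and-execute steps that the test performs afterwards.

# Minimal sketch of the CUTLASS autotune path exercised above
# (assumes CUDA + CUTLASS; config keys taken from the diff).
import torch
from torch._inductor import config

a = torch.randn(128, 16).cuda().half()
b = torch.randn(16, 128).cuda().half()

with config.patch(
    {
        "max_autotune": True,
        "max_autotune_gemm_backends": "CUTLASS",
        "cuda.cutlass_max_profiling_configs": 2,
        "autotune_fallback_to_aten": False,
        # Ask Inductor to emit a standalone test runner alongside the kernel.
        "cuda.generate_test_runner": True,
    }
):
    compiled = torch.compile(torch.mm, dynamic=False)
    torch.testing.assert_close(compiled(a, b), torch.mm(a, b))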