Revert "[cutlass backend][BE][ez] Make matmul layouts be row x column (pytorch#156656)"

pytorchmergebot · pytorchmergebot · commit d3efd732348f · 2025-06-30T21:16:04.000Z
This reverts commit 84c588e. Reverted pytorch#156656 on behalf of https://github.com/henrylhtsang because it broke fbcode A100 tests (see the comment on pytorch#156656).
diff --git a/test/inductor/test_cutlass_backend.py b/test/inductor/test_cutlass_backend.py
@@ -261,7 +261,7 @@ def test_cutlass_backend_subproc_mm(self):
         M, N, K = 4096, 2048, 25728
 
         a = torch.randn(M, K).cuda().half()
-        b = torch.randn(N, K).cuda().half().t()
+        b = torch.randn(K, N).cuda().half()
 
         with config.patch(
             {
@@ -289,7 +289,7 @@ def test_cutlass_backend_subproc_addmm(self, shape_combo):
         M, N, K = 4096, 2048, 25728
 
         a = torch.randn(M, K).cuda().half()
-        b = torch.randn(N, K).cuda().half().t()
+        b = torch.randn(K, N).cuda().half()
 
         x_shapes = [
             (M, N),
@@ -326,7 +326,7 @@ def test_cutlass_backend_subproc_bmm(self):
         B, M, N, K = 10, 4096, 2048, 25728
 
         a = torch.randn(B, M, K).cuda().half()
-        b = torch.randn(B, N, K).cuda().half().permute(0, 2, 1)
+        b = torch.randn(B, K, N).cuda().half()
 
         with config.patch(
             {
@@ -358,8 +358,8 @@ def forward(self, a, b, c):
 
         model = MyModel()
         a = torch.randn(128, 16).cuda().half()
-        b = torch.randn(128, 16).cuda().half().t()
-        c = torch.randn(512, 16).cuda().half().t()
+        b = torch.randn(16, 128).cuda().half()
+        c = torch.randn(16, 512).cuda().half()
 
         with config.patch(
             {
@@ -400,8 +400,8 @@ def forward(self, a, b, c):
 
         model = MyModel()
         a = torch.randn(128, 16).cuda().half()
-        b = torch.randn(128, 16).cuda().half().t()
-        c = torch.randn(512, 16).cuda().half().t()
+        b = torch.randn(16, 128).cuda().half()
+        c = torch.randn(16, 512).cuda().half()
 
         with config.patch(
             {
@@ -465,7 +465,7 @@ def forward(self, a, b):
         model = MyModel().cuda()
 
         inputs = [
-            (torch.randn(M, K).cuda().to(dtype), torch.randn(N, K).cuda().to(dtype).t())
+            (torch.randn(M, K).cuda().to(dtype), torch.randn(K, N).cuda().to(dtype))
             for (M, N, K) in shapes
         ]
 
@@ -633,7 +633,7 @@ def forward(self, x, a, b):
                 (
                     torch.randn(x_shape(M, N)).cuda().to(dtype),
                     torch.randn(M, K).cuda().to(dtype),
-                    torch.randn(N, K).cuda().to(dtype).t(),
+                    torch.randn(K, N).cuda().to(dtype),
                 )
                 for (M, N, K) in shapes
             ]
@@ -744,7 +744,7 @@ def mm(a, b):
             return a @ b
 
         a = torch.randn(128, 16).cuda().half()
-        b = torch.randn(128, 16).cuda().half().t()
+        b = torch.randn(16, 128).cuda().half()
 
         with config.patch(
             {
@@ -770,7 +770,7 @@ def mm(a, b):
                 ),
             ):
                 a = torch.randn(M, K).cuda().half()
-                b = torch.randn(N, K).cuda().half().t()
+                b = torch.randn(K, N).cuda().half()
                 Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b)
                 Y = mm(a, b)
                 # we need relaxed numerical limits due to the sheer size of the
@@ -935,7 +935,7 @@ def forward(self, x, w):
             }
 
             x = torch.randn(M, K).cuda().half()
-            w = torch.randn(N, K).cuda().half().t()
+            w = torch.randn(K, N).cuda().half()
 
             actual = AOTIRunnerUtil.run(
                 model,
@@ -973,7 +973,7 @@ def forward(self, x, w):
             }
 
             x = torch.randn(M, K).cuda().half()
-            w = torch.randn(N, K).cuda().half().t()
+            w = torch.randn(K, N).cuda().half()
 
             actual = AOTIRunnerUtil.run(
                 model,
@@ -1003,7 +1003,7 @@ def forward(self, x, w):
             M, N, K = 200, 5216, 10_432
 
             x = torch.randn(M, K).cuda().half()
-            w = torch.randn(N, K).cuda().half().t()
+            w = torch.randn(K, N).cuda().half()
 
             actual = AOTIRunnerUtil.run(
                 model,
@@ -1032,7 +1032,7 @@ def mm(a, b):
         mask = torch.tensor([0, 0, 1, 1]).tile(m, k // 4).cuda().half()
         a = torch.rand(m, k).cuda().half() * mask
         a_sparse = to_sparse_semi_structured(a)
-        b = torch.rand(n, k).cuda().half().t()
+        b = torch.rand(k, n).cuda().half()
 
         with config.patch(
             {
@@ -1335,7 +1335,7 @@ def test_cutlass_presets(
 
         M, N, K = (128, 128, 16)
         A = torch.randn(M, K).cuda().half()
-        B = torch.randn(N, K).cuda().half().t()
+        B = torch.randn(K, N).cuda().half()
 
         def select_no_algorithm(*args, **kwargs):
             raise NoValidChoicesError