
Commit 9c78fb9

dmpots authored and pytorchmergebot committed
Fix assertion failure in gemm template lowering (pytorch#146353)
Summary: This commit fixes a crash in the gemm template lowering caused by hitting an [assert](https://github.com/pytorch/pytorch/blob/fd515e4f59bfa0ac9faa5185b7a02f3222c4cd08/torch/_inductor/codegen/common.py#L1181) that a buffer was previously removed. The assert fires because the first gemm lowering uses a local accumulation buffer, which adds the original buffer name to the `removed_buffers` set; the next gemm lowering then uses the global buffer for accumulation, but that buffer's name is already in `removed_buffers`. The fix is to give each output buffer name a unique suffix so that separate gemm lowerings no longer trigger the assert.

Differential Revision: D68814625

Pull Request resolved: pytorch#146353

Approved by: https://github.com/leslie-fang-intel, https://github.com/frost-intel, https://github.com/hl475
1 parent: 6cb2f73 · commit: 9c78fb9
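The failure mode is easy to reproduce in miniature. Below is a minimal standalone sketch, not the actual inductor code: `removed_buffers` matches the set named in the summary, while `lower_gemm` is a hypothetical stand-in for the template lowering.

# Minimal sketch of the bug: a registry asserts that a buffer marked as
# removed is never used again. Two gemm lowerings sharing one output
# buffer name trip the assert; unique suffixes avoid the collision.
removed_buffers: set[str] = set()

def lower_gemm(buf_name: str, use_local_acc: bool) -> None:
    if use_local_acc:
        # Local accumulation: the original output buffer goes unused,
        # so its name is recorded as removed.
        removed_buffers.add(buf_name)
    else:
        # Global accumulation: the output buffer is written directly,
        # so it must not have been marked removed earlier.
        assert buf_name not in removed_buffers, f"{buf_name} was removed"

# Before the fix, both lowerings shared the name "buf_out":
#   lower_gemm("buf_out", use_local_acc=True)
#   lower_gemm("buf_out", use_local_acc=False)  # AssertionError
# After the fix, each template instance gets a unique suffix:
lower_gemm("buf_out0", use_local_acc=True)
lower_gemm("buf_out1", use_local_acc=False)  # passes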

File tree

2 files changed: +60 -2 lines changed

test/inductor/test_cpu_select_algorithm.py
torch/_inductor/codegen/cpp_template.py

test/inductor/test_cpu_select_algorithm.py

Lines changed: 57 additions & 0 deletions

@@ -2299,6 +2299,63 @@ def forward(self, x, w):
         self.assertEqual(actual, expected, atol=atol, rtol=rtol)
         self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 2)
 
+    @patches
+    @torch.no_grad
+    @unittest.skipIf(not TEST_MKL, "Test requires MKL")
+    @set_num_threads(1)  # avoid k_slicing to make the test deterministic
+    @parametrize(
+        "out_features1",
+        (
+            8,
+            16,
+            24,
+            32,
+            48,
+        ),
+    )
+    @dtypes(torch.float)
+    def test_local_and_global_accumulator(self, out_features1, dtype):
+        batch_size = 256
+        in_features = 64
+        out_features = 129
+        in_features1 = 128
+        bias = True
+        try:
+            try:
+                from . import test_aot_inductor_utils
+            except ImportError:
+                import test_aot_inductor_utils
+        except Exception:
+            # skip this UT if import failed
+            return
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+                self.linear = torch.nn.Linear(in_features, out_features, bias)
+                self.linear1 = torch.nn.Linear(in_features1, out_features1, bias)
+
+            def forward(self, x):
+                y = self.linear(x)
+                view = torch.ops.aten.view.default(y, [-1, in_features1])
+                return self.linear1(view)
+
+        counters.clear()
+        x = torch.randn(batch_size, in_features).to(dtype=dtype)
+        mod = M().to(dtype=dtype).eval()
+        with verify(dtype) as (atol, rtol), torch.no_grad():
+            expected = mod(
+                x,
+            )
+            actual = test_aot_inductor_utils.AOTIRunnerUtil.run(
+                "cpu",
+                mod,
+                (x,),
+            )
+            self.assertEqual(actual, expected, atol=atol, rtol=rtol)
+            self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 2)
+
 
 @dynamo_config.patch({"dynamic_shapes": True, "assume_static_by_default": False})
 class _DynamicShapesTestBase(BaseTestSelectAlgorithm):
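For reference, the shape choices in the new test make the aten view valid: the first linear produces a (256, 129) output, which reinterprets as (258, 128) for the second linear, since 256 × 129 = 33024 = 258 × 128. A quick eager-mode check of just the shapes, using out_features1 = 48 (one of the parametrized values):

import torch

x = torch.randn(256, 64)
y = torch.nn.Linear(64, 129, bias=True)(x)        # (256, 129)
view = torch.ops.aten.view.default(y, [-1, 128])  # (258, 128)
out = torch.nn.Linear(128, 48, bias=True)(view)   # (258, 48)
print(view.shape, out.shape)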

torch/_inductor/codegen/cpp_template.py

Lines changed: 3 additions & 2 deletions

@@ -34,8 +34,9 @@ def __init__(
     ) -> None:
         super().__init__(name)
         self.input_nodes = input_nodes
+        self.index = next(self.index_counter)
         self.output_node: Union[ir.Buffer, list[ir.Buffer]] = ir.Buffer(
-            name="buf_out", layout=layout
+            name=f"buf_out{self.index}", layout=layout
         )
         self.layout = layout
         self.num_threads = num_threads
@@ -75,7 +76,7 @@ def generate(self, **kwargs):
         # since in cpp kernel, we bind it to C long
         extra_args = tuple(ctypes.c_ulonglong(x) for x in extra_args)
 
-        kernel_hash_name = f"cpp_{self.name}_{next(self.index_counter)}"
+        kernel_hash_name = f"cpp_{self.name}_{self.index}"
 
         # Create the BenchmarkRequest for CPP
         bmreq = CppBenchmarkRequest(
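The effect of the patch: each template instance now draws its index once, in `__init__`, and reuses it for both the output buffer name and the kernel hash name, rather than consuming the counter again in `generate`. A minimal sketch of the pattern, assuming `index_counter` is a class-level `itertools.count` (implied by the `next()` calls); the `Template` class here is illustrative, not the real `CppTemplate`:

import itertools

class Template:
    # Shared by all instances, so each instance gets a distinct index.
    index_counter = itertools.count()

    def __init__(self, name: str) -> None:
        self.name = name
        self.index = next(self.index_counter)
        # Unique per instance: "buf_out0", "buf_out1", ...
        self.output_name = f"buf_out{self.index}"

    def kernel_hash_name(self) -> str:
        # Reuses the stored index instead of drawing a new one.
        return f"cpp_{self.name}_{self.index}"

a, b = Template("gemm"), Template("gemm")
assert a.output_name != b.output_name                # no buffer-name collision
assert a.kernel_hash_name() != b.kernel_hash_name()  # hash names stay distinct too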
