[Inductor] No longer throw error in bmm out_dtype lowering due to template heuristics (pytorch#166922)

Lucaskabela · PaulZhang12 · web-flow · commit e6bcbbe17c64 · 2025-11-07T11:30:59.000-05:00
[Inductor] No longer throw error in bmm out_dtype lowering due to template heuristics (pytorch#166457). Fixes pytorch#165892. Pull Request resolved: pytorch#166457. Approved by: https://github.com/coconutruben. (Cherry picked from commit c2e3cc7.) Co-authored-by: PaulZhang12 <paulzhan@fb.com>
diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py
@@ -1479,6 +1479,29 @@ def mm_transpose_relu(a, b):
             # Check that contiguous transform was used
             FileCheck().check("contiguous_mm").run(code[0])
 
+    @unittest.skipIf(config.cpp_wrapper, "out_dtype override not supported for AOTI")
+    @unittest.skipIf(TEST_WITH_ROCM, "out_dtype override only available on NVIDIA")
+    def test_bmm_out_dtype(self):
+        def f(a, b):
+            return torch.bmm(a, b, out_dtype=torch.float32)
+
+        a = torch.randn(2, 3, 4, device=GPU_TYPE, dtype=torch.float16)
+        b = torch.randn(2, 4, 5, device=GPU_TYPE, dtype=torch.float16)
+        with config.patch(
+            max_autotune=True,
+            max_autotune_gemm_backends="TRITON",
+        ):
+            compiled_f = torch.compile(f)
+            with self.assertRaisesRegex(
+                torch._inductor.exc.InductorError,
+                r"LoweringException: NoValidChoicesError: No choices to select",
+            ):
+                out, code = run_and_get_code(compiled_f, a, b)
+
+        compiled_f = torch.compile(f)
+        out, code = run_and_get_code(compiled_f, a, b)
+        FileCheck().check("extern_kernels.bmm_dtype").run(code[0])
+
     def test_triton_template_generated_code_cache_key(self):
         generate_and_load_args = len(
             inspect.signature(
diff --git a/torch/_inductor/kernel/bmm.py b/torch/_inductor/kernel/bmm.py
@@ -208,9 +208,10 @@ def may_require_contiguous(t, meta_t):
             )
         )
 
-    if use_triton_template(layout, check_max_autotune=False):
+    if use_triton_template(layout, check_max_autotune=False) and (
+        out_dtype is None or out_dtype == mat1.get_dtype()
+    ):
         # TODO: add out_dtype support for Triton Template
-        assert out_dtype is None, "out_dtype is not supported for Triton"
 
         choices.extend(
             V.choices.get_mm_configs(kernel_inputs, layout, [bmm_template], name)

Original file line number	Diff line number	Diff line change
`@@ -208,9 +208,10 @@ def may_require_contiguous(t, meta_t):`
`208`	`208`	`)`
`209`	`209`	`)`
`210`	`210`
`211`		`- if use_triton_template(layout, check_max_autotune=False):`
	`211`	`+ if use_triton_template(layout, check_max_autotune=False) and (`
	`212`	`+ out_dtype is None or out_dtype == mat1.get_dtype()`
	`213`	`+ ):`
`212`	`214`	`# TODO: add out_dtype support for Triton Template`
`213`		`- assert out_dtype is None, "out_dtype is not supported for Triton"`
`214`	`215`
`215`	`216`	`choices.extend(`
`216`	`217`	`V.choices.get_mm_configs(kernel_inputs, layout, [bmm_template], name)`