
Commit 135a2d4

eellison authored and pytorchmergebot committed
Update low prec codegen for div/mod (pytorch#142350)
Div/mod in fp16/bf16 requires a downcast to preserve its inputs' dtypes.

Pull Request resolved: pytorch#142350
Approved by: https://github.com/blaine-rister
1 parent 15aee8e commit 135a2d4
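
For context on the change: with `triton.codegen_upcast_to_fp32` disabled, the result of `/` and `%` on fp16/bf16 operands would otherwise come back in a wider dtype, so Inductor now casts it back down. A minimal sketch of the behavior the new test below exercises (an illustration, not part of the commit; assumes a CUDA device):

import torch
from torch._inductor import config as inductor_config

def div_mod(x, y):
    # Eager mode keeps fp16 for both % and /; the compiled kernel must match.
    return x % y, x / y

x = torch.rand(8, dtype=torch.float16, device="cuda")
y = torch.rand(8, dtype=torch.float16, device="cuda")

# Disable the usual upcast-everything-to-fp32 codegen path so the kernel
# really operates on fp16 values.
with inductor_config.patch("triton.codegen_upcast_to_fp32", False):
    out_mod, out_div = torch.compile(div_mod)(x, y)

# With the downcast added by this commit, the compiled outputs stay fp16.
assert out_mod.dtype == torch.float16 and out_div.dtype == torch.float16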

File tree

3 files changed: +48 −18 lines changed

    test/inductor/test_op_dtype_prop.py
    test/inductor/test_pattern_matcher.py
    torch/_inductor/codegen/triton.py

test/inductor/test_op_dtype_prop.py
Lines changed: 12 additions & 0 deletions

@@ -212,6 +212,18 @@ def test_binary_math_mixed_precision(self):
         # There should be no downcast, since the input is promoted to float32.
         self.assertNotIn(".to(tl.float16)", code)

+    @config.patch("test_configs.runtime_triton_dtype_assert", True)
+    @config.patch("triton.codegen_upcast_to_fp32", False)
+    def test_downcast_div_mod(self):
+        def fn(x, y):
+            return x % y, x / y
+
+        x, y = (torch.rand([8], dtype=torch.float16, device="cuda") for _ in range(2))
+
+        out, code = run_and_get_code(torch.compile(fn), x, y)
+        FileCheck().check("static_assert").check_same(".dtype").run(code[0])
+        self.assertEqual(fn(x, y), out)
+
     @config.patch("test_configs.runtime_triton_dtype_assert", True)
     def test_constant(self):
         def fn():
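
For anyone reproducing the new test by hand, here is a small sketch (mirroring the test above rather than any official recipe) that prints the generated Triton source so the `.to(tl.float16)` downcast and the dtype `static_assert` that FileCheck looks for can be inspected directly:

import torch
from torch._inductor import config as inductor_config
from torch._inductor.utils import run_and_get_code

def fn(x, y):
    return x % y, x / y

x, y = (torch.rand([8], dtype=torch.float16, device="cuda") for _ in range(2))

with inductor_config.patch(
    {
        "test_configs.runtime_triton_dtype_assert": True,
        "triton.codegen_upcast_to_fp32": False,
    }
):
    out, code = run_and_get_code(torch.compile(fn), x, y)

# code[0] is the generated kernel source; it should contain the downcasts
# emitted by this commit plus static_assert checks on the result dtypes.
print(code[0])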

test/inductor/test_pattern_matcher.py
Lines changed: 1 addition & 0 deletions

@@ -175,6 +175,7 @@ def fn2(a, b, c):
     @skipIfXpu
     @skipCUDAIf(not SM80OrLater, "need sm_80")
     @inductor_config.patch(force_fuse_int_mm_with_mul=True)
+    @inductor_config.patch("test_configs.runtime_triton_dtype_assert", True)
     def test_fused_int_mm_mul_epilogue(self):
         def fn1(a, b, c):
             return (

torch/_inductor/codegen/triton.py
Lines changed: 35 additions & 18 deletions

@@ -104,6 +104,8 @@
 if TYPE_CHECKING:
     from types import ModuleType

+    from torch._inductor.dtype_propagation import DtypePropagationOpsHandler
+
     from ..ir import IRNode

 log = logging.getLogger(__name__)
@@ -741,6 +743,12 @@ def update_on_args(self, name, args, kwargs):
             break


+def get_dtype_handler() -> DtypePropagationOpsHandler:
+    from torch._inductor.dtype_propagation import DtypePropagationOpsHandler
+
+    return DtypePropagationOpsHandler()
+
+
 def maybe_upcast_float32(convert_output: bool = True):
     """
     Codegen helper to upcast arguments to float32, depending on the config and dtype.
@@ -767,27 +775,17 @@ def wrapped(*args, **kwargs) -> str:
             upcast_args = [maybe_upcast_arg(arg) for arg in args]
             upcast_kwargs = {key: maybe_upcast_arg(val) for key, val in kwargs.items()}

-            # Infer the output dtype from the inputs.
-            # This promotes to the largest input type.
-            all_args = args + tuple(kwargs.values())
-            input_dtypes = [
-                var.dtype
-                for var in all_args
-                if isinstance(var, CSEVariable) and var.dtype is not None
-            ]
-            result_dtype = (
-                functools.reduce(torch.promote_types, input_dtypes)
-                if len(input_dtypes) > 0
-                else None
-            )
-
             # Call the decorated function, optionally downcasting the result.
             result = func(*upcast_args, **upcast_kwargs)
-            needs_downcast = (
-                convert_output
-                and any(needs_upcast(var) for var in all_args)
-                and result_dtype not in (torch.float32, None)
+            any_needs_upcast = convert_output and any(
+                needs_upcast(var) for var in itertools.chain(args, kwargs.values())
             )
+            result_dtype = (
+                None
+                if not any_needs_upcast
+                else getattr(get_dtype_handler(), func.__name__)(*args, **kwargs)
+            )
+            needs_downcast = result_dtype not in (torch.float32, None)
             downcast_string = (
                 f".to({triton_type(result_dtype)})"
                 if needs_downcast and result_dtype is not None
@@ -910,6 +908,25 @@ def constant(cls, value, dtype):
     def abs(x):
         return f"tl_math.abs({x})"

+    # TODO - register these ops as having divergent dtype
+    # output if doing graph pass to remove consecutive casts
+
+    @staticmethod
+    def truediv(x, y):
+        out = f"({x} / {y})"
+        out_dtype = get_dtype_handler().truediv(x, y)
+        if out_dtype in (torch.float16, torch.bfloat16):
+            out = f"{out}.to({triton_type(out_dtype)})"
+        return out
+
+    @staticmethod
+    def mod(x, y):
+        out = f"({x} % {y})"
+        out_dtype = get_dtype_handler().mod(x, y)
+        if out_dtype in (torch.float16, torch.bfloat16):
+            out = f"{out}.to({triton_type(out_dtype)})"
+        return out
+
     @staticmethod
     @maybe_upcast_float32()
     def libdevice_abs(x):
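
To make the new truediv/mod overrides concrete, here is a standalone sketch of the string they produce; the helper below is a hypothetical stand-in for Inductor's internal triton_type() mapping and dtype propagation, not the real API:

import torch

# Hypothetical stand-in for triton_type(): map a low-precision torch dtype to
# its Triton spelling; anything else means "no downcast needed".
_TL_NAME = {torch.float16: "tl.float16", torch.bfloat16: "tl.bfloat16"}

def sketch_truediv(x_expr: str, y_expr: str, out_dtype: torch.dtype) -> str:
    # Mirrors the new override: emit the division, then append a .to(...)
    # downcast only when dtype propagation reports an fp16/bf16 output.
    out = f"({x_expr} / {y_expr})"
    tl_name = _TL_NAME.get(out_dtype)
    if tl_name is not None:
        out = f"{out}.to({tl_name})"
    return out

print(sketch_truediv("tmp0", "tmp1", torch.float16))  # (tmp0 / tmp1).to(tl.float16)
print(sketch_truediv("tmp0", "tmp1", torch.float32))  # (tmp0 / tmp1)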
