 if TYPE_CHECKING:
     from types import ModuleType
 
-    from torch._inductor.dtype_propagation import DtypePropagationOpsHandler
-
     from ..ir import IRNode
 
 log = logging.getLogger(__name__)
@@ -743,12 +741,6 @@ def update_on_args(self, name, args, kwargs):
                 break
 
 
-def get_dtype_handler() -> DtypePropagationOpsHandler:
-    from torch._inductor.dtype_propagation import DtypePropagationOpsHandler
-
-    return DtypePropagationOpsHandler()
-
-
 def maybe_upcast_float32(convert_output: bool = True):
     """
     Codegen helper to upcast arguments to float32, depending on the config and dtype.
@@ -775,17 +767,27 @@ def wrapped(*args, **kwargs) -> str:
             upcast_args = [maybe_upcast_arg(arg) for arg in args]
             upcast_kwargs = {key: maybe_upcast_arg(val) for key, val in kwargs.items()}
 
+            # Infer the output dtype from the inputs.
+            # This promotes to the largest input type.
+            all_args = args + tuple(kwargs.values())
+            input_dtypes = [
+                var.dtype
+                for var in all_args
+                if isinstance(var, CSEVariable) and var.dtype is not None
+            ]
+            result_dtype = (
+                functools.reduce(torch.promote_types, input_dtypes)
+                if len(input_dtypes) > 0
+                else None
+            )
+
             # Call the decorated function, optionally downcasting the result.
             result = func(*upcast_args, **upcast_kwargs)
-            any_needs_upcast = convert_output and any(
-                needs_upcast(var) for var in itertools.chain(args, kwargs.values())
+            needs_downcast = (
+                convert_output
+                and any(needs_upcast(var) for var in all_args)
+                and result_dtype not in (torch.float32, None)
             )
-            result_dtype = (
-                None
-                if not any_needs_upcast
-                else getattr(get_dtype_handler(), func.__name__)(*args, **kwargs)
-            )
-            needs_downcast = result_dtype not in (torch.float32, None)
             downcast_string = (
                 f".to({triton_type(result_dtype)})"
                 if needs_downcast and result_dtype is not None
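
Note: result_dtype is now inferred directly from the input CSEVariable dtypes by folding torch.promote_types over them, instead of querying DtypePropagationOpsHandler. A minimal standalone sketch of the promotion behavior this relies on (plain PyTorch, not Inductor code):

    import functools
    import torch

    # Mixed half-precision inputs promote to float32, so no downcast
    # suffix is emitted for that case.
    print(functools.reduce(torch.promote_types, [torch.float16, torch.bfloat16]))
    # torch.float32

    # Uniform float16 inputs promote to float16, so the float32 result of
    # the wrapped op gets a ".to(...)" downcast appended (assuming
    # triton_type(torch.float16) renders as "tl.float16").
    print(functools.reduce(torch.promote_types, [torch.float16, torch.float16]))
    # torch.float16
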
@@ -908,25 +910,6 @@ def constant(cls, value, dtype):
     def abs(x):
         return f"tl_math.abs({x})"
 
-    # TODO - register these ops as having divergent dtype
-    # output if doing graph pass to remove consecutive casts
-
-    @staticmethod
-    def truediv(x, y):
-        out = f"({x} / {y})"
-        out_dtype = get_dtype_handler().truediv(x, y)
-        if out_dtype in (torch.float16, torch.float32):
-            out = f"{out}.to({triton_type(out_dtype)})"
-        return out
-
-    @staticmethod
-    def mod(x, y):
-        out = f"({x} % {y})"
-        out_dtype = get_dtype_handler().mod(x, y)
-        if out_dtype in (torch.float16, torch.float32):
-            out = f"{out}.to({triton_type(out_dtype)})"
-        return out
-
     @staticmethod
     @maybe_upcast_float32()
     def libdevice_abs(x):
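
With the handler-based truediv/mod overrides removed, any needed downcast is handled uniformly by the maybe_upcast_float32 decorator, as with libdevice_abs above. A rough sketch of the pattern (libdevice_sin is only illustrative here, not part of this commit):

    @staticmethod
    @maybe_upcast_float32()
    def libdevice_sin(x):
        # The decorator upcasts half-precision args to float32 before this
        # runs, then appends a ".to(...)" cast to the emitted string when
        # the promoted input dtype is narrower than float32.
        return f"libdevice.sin({x})"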