Commit 4808787

Add fallbacks for unary ops that don't support fp16 (#361)
1 parent ca78cd2 commit 4808787

File tree: 4 files changed (+245, -8 lines)

helion/_compiler/inductor_lowering_extra.py

Lines changed: 42 additions & 0 deletions
@@ -7,12 +7,54 @@
 from typing import Generator

 import torch
+from torch._inductor.ir import TensorBox
+from torch._inductor.lowering import lowerings as original_lowerings
 from torch._inductor.lowering import to_dtype
 from torch._prims_common import ELEMENTWISE_TYPE_PROMOTION_KIND

 inductor_lowering_dispatch: dict[Callable[..., Any] | str, Callable[..., Any]] = {}


+def create_fp16_to_fp32_unary_fallback_lowering(
+    original_op: Callable[..., object],
+) -> Callable[..., object]:
+    """Create a lowering that converts fp16/bfloat16 inputs to fp32 before calling the operation."""
+
+    @functools.wraps(original_op)
+    def fp32_fallback_lowering(x: object) -> object:
+        if isinstance(x, TensorBox) and (original_dtype := x.get_dtype()) in (
+            torch.float16,
+            torch.bfloat16,
+        ):
+            x_fp32 = to_dtype(x, torch.float32)
+            result_fp32 = original_op(x_fp32)
+            assert isinstance(result_fp32, TensorBox)
+            return to_dtype(result_fp32, original_dtype)
+        return original_op(x)
+
+    return fp32_fallback_lowering
+
+
+# Operations that need fp32 fallbacks due to libdevice/tl_math limitations
+FP32_FALLBACK_OPS_UNARY = [
+    torch.ops.aten.rsqrt.default,
+    torch.ops.aten.sqrt.default,
+    torch.ops.aten.sin.default,
+    torch.ops.aten.cos.default,
+    torch.ops.aten.log.default,
+    torch.ops.aten.tanh.default,
+    torch.ops.aten.log1p.default,
+    torch.ops.aten.expm1.default,
+    torch.ops.aten.exp.default,
+]
+
+# Register fp32 fallback lowerings for ops that don't support fp16/bfloat16
+for op in FP32_FALLBACK_OPS_UNARY:
+    inductor_lowering_dispatch[op] = create_fp16_to_fp32_unary_fallback_lowering(
+        original_lowerings[op]
+    )
+
+
 @contextlib.contextmanager
 def patch_inductor_lowerings() -> Generator[None, Any, Any]:
     """Context manager to temporarily patch the inductor lowering table.

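The new lowering is essentially a compile-time version of a simple eager-mode wrapper: upcast a half-precision input to fp32, run the op, then cast the result back to the original dtype. A standalone sketch of that pattern in plain PyTorch (illustrative only; fp32_unary_fallback is a made-up helper, not part of this repo):

import functools

import torch


def fp32_unary_fallback(op):
    """Wrap a unary tensor op so fp16/bf16 inputs are computed in fp32."""

    @functools.wraps(op)
    def wrapper(x: torch.Tensor) -> torch.Tensor:
        if x.dtype in (torch.float16, torch.bfloat16):
            # Upcast, compute, downcast -- the same shape as the inductor lowering above.
            return op(x.to(torch.float32)).to(x.dtype)
        return op(x)

    return wrapper


rsqrt_half = fp32_unary_fallback(torch.rsqrt)
out = rsqrt_half(torch.rand(8, dtype=torch.float16) + 0.1)
assert out.dtype == torch.float16
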
test/test_reductions.expected

Lines changed: 90 additions & 0 deletions
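The expected journals below check that each op now compiles to a load, cast-to-fp32, libdevice/tl_math call, cast-back-to-fp16, store sequence. For orientation, the same pattern written by hand in Triton looks roughly like this (a sketch only; the kernel and pointer names are made up, not taken from the generated code):

import triton
import triton.language as tl
from torch._inductor.runtime.triton_compat import libdevice


@triton.jit
def _rsqrt_fp16_sketch(x_ptr, out_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask, other=0)           # fp16 values come in
    y = libdevice.rsqrt(x.to(tl.float32))              # math routine runs in fp32
    tl.store(out_ptr + offs, y.to(tl.float16), mask)   # cast back to fp16 on store
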
@@ -62,6 +62,96 @@ def reduce_kernel(x: torch.Tensor, fn: Callable[[torch.Tensor], torch.Tensor], o
     _launcher(_reduce_kernel_kernel, (n,), x, out, out.size(0), x.size(0), x.size(1), out.stride(0), x.stride(0), x.stride(1), _m, _REDUCTION_BLOCK_1, num_warps=4, num_stages=3)
     return out

+--- assertExpectedJournal(TestReductions.test_fp16_math_ops_fp32_fallback)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_compat import libdevice
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _rsqrt_fp16_kernel_kernel(x, result, x_size_0, result_stride_0, x_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    load = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    v_0 = load.to(tl.float32)
+    v_1 = libdevice.rsqrt(v_0)
+    v_2 = v_1.to(tl.float16)
+    tl.store(result + indices_0 * result_stride_0, v_2, mask_0)
+
+def rsqrt_fp16_kernel(x: torch.Tensor, *, _launcher=_default_launcher):
+    result = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 32
+    _launcher(_rsqrt_fp16_kernel_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, result, x.size(0), result.stride(0), x.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return result
+
+--- assertExpectedJournal(TestReductions.test_fp16_math_ops_fp32_fallback)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_helpers import math as tl_math
+from torch._inductor.runtime.triton_compat import libdevice
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _multi_math_ops_fp16_kernel_kernel(x, result, x_size_0, result_stride_0, result_stride_1, x_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    load = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    v_0 = load.to(tl.float32)
+    v_1 = libdevice.rsqrt(v_0)
+    v_2 = v_1.to(tl.float16)
+    tl.store(result + (indices_0 * result_stride_0 + 0 * result_stride_1), v_2, mask_0)
+    load_1 = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    v_3 = load_1.to(tl.float32)
+    v_4 = libdevice.sqrt(v_3)
+    v_5 = v_4.to(tl.float16)
+    tl.store(result + (indices_0 * result_stride_0 + 1 * result_stride_1), v_5, mask_0)
+    load_2 = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    v_6 = load_2.to(tl.float32)
+    v_7 = tl_math.sin(v_6)
+    v_8 = v_7.to(tl.float16)
+    tl.store(result + (indices_0 * result_stride_0 + 2 * result_stride_1), v_8, mask_0)
+    load_3 = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    v_9 = load_3.to(tl.float32)
+    v_10 = tl_math.cos(v_9)
+    v_11 = v_10.to(tl.float16)
+    tl.store(result + (indices_0 * result_stride_0 + 3 * result_stride_1), v_11, mask_0)
+    load_4 = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    v_12 = load_4.to(tl.float32)
+    v_13 = tl_math.log(v_12)
+    v_14 = v_13.to(tl.float16)
+    tl.store(result + (indices_0 * result_stride_0 + 4 * result_stride_1), v_14, mask_0)
+    load_5 = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    v_15 = load_5.to(tl.float32)
+    v_16 = libdevice.tanh(v_15)
+    v_17 = v_16.to(tl.float16)
+    tl.store(result + (indices_0 * result_stride_0 + 5 * result_stride_1), v_17, mask_0)
+    load_6 = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    v_18 = load_6.to(tl.float32)
+    v_19 = libdevice.log1p(v_18)
+    v_20 = v_19.to(tl.float16)
+    tl.store(result + (indices_0 * result_stride_0 + 6 * result_stride_1), v_20, mask_0)
+    load_7 = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    v_21 = load_7.to(tl.float32)
+    v_22 = tl_math.exp(v_21)
+    v_23 = v_22.to(tl.float16)
+    tl.store(result + (indices_0 * result_stride_0 + 7 * result_stride_1), v_23, mask_0)
+
+def multi_math_ops_fp16_kernel(x: torch.Tensor, *, _launcher=_default_launcher):
+    result = torch.empty([x.size(0), 8], dtype=x.dtype, device=x.device)
+    _BLOCK_SIZE_0 = 16
+    _launcher(_multi_math_ops_fp16_kernel_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, result, x.size(0), result.stride(0), result.stride(1), x.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return result
+
 --- assertExpectedJournal(TestReductions.test_fp16_var_mean)
 from __future__ import annotations

test/test_reductions.py

Lines changed: 101 additions & 0 deletions
@@ -230,6 +230,107 @@ def layer_norm_fwd_repro(
         self.assertExpectedJournal(code2)
         torch.testing.assert_close(result1, result2, rtol=1e-3, atol=1e-3)

+    def test_fp16_math_ops_fp32_fallback(self):
+        """Test that mathematical ops with fp16/bfloat16 inputs now work via fp32 fallback."""
+
+        @helion.kernel(use_default_config=True)
+        def rsqrt_fp16_kernel(x: torch.Tensor) -> torch.Tensor:
+            result = torch.empty_like(x)
+            for tile in hl.tile(x.size(0)):
+                # This should now work via fp32 fallback
+                result[tile] = torch.rsqrt(x[tile])
+            return result
+
+        @helion.kernel(use_default_config=True)
+        def multi_math_ops_fp16_kernel(x: torch.Tensor) -> torch.Tensor:
+            result = torch.empty([x.size(0), 8], dtype=x.dtype, device=x.device)
+            for tile in hl.tile(x.size(0)):
+                # Test multiple operations that have confirmed fallbacks
+                result[tile, 0] = torch.rsqrt(x[tile])
+                result[tile, 1] = torch.sqrt(x[tile])
+                result[tile, 2] = torch.sin(x[tile])
+                result[tile, 3] = torch.cos(x[tile])
+                result[tile, 4] = torch.log(x[tile])
+                result[tile, 5] = torch.tanh(x[tile])
+                result[tile, 6] = torch.log1p(x[tile])
+                result[tile, 7] = torch.exp(x[tile])
+            return result
+
+        # Test with float16 - should now succeed
+        x_fp16 = (
+            torch.abs(torch.randn([32], device=DEVICE, dtype=torch.float16)) + 0.1
+        )  # positive values for rsqrt
+
+        code, result = code_and_output(rsqrt_fp16_kernel, (x_fp16,))
+        self.assertExpectedJournal(code)
+
+        # Verify result is correct compared to PyTorch's rsqrt
+        expected = torch.rsqrt(x_fp16)
+        torch.testing.assert_close(result, expected, rtol=1e-3, atol=1e-3)
+
+        # Verify result maintains fp16 dtype
+        self.assertEqual(result.dtype, torch.float16)
+
+        # Test multiple math operations
+        x_multi = torch.abs(torch.randn([16], device=DEVICE, dtype=torch.float16)) + 0.1
+        code_multi, result_multi = code_and_output(
+            multi_math_ops_fp16_kernel, (x_multi,)
+        )
+        self.assertExpectedJournal(code_multi)
+
+        # Verify each operation's correctness
+        expected_rsqrt = torch.rsqrt(x_multi)
+        expected_sqrt = torch.sqrt(x_multi)
+        expected_sin = torch.sin(x_multi)
+        expected_cos = torch.cos(x_multi)
+        expected_log = torch.log(x_multi)
+        expected_tanh = torch.tanh(x_multi)
+        expected_log1p = torch.log1p(x_multi)
+        expected_exp = torch.exp(x_multi)
+
+        torch.testing.assert_close(
+            result_multi[:, 0], expected_rsqrt, rtol=1e-3, atol=1e-3
+        )
+        torch.testing.assert_close(
+            result_multi[:, 1], expected_sqrt, rtol=1e-3, atol=1e-3
+        )
+        torch.testing.assert_close(
+            result_multi[:, 2], expected_sin, rtol=1e-3, atol=1e-3
+        )
+        torch.testing.assert_close(
+            result_multi[:, 3], expected_cos, rtol=1e-3, atol=1e-3
+        )
+        torch.testing.assert_close(
+            result_multi[:, 4], expected_log, rtol=1e-3, atol=1e-3
+        )
+        torch.testing.assert_close(
+            result_multi[:, 5], expected_tanh, rtol=1e-3, atol=1e-3
+        )
+        torch.testing.assert_close(
+            result_multi[:, 6], expected_log1p, rtol=1e-3, atol=1e-3
+        )
+        torch.testing.assert_close(
+            result_multi[:, 7], expected_exp, rtol=1e-3, atol=1e-3
+        )
+
+        # Verify all results maintain fp16 dtype
+        self.assertEqual(result_multi.dtype, torch.float16)
+
+        # Test with bfloat16 if available
+        if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+            x_bf16 = (
+                torch.abs(torch.randn([32], device=DEVICE, dtype=torch.bfloat16)) + 0.1
+            )
+
+            code_bf16, result_bf16 = code_and_output(rsqrt_fp16_kernel, (x_bf16,))
+
+            # Verify bfloat16 result is correct
+            expected_bf16 = torch.rsqrt(x_bf16)
+            torch.testing.assert_close(result_bf16, expected_bf16, rtol=1e-2, atol=1e-2)
+
+            # Verify result maintains bfloat16 dtype
+            self.assertEqual(result_bf16.dtype, torch.bfloat16)
+

 if __name__ == "__main__":
     unittest.main()
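Outside the test harness, such a kernel can be invoked directly on an fp16 tensor; a minimal sketch assuming a CUDA device and the usual import helion.language as hl convention (the kernel here mirrors rsqrt_fp16_kernel from the test above and is not part of the diff):

import torch

import helion
import helion.language as hl


@helion.kernel(use_default_config=True)
def rsqrt_fp16(x: torch.Tensor) -> torch.Tensor:
    result = torch.empty_like(x)
    for tile in hl.tile(x.size(0)):
        # fp16 input now routes through the fp32 fallback lowering
        result[tile] = torch.rsqrt(x[tile])
    return result


x = torch.rand([1024], device="cuda", dtype=torch.float16) + 0.1  # keep inputs positive for rsqrt
y = rsqrt_fp16(x)
assert y.dtype == torch.float16
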

test/test_views.expected

Lines changed: 12 additions & 8 deletions
@@ -22,12 +22,14 @@ def _softmax_kernel(x, out, out_stride_0, out_stride_1, x_stride_0, x_stride_1,
     amax = tl.max(_mask_to, 1)
     amax_1 = amax[:, None]
     v_0 = values - amax_1
-    v_1 = tl_math.exp(v_0)
-    _mask_to_1 = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), v_1, 0)
+    v_1 = v_0.to(tl.float32)
+    v_2 = tl_math.exp(v_1)
+    v_3 = v_2.to(tl.float16)
+    _mask_to_1 = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), v_3, 0)
     sum_1 = tl.sum(_mask_to_1, 1)
     sum_exp = sum_1[None, :]
-    v_2 = v_1 / sum_exp
-    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_2, mask_1[None, :])
+    v_4 = v_3 / sum_exp
+    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_4, mask_1[None, :])

 def softmax(x: torch.Tensor, *, _launcher=_default_launcher):
     n, _m = x.size()

@@ -57,12 +59,14 @@ def _softmax_kernel(x, out, out_stride_0, out_stride_1, x_stride_0, x_stride_1,
     amax = tl.max(_mask_to, 1)
     amax_1 = tl.reshape(amax, [1, 1])
     v_0 = values - amax_1
-    v_1 = tl_math.exp(v_0)
-    _mask_to_1 = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), v_1, 0)
+    v_1 = v_0.to(tl.float32)
+    v_2 = tl_math.exp(v_1)
+    v_3 = v_2.to(tl.float16)
+    _mask_to_1 = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), v_3, 0)
     sum_1 = tl.sum(_mask_to_1, 1)
     sum_exp = tl.reshape(sum_1, [1, 1])
-    v_2 = v_1 / sum_exp
-    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_2, mask_1[None, :])
+    v_4 = v_3 / sum_exp
+    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_4, mask_1[None, :])

 def softmax(x: torch.Tensor, *, _launcher=_default_launcher):
     n, _m = x.size()
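These softmax journals are generated for a float16 input; they change only because exp now goes through the new fp32 fallback, which inserts the to(tl.float32)/to(tl.float16) pair around tl_math.exp. Numerically the updated kernel behaves roughly like this eager sketch (illustrative only, not code from the repo; reduction accumulation details are glossed over):

import torch


def softmax_fp16_sketch(x: torch.Tensor) -> torch.Tensor:
    # Subtract the row max, exponentiate in fp32, cast back to fp16, then normalize.
    amax = x.amax(dim=-1, keepdim=True)
    e = torch.exp((x - amax).to(torch.float32)).to(x.dtype)
    return e / e.sum(dim=-1, keepdim=True)
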
