Commit a52c88a

[STANDARD] Fix inf handling in tl.flip (#5447)
Fixes #5439. Currently we end up computing `0 * inf = nan` inside `tl.flip`; the fix is to bitcast to an integer type first, where `x * 0 == 0` always holds, and bitcast back to the original dtype at the end.
1 parent: e57b468
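
To make the failure mode concrete, here is a minimal NumPy sketch (not part of the commit; the values are illustrative) of why a multiply-by-0/1-and-sum selection breaks on `inf`, and why doing the same arithmetic on the integer bit pattern does not:

```python
import numpy as np

# tl.flip selects elements by multiplying with a 0/1 matrix and summing.
# With floats, that breaks as soon as an element is inf:
x = np.array([1.0, np.inf], dtype=np.float32)
keep_first = np.array([1.0, 0.0], dtype=np.float32)
print(x * keep_first)                    # [ 1. nan] -- inf * 0 is nan, which poisons the sum

# Bitcasting to a same-width integer type makes the same selection exact,
# because int * 0 == 0 and the kept element's bit pattern is untouched:
xi = x.view(np.int32)                    # reinterpret bits, no value conversion
sel = xi * keep_first.astype(np.int32)   # [1065353216, 0]
print(sel.view(np.float32))              # [1. 0.] -- no nan, bits recovered exactly
```

The patch below applies exactly this idea inside `tl.flip`: bitcast to a same-width integer type, do the selection arithmetic there, then bitcast back.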

File tree: 3 files changed, +32 -5 lines

python/test/unit/language/test_standard.py

Lines changed: 23 additions & 0 deletions
@@ -75,6 +75,29 @@ def flip_kernel(X, Z, N: tl.constexpr, M: tl.constexpr):
     assert (y == z).all(), (y, z)


+@pytest.mark.interpreter
+def test_flip_inf(device):
+    # Reproducer for https://github.com/triton-lang/triton/issues/5439
+
+    @triton.jit
+    def triton_flip_kernel(out_ptr, x_ptr, N: tl.constexpr):
+        pid = tl.program_id(0)
+        x = tl.load(x_ptr + pid * N + tl.arange(0, N))
+        shape: tl.constexpr = (N // 2, 2)
+        y = x.reshape(shape)
+        y = tl.flip(y, dim=1).reshape(x.shape)
+        tl.store(out_ptr + pid * N + tl.arange(0, N), y)
+
+    x = torch.arange(0, 16, device=device).unsqueeze(0).float()
+    x[:, -1] = float('inf')
+
+    expect = x.reshape(-1, 8, 2).flip(-1).reshape(-1, 16)
+    actual = torch.empty_like(x)
+    triton_flip_kernel[(x.shape[0], )](actual, x, x.shape[1])
+
+    torch.testing.assert_close(expect, actual)
+
+
 @pytest.mark.interpreter
 @pytest.mark.parametrize("size_i, size_j, size_g", [[5, 7, 3]])
 def test_swizzle2d(size_i, size_j, size_g, device):
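
For reference, the `expect` tensor the new test compares against swaps each adjacent pair while keeping the `inf` value intact; a quick standalone torch sketch of that reference computation (not part of the commit):

```python
import torch

x = torch.arange(0, 16).float().unsqueeze(0)
x[:, -1] = float('inf')
expect = x.reshape(-1, 8, 2).flip(-1).reshape(-1, 16)
print(expect)
# expected values: [[1., 0., 3., 2., 5., 4., 7., 6., 9., 8., 11., 10., 13., 12., inf, 14.]]
```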

python/triton/language/standard.py

Lines changed: 6 additions & 4 deletions
@@ -412,11 +412,13 @@ def flip(x, dim=None):
     """
     core.static_assert(_is_power_of_two(x.shape[_get_flip_dim(dim, x.shape)]))
     core.static_assert(_is_power_of_two(x.numel))
-    # # reshape the tensor to have all dimensions be 2.
-    # # TODO: We shouldn't have to change the dimensions not sorted.
+    # reshape the tensor to have all dimensions be 2.
+    # TODO: We shouldn't have to change the dimensions not sorted.
     steps: core.constexpr = _log2(x.numel)
     start: core.constexpr = _log2(x.numel) - _log2(x.shape[_get_flip_dim(dim, x.shape)])
-    y = core.reshape(x, [2] * steps)
+
+    idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
+    y = core.reshape(x.to(idtype, bitcast=True), [2] * steps)
     y = core.expand_dims(y, start)
     flip = (core.arange(0, 2)[:, None] == 1 - core.arange(0, 2))
     for i in core.static_range(start, steps):
@@ -425,7 +427,7 @@ def flip(x, dim=None):
             if j != i and j != i + 1:
                 flip2 = core.expand_dims(flip2, j)
         y = sum(y * flip2, i + 1, keep_dims=True)
-    x = core.reshape(y, x.shape)
+    x = core.reshape(y, x.shape).to(x.dtype, bitcast=True)
     return x

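To see what the patched `flip` effectively does for the common case of a trailing size-2 axis, here is a standalone NumPy sketch (the helper name and shapes are illustrative, not from the repo): the 0/1 permutation matrix is built the same way as `flip` in the diff above, but the multiply-and-sum runs on the `int32` bit pattern and the result is bitcast back to `float32`.

```python
import numpy as np

def flip_last2_via_bitcast(x):
    # Flip a trailing size-2 axis the way the patched tl.flip does:
    # build a 0/1 permutation matrix, multiply-and-sum on the integer
    # bit pattern, then bitcast back so inf/nan survive the zero terms.
    assert x.dtype == np.float32 and x.shape[-1] == 2
    perm = (np.arange(2)[:, None] == 1 - np.arange(2)).astype(np.int32)  # [[0, 1], [1, 0]]
    y = x.view(np.int32)                        # bitcast, not a value conversion
    flipped = (y[..., None, :] * perm).sum(-1)  # each output slot picks the swapped element
    return flipped.astype(np.int32).view(np.float32)

x = np.array([[1.0, np.inf], [2.0, 3.0]], dtype=np.float32)
print(flip_last2_via_bitcast(x))   # [[inf, 1.], [3., 2.]]
```

Because only zero or the original bit pattern ever reaches the sum, `inf` and `nan` payloads pass through unchanged.
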
python/triton/runtime/interpreter.py

Lines changed: 3 additions & 1 deletion
@@ -726,10 +726,12 @@ def check_tensor(self, input):
         self.check_axis(arg.shape, self.axis)

     def to_tensor(self, ret, dtype):
+        np_dtype = _get_np_dtype(dtype)
         if hasattr(ret, "shape") and ret.shape:
+            ret = ret.astype(np_dtype)
             ret_type = tl.block_type(dtype, list(ret.shape))
         else:
-            ret = np.array([ret]).astype(_get_np_dtype(dtype))
+            ret = np.array([ret], dtype=np_dtype)
             ret_type = dtype
         return tl.core.tensor(TensorHandle(ret, dtype.scalar), ret_type)
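
The interpreter change makes `to_tensor` cast array results to the NumPy dtype that matches the declared Triton dtype; previously only the scalar path did this. A minimal sketch of the kind of mismatch this guards against, assuming NumPy's default integer promotion rules (the values are illustrative):

```python
import numpy as np

# A reduction over int32 data is widened to the platform default integer,
# so the array handed back can have a different element size than declared.
y = np.array([1065353216, 0], dtype=np.int32)     # bit patterns of [1.0, 0.0]
s = y.sum(axis=0, keepdims=True)
print(s.dtype)                                    # typically int64, not int32

# Re-casting to the declared dtype keeps a later bitcast meaningful:
print(s.astype(np.int32).view(np.float32))        # [1.]
```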
