
Commit 8d99aa1

[FRONTEND] Allow arbitrary dim in tl.flip (#6853)
Currently, `tl.flip` only supports flipping along the last dimension. We can still use the same algorithm to flip any dimension as long as we tweak where the swaps begin/end.

# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [x] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent 2509898 commit 8d99aa1
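What the change enables, in kernel terms: `tl.flip` can now reverse a block along any axis, not just the last one. Below is a minimal sketch of the new capability; the kernel name and shapes are illustrative, a CUDA device is assumed, and the flipped dimension must still be a power of two.

```python
import torch
import triton
import triton.language as tl


@triton.jit
def flip_rows_kernel(X, Z, M: tl.constexpr, N: tl.constexpr):
    # Load an (M, N) block, flip it along dim 0 (previously rejected by the
    # assert in _get_flip_dim), and store the result.
    offs = tl.arange(0, M)[:, None] * N + tl.arange(0, N)[None, :]
    x = tl.load(X + offs)
    tl.store(Z + offs, tl.flip(x, 0))


x = torch.arange(32 * 16, device="cuda", dtype=torch.float32).reshape(32, 16)
z = torch.empty_like(x)
flip_rows_kernel[(1, )](x, z, 32, 16)
assert torch.equal(z, torch.flip(x, (0, )))
```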

File tree

2 files changed: +26 −24 lines changed


python/test/unit/language/test_standard.py

Lines changed: 15 additions & 13 deletions
```diff
@@ -65,24 +65,26 @@ def sort_kernel(X, stride_xm, Z, stride_zm, M: tl.constexpr, N: tl.constexpr, k:
 
 
 @pytest.mark.interpreter
-@pytest.mark.parametrize("M, N", [[1, 512], [8, 64], [256, 16], [512, 8]])
+@pytest.mark.parametrize("M, N, K", [[1, 16, 64], [8, 2, 256], [32, 1, 2], [128, 8, 1]])
 @pytest.mark.parametrize("dtype_str", ['int32', 'float16', 'float32', 'bfloat16'])
-def test_flip(M, N, dtype_str, device):
+@pytest.mark.parametrize("dim", [0, 1, 2, -2])
+def test_flip(M, N, K, dtype_str, dim, device):
 
     @triton.jit
-    def flip_kernel(X, Z, N: tl.constexpr, M: tl.constexpr):
-        offx = tl.arange(0, M)
-        offy = tl.arange(0, N) * M
-        off2d = offx[None, :] + offy[:, None]
-        x = tl.load(X + off2d)
-        x = tl.flip(x)
-        tl.store(Z + off2d, x)
-
-    x = numpy_random((N, M), dtype_str=dtype_str)
+    def flip_kernel(X, Z, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr, dim: tl.constexpr):
+        offx = tl.arange(0, M) * N * K
+        offy = tl.arange(0, N) * K
+        offz = tl.arange(0, K)
+        off3d = offx[:, None, None] + offy[None, :, None] + offz[None, None, :]
+        x = tl.load(X + off3d)
+        x = tl.flip(x, dim)
+        tl.store(Z + off3d, x)
+
+    x = numpy_random((M, N, K), dtype_str=dtype_str)
     x = torch.from_numpy(x).to(device)
-    y = torch.flip(x, (1, ))
+    y = torch.flip(x, (dim, ))
     z = torch.empty_like(x, device=device)
-    flip_kernel[(1, )](x, z, N, M, num_warps=8)
+    flip_kernel[(1, )](x, z, M, N, K, dim, num_warps=8)
     assert (y == z).all(), (y, z)
 
 
```
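As a side note on the updated test kernel: `offx`, `offy`, and `offz` broadcast into the row-major linear offsets of a contiguous `(M, N, K)` block, so `off3d` addresses every element exactly once. A quick NumPy check of that arithmetic (sizes picked arbitrarily for illustration):

```python
import numpy as np

M, N, K = 8, 2, 4
# Strides of a contiguous (M, N, K) array are (N*K, K, 1), so broadcasting
# the three 1-D ranges enumerates every row-major linear index once.
offx = np.arange(M) * N * K
offy = np.arange(N) * K
offz = np.arange(K)
off3d = offx[:, None, None] + offy[None, :, None] + offz[None, None, :]
assert np.array_equal(off3d, np.arange(M * N * K).reshape(M, N, K))
```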

python/triton/language/standard.py

Lines changed: 11 additions & 11 deletions
```diff
@@ -475,7 +475,8 @@ def _get_flip_dim(dim, shape):
     shape = core._unwrap_if_constexpr(shape)
     if dim is None:
         dim = len(shape) - 1
-    assert dim == len(shape) - 1, "Currently only support flipping the last dimension"
+    if dim < 0:  # flip doesn't work if dim < 0 because the xor-swap for loop will start/end at the wrong index
+        dim += len(shape)
     return core.constexpr(dim)
 
 
@@ -487,20 +488,19 @@ def flip(x, dim=None):
 
     :param x: the first input tensor
     :type x: Block
-    :param dim: the dimension to flip along (currently only final dimension supported)
+    :param dim: the dimension to flip along
     :type dim: int
     """
-    core.static_assert(_is_power_of_two(x.shape[_get_flip_dim(dim, x.shape)]))
-    core.static_assert(_is_power_of_two(x.numel))
-    # reshape the tensor to have all dimensions be 2.
-    # TODO: We shouldn't have to change the dimensions not sorted.
-    steps: core.constexpr = _log2(x.numel)
-    start: core.constexpr = _log2(x.numel) - _log2(x.shape[_get_flip_dim(dim, x.shape)])
+    core.static_assert(-len(x.shape) <= dim and dim < len(x.shape))
+    _dim: core.constexpr = _get_flip_dim(dim, x.shape)
+    core.static_assert(_is_power_of_two(x.shape[_dim]))
+    steps: core.constexpr = _log2(x.shape[_dim])
 
+    # reshape the swap dimension to (2, 2, ..., 2)
     idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
-    y = core.reshape(x.to(idtype, bitcast=True), [2] * steps)
-    for i in core.static_range(start, steps):
-        y = y ^ xor_sum(y, i, True)
+    y = core.reshape(x.to(idtype, bitcast=True), x.shape[:_dim] + core.tuple([2] * steps) + x.shape[_dim + 1:])
+    for i in core.static_range(steps):
+        y = y ^ xor_sum(y, _dim + i, True)
     x = core.reshape(y, x.shape).to(x.dtype, bitcast=True)
     return x
 
```
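For intuition about the updated implementation: `_get_flip_dim` now normalizes a negative `dim` (e.g. `dim = -2` on a 3-D shape becomes `1`) so the swap axes `_dim + i` are valid, and `flip` reshapes only the flipped axis, of length `2**steps`, into `steps` axes of size 2. Swapping the two halves along each of those axes complements every bit of the index along the original axis, which is exactly a reversal; the kernel performs each length-2 swap with an xor trick on the integer bit pattern. A rough NumPy emulation of the same scheme (the function name is illustrative, and `np.flip` stands in for the xor-based swap):

```python
import numpy as np


def flip_via_bit_axes(x, dim):
    # Split the flipped axis of length 2**steps into `steps` axes of size 2,
    # then swap the two halves along each of those axes; composing all the
    # swaps reverses the original axis.
    dim = dim % x.ndim                      # normalize negative dims
    steps = x.shape[dim].bit_length() - 1   # log2 of the flipped axis
    assert 1 << steps == x.shape[dim], "flipped axis must be a power of two"
    y = x.reshape(x.shape[:dim] + (2,) * steps + x.shape[dim + 1:])
    for i in range(steps):
        y = np.flip(y, axis=dim + i)        # one length-2 swap per axis
    return y.reshape(x.shape)


x = np.arange(2 * 8 * 3).reshape(2, 8, 3)
assert np.array_equal(flip_via_bit_axes(x, 1), np.flip(x, 1))
assert np.array_equal(flip_via_bit_axes(x, -2), np.flip(x, -2))
```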
