Commit 7efc694
1 parent 9487527 commit 7efc694

3 files changed: +42 −65 lines changed

python/test/unit/language/test_standard.py
Lines changed: 13 additions & 24 deletions

```diff
@@ -27,35 +27,24 @@ def test_maximum_minium(dtype, op, device):
 
 @pytest.mark.interpreter
 @pytest.mark.parametrize("M, N", [[1, 512], [8, 64], [256, 16], [512, 8]])
-@pytest.mark.parametrize("k", [None, 8])
 @pytest.mark.parametrize("descending", [False, True])
 @pytest.mark.parametrize("dtype_str", ['int32', 'float16', 'float32', 'bfloat16'])
-def test_sort(M, N, k, descending, dtype_str, device):
+def test_sort(M, N, descending, dtype_str, device):
 
     @triton.jit
-    def sort_kernel(X, stride_xm, Z, stride_zm, M: tl.constexpr, N: tl.constexpr, k: tl.constexpr,
-                    descending: tl.constexpr):
-        offs_m = tl.arange(0, M)
-        offs_x_n = tl.arange(0, N)
-        offs_z_n = offs_x_n if k is None else tl.arange(0, k)
-        offs_x = offs_m[:, None] * stride_xm + offs_x_n[None, :]
-        x = tl.load(X + offs_x)
-        if k is None:
-            z = tl.sort(x, descending=descending)
-        else:
-            z = tl.topk(x, k)
-        offs_z = offs_m[:, None] * stride_zm + offs_z_n[None, :]
-        tl.store(Z + offs_z, z)
-
-    z_shape = (M, N if k is None else k)
-    x = numpy_random((M, N), dtype_str=dtype_str)
+    def sort_kernel(X, Z, N: tl.constexpr, M: tl.constexpr, descending: tl.constexpr):
+        offx = tl.arange(0, M)
+        offy = tl.arange(0, N) * M
+        off2d = offx[None, :] + offy[:, None]
+        x = tl.load(X + off2d)
+        x = tl.sort(x, descending=descending)
+        tl.store(Z + off2d, x)
+
+    x = numpy_random((N, M), dtype_str=dtype_str)
     x = torch.from_numpy(x).to(device)
-    z = torch.empty(z_shape, dtype=x.dtype, device=x.device)
-    if k is None:
-        y = torch.sort(x, descending=descending)[0]
-    else:
-        y = torch.topk(x, k=k).values
-    sort_kernel[(1, )](x, x.stride(0), z, z.stride(0), M, N, k, descending, num_warps=8)
+    y = torch.sort(x, descending=descending)[0]
+    z = torch.empty_like(x)
+    sort_kernel[(1, )](x, z, N, M, descending, num_warps=8)
     assert (y == z).all(), (y, z)
```
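A side note on the rewritten kernel's addressing: it assumes a contiguous row-major (N, M) block, so strides no longer need to be passed in. A minimal NumPy sketch (not part of the commit; names are illustrative) of the same indexing:

```python
import numpy as np

N, M = 4, 8
offx = np.arange(M)        # offsets within one row
offy = np.arange(N) * M    # offset of each row start
off2d = offx[None, :] + offy[:, None]   # same construction as sort_kernel

x = np.arange(N * M)       # stand-in for the flat X buffer
# gathering with off2d is exactly a row-major (N, M) view of the buffer
assert (x[off2d] == x.reshape(N, M)).all()
```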

python/triton/language/__init__.py
Lines changed: 0 additions & 2 deletions

```diff
@@ -19,7 +19,6 @@
     sort,
     sum,
     swizzle2d,
-    topk,
     xor_sum,
     zeros,
     zeros_like,
@@ -253,7 +252,6 @@
     "sum",
     "swizzle2d",
     "tensor",
-    "topk",
     "trans",
     "tuple",
     "uint16",
```

python/triton/language/standard.py
Lines changed: 29 additions & 39 deletions

```diff
@@ -9,7 +9,7 @@
 
 def _log2(i: core.constexpr):
     log2 = 0
-    n = core.constexpr(i).value
+    n = i.value
     while n > 1:
         n >>= 1
         log2 += 1
```
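`_log2` is a compile-time helper: repeated halving counts floor(log2(n)). A plain-Python equivalent (illustrative only; the real helper operates on core.constexpr values, and the diff now assumes `i` is already one):

```python
def log2_floor(n: int) -> int:
    # count how many times n halves before reaching 1
    log2 = 0
    while n > 1:
        n >>= 1
        log2 += 1
    return log2

assert log2_floor(8) == 3 and log2_floor(512) == 9
```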
```diff
@@ -338,19 +338,20 @@ def cumprod(input, axis=0, reverse=False):
 def _compare_and_swap(x, flip, i: core.constexpr, n_dims: core.constexpr):
     n_outer: core.constexpr = x.numel >> n_dims
     shape: core.constexpr = [n_outer * 2**i, 2, 2**(n_dims - i - 1)]
-
-    # flip along middle dimension (the bitwise XORs will be optimised away):
+    y = core.reshape(x, shape)
+    # slice left/right with 'stride' 2**(n_dims - i - 1)
+    mask = core.arange(0, 2)[None, :, None]
+    left = core.broadcast_to(sum(y * (1 - mask), 1)[:, None, :], shape).to(y.dtype)
+    right = core.broadcast_to(sum(y * mask, 1)[:, None, :], shape).to(y.dtype)
+    left = core.reshape(left, x.shape)
+    right = core.reshape(right, x.shape)
+    # actual compare-and-swap
     idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
-    ix = core.reshape(x, shape).to(idtype, bitcast=True)
-    iy = ix ^ xor_sum(ix, 1, True)
-    y = core.reshape(iy.to(x.dtype, bitcast=True), x.shape)
-
-    # determines whether we are in the right (rather than left) position along the axis:
-    is_right = core.reshape(core.broadcast_to(core.arange(0, 2)[None, :, None], shape), x.shape)
-
-    # conditional swap:
-    ret = core.where((x > y) != (flip ^ is_right), y, x)
-    return ret
+    ileft = left.to(idtype, bitcast=True)
+    iright = right.to(idtype, bitcast=True)
+    ix = x.to(idtype, bitcast=True)
+    ret = ix ^ core.where((left > right) != flip, ileft ^ iright, zeros_like(ix))
+    return ret.to(x.dtype, bitcast=True)
 
 
 @jit
```
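A rough NumPy model of what one `_compare_and_swap` step computes (a sketch under assumed semantics, with n_outer = 1; the Triton version above performs the swap branchlessly via integer bitcasts and XOR):

```python
import numpy as np

def compare_and_swap(x, flip, i, n_dims):
    # partner elements sit 2**(n_dims - i - 1) apart, matching the reshape
    # [2**i, 2, 2**(n_dims - i - 1)] used above (with n_outer = 1)
    y = x.reshape(2**i, 2, 2**(n_dims - i - 1))
    left, right = y[:, 0:1, :], y[:, 1:2, :]
    swap = (left > right) != flip            # same condition as in the diff
    out = np.where(swap, y[:, ::-1, :], y)   # exchange the pair where needed
    return out.reshape(x.shape)

x = np.array([5, 1, 4, 8, 2, 7, 3, 6])
print(compare_and_swap(x, flip=False, i=2, n_dims=3))  # [1 5 4 8 2 7 3 6]
```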
```diff
@@ -361,14 +362,14 @@ def _bitonic_merge(x, stage: core.constexpr, order: core.constexpr, n_dims: core
     order_type 2 == alternating
     '''
     n_outer: core.constexpr = x.numel >> n_dims
+    core.static_assert(stage <= n_dims)
     # flip denotes whether to re-arrange sub-sequences of elements in ascending or
     # descending order.
     # if flip = 00000000... then all elements will be re-arranged ascendingly at this stage
     # if flip = 00110011... then all the elements will be re-arranged alternatingly (with
     # a stride of 2) at this stage
     if order == 2:
-        core.static_assert(stage <= (n_dims))
-        shape: core.constexpr = [n_outer * 2**(n_dims - 1 - stage), 2, 2**(stage)]
+        shape: core.constexpr = [n_outer * 2**(n_dims - 1 - stage), 2, 2**stage]
         flip = core.reshape(core.broadcast_to(core.arange(0, 2)[None, :, None], shape), x.shape)
     else:
         flip = order
```
```diff
@@ -378,47 +379,30 @@ def _bitonic_merge(x, stage: core.constexpr, order: core.constexpr, n_dims: core
     return x
 
 
+@core._tensor_member_fn
 @jit
-def sort_impl(x, k: core.constexpr = None, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
+def sort(x, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
     """
     Sorts a tensor along a specified dimension.
 
     :param x: The input tensor to be sorted.
     :type x: Tensor
     :param dim: The dimension along which to sort the tensor. If None, the tensor is sorted along the last dimension. Currently, only sorting along the last dimension is supported.
     :type dim: int, optional
-    :param k: the number of top elements to select. If none, assume k = x.shape[dim]
-    :type k: int, optional
     :param descending: If set to True, the tensor is sorted in descending order. If set to False, the tensor is sorted in ascending order.
     :type descending: bool, optional
     """
     # handle default dimension or check that it is the most minor dim
     _dim: core.constexpr = len(x.shape) - 1 if dim is None else dim
     core.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
     # iteratively run bitonic merge-sort steps
-    n_outer: core.constexpr = x.numel >> _log2(x.shape[_dim])
-    log_n: core.constexpr = _log2(x.shape[_dim])
-    log_k: core.constexpr = log_n if k is None else _log2(k)
-    for i in core.static_range(1, log_k + 1):
-        x = _bitonic_merge(x, i, 2 if i < log_n else descending, log_n)
-    # select top k elements using bitonic top-k
-    # https://www.doc.ic.ac.uk/~hlgr/pdfs/MassivelyParallelTopK.pdf
-    for i in core.static_range(log_k + 1, log_n + 1):
-        x = core.reshape(x, [n_outer * 2**(log_n - i), 2, 2**log_k])
-        x = max(x, axis=1) if descending else min(x, axis=1)
-        x = core.reshape(x, [n_outer, 2**(log_n - i + log_k)])
-        x = _bitonic_merge(x, log_k, 2 if i < log_n else descending, _log2(x.shape[_dim]))
+    n_dims: core.constexpr = _log2(x.shape[_dim])
+    for i in core.static_range(1, n_dims + 1):
+        x = _bitonic_merge(x, i, 2 if i < n_dims else descending, n_dims)
     return x
 
 
-@jit
-def sort(x, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
-    return sort_impl(x, dim=dim, descending=descending)
-
-
-@jit
-def topk(x, k: core.constexpr, dim: core.constexpr = None):
-    return sort_impl(x, k=k, dim=dim, descending=True)
+# flip
 
 
 def _get_flip_dim(dim, shape):
```
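The restored `sort` therefore runs `_log2(N)` merge stages, alternating direction (order == 2) at every stage except the last, which honors `descending`. A tiny illustrative script (hypothetical, merely mirroring the loop above) prints the schedule for an 8-element row:

```python
n_dims = 3       # _log2(8)
descending = 0   # final-stage order: 0 ascending, 1 descending
for i in range(1, n_dims + 1):
    order = 2 if i < n_dims else descending
    print(f"_bitonic_merge(x, stage={i}, order={order}, n_dims={n_dims})")
```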
```diff
@@ -450,8 +434,14 @@ def flip(x, dim=None):
 
     idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
     y = core.reshape(x.to(idtype, bitcast=True), [2] * steps)
+    y = core.expand_dims(y, start)
+    flip = (core.arange(0, 2)[:, None] == 1 - core.arange(0, 2))
     for i in core.static_range(start, steps):
-        y = y ^ xor_sum(y, i, True)
+        flip2 = flip
+        for j in core.static_range(0, steps + 1):
+            if j != i and j != i + 1:
+                flip2 = core.expand_dims(flip2, j)
+        y = sum(y * flip2, i + 1, keep_dims=True, dtype=y.dtype)
     x = core.reshape(y, x.shape).to(x.dtype, bitcast=True)
     return x
```
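The new `flip` body replaces the xor_sum trick with sums against the 2x2 exchange matrix built by `(core.arange(0, 2)[:, None] == 1 - core.arange(0, 2))`. A minimal NumPy check (an illustrative sketch, not the library code) of why multiplying along an axis by [[0, 1], [1, 0]] and summing reverses that axis:

```python
import numpy as np

exchange = (np.arange(2)[:, None] == 1 - np.arange(2)).astype(int)  # [[0,1],[1,0]]
y = np.arange(4).reshape(2, 2)  # a [2] * steps block with steps = 2
# sum_b y[b, c] * exchange[a, b] picks out y[1 - a, c]: axis 0 reversed
reversed_axis0 = (y[None, :, :] * exchange[:, :, None]).sum(axis=1)
assert (reversed_axis0 == y[::-1, :]).all()
```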
