Use xor-swap trick to simplify tl.sort and tl.flip (triton-lang#6486)

apgoucher · web-flow · commit 191ece36089e · 2025-04-14T21:05:55.000+01:00
This improves the runtime of an internal radix sort benchmark by 25%.
diff --git a/python/triton/language/standard.py b/python/triton/language/standard.py
@@ -338,20 +338,19 @@ def cumprod(input, axis=0, reverse=False):
 def _compare_and_swap(x, flip, i: core.constexpr, n_dims: core.constexpr):
     n_outer: core.constexpr = x.numel >> n_dims
     shape: core.constexpr = [n_outer * 2**i, 2, 2**(n_dims - i - 1)]
-    y = core.reshape(x, shape)
-    # slice left/right with 'stride' 2**(n_dims - i - 1)
-    left, right = core.split(core.permute(y, (0, 2, 1)))
-    left = core.reshape(core.broadcast_to(left[:, None, :], shape), x.shape)
-    right = core.reshape(core.broadcast_to(right[:, None, :], shape), x.shape)
-    left = left.to(y.dtype)
-    right = right.to(y.dtype)
-    # actual compare-and-swap
+
+    # flip along middle dimension (the bitwise XORs will be optimised away):
     idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
-    ileft = left.to(idtype, bitcast=True)
-    iright = right.to(idtype, bitcast=True)
-    ix = x.to(idtype, bitcast=True)
-    ret = ix ^ core.where((left > right) != flip, ileft ^ iright, zeros_like(ix))
-    return ret.to(x.dtype, bitcast=True)
+    ix = core.reshape(x, shape).to(idtype, bitcast=True)
+    iy = ix ^ xor_sum(ix, 1, True)
+    y = core.reshape(iy.to(x.dtype, bitcast=True), x.shape)
+
+    # determines whether we are in the right (rather than left) position along the axis:
+    is_right = core.reshape(core.broadcast_to(core.arange(0, 2)[None, :, None], shape), x.shape)
+
+    # conditional swap:
+    ret = core.where((x > y) != (flip ^ is_right), y, x)
+    return ret
 
 
 @jit
@@ -451,14 +450,8 @@ def flip(x, dim=None):
 
     idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
     y = core.reshape(x.to(idtype, bitcast=True), [2] * steps)
-    y = core.expand_dims(y, start)
-    flip = (core.arange(0, 2)[:, None] == 1 - core.arange(0, 2))
     for i in core.static_range(start, steps):
-        flip2 = flip
-        for j in core.static_range(0, steps + 1):
-            if j != i and j != i + 1:
-                flip2 = core.expand_dims(flip2, j)
-        y = sum(y * flip2, i + 1, keep_dims=True, dtype=y.dtype)
+        y = y ^ xor_sum(y, i, True)
     x = core.reshape(y, x.shape).to(x.dtype, bitcast=True)
     return x