Support sorting single-element tensors (#8040)

saagarjha · web-flow · commit 1d004a9deeb3 · 2025-09-03T22:02:21.000+01:00
As per #6769, reshape([]) creates a scalar rather than a tensor. This
breaks the sorting algorithm for single-element inputs (where n_dims is 0),
so special-case this situation by reshaping to [1] instead.
diff --git a/python/test/unit/language/test_standard.py b/python/test/unit/language/test_standard.py
@@ -26,7 +26,7 @@ def test_maximum_minium(dtype, op, device):
 
 
 @pytest.mark.interpreter
-@pytest.mark.parametrize("M, N", [[1, 512], [8, 64], [256, 16], [512, 8]])
+@pytest.mark.parametrize("M, N", [[1, 1], [1, 512], [8, 64], [256, 16], [512, 8]])
 @pytest.mark.parametrize("k", [None, 8])
 @pytest.mark.parametrize("descending", [False, True])
 @pytest.mark.parametrize("dtype_str", ['int32', 'float16', 'float32', 'bfloat16'])
@@ -40,7 +40,7 @@ def sort_kernel(X, stride_xm, Z, stride_zm, M: tl.constexpr, N: tl.constexpr, k:
         offs_z_n = offs_x_n if k is None else tl.arange(0, k)
         offs_x = offs_m[:, None] * stride_xm + offs_x_n[None, :]
         x = tl.load(X + offs_x)
-        if k is None:
+        if k is None or x.numel < k:
             z = tl.sort(x, descending=descending)
         else:
             z = tl.topk(x, k)
@@ -51,7 +51,7 @@ def sort_kernel(X, stride_xm, Z, stride_zm, M: tl.constexpr, N: tl.constexpr, k:
     x = numpy_random((M, N), dtype_str=dtype_str)
     x = torch.from_numpy(x).to(device)
     z = torch.empty(z_shape, dtype=x.dtype, device=x.device)
-    if k is None:
+    if k is None or x.numel() < k:
         y = torch.sort(x, descending=descending)[0]
     else:
         y = torch.topk(x, k=k).values
diff --git a/python/triton/language/standard.py b/python/triton/language/standard.py
@@ -441,7 +441,7 @@ def sort_impl(x, k: core.constexpr = None, dim: core.constexpr = None, descendin
     n_dims: core.constexpr = _log2(x.numel)
 
     # reshape to hypercube:
-    h = core.reshape(x, [2] * n_dims)
+    h = core.reshape(x, [2] * n_dims if n_dims else [1])
 
     # run first log_k bitonic sort iterations:
     for i in core.static_range(1, log_k + 1):