stable reverse

xaellison · xaellison · commit 30db77e9e680 · 2021-11-07T15:20:03.000-05:00
diff --git a/src/bitonic_sort/gpu.jl b/src/bitonic_sort/gpu.jl
@@ -64,27 +64,47 @@ end
     return lo, n
 end
 
+# Compare `a` and `b` with `lt`; when `R` is true the operands are swapped, which
+# reverses the resulting order.
+@inline function rev_lt(a :: T, b :: T, lt, rev :: Val{R}) where {T,R}
+    # `R` is the type parameter of `rev`, not a runtime field.
+    lhs, rhs = R ? (b, a) : (a, b)
+    return lt(lhs, rhs)
+end
+
+# Order `(key, index)` pairs. With `R` true the keys are compared in reverse, but equal
+# keys are still ordered by ascending index.
+@inline function rev_lt(a :: Tuple{T, J}, b :: Tuple{T, J}, lt, rev :: Val{R}) where {T, J, R}
+    R || return lt(a, b)  # forward order: `lt` is applied to the whole tuples
+    lt(b[1], a[1]) && return true   # reversed: `a` precedes `b` when `b`'s key is smaller
+    lt(a[1], b[1]) && return false
+    # Neither key precedes the other, so they tie. Testing the tie with `lt` instead of
+    # `==` keeps NaN keys (never `==`) in index order and stops `isless` from treating
+    # -0.0 and 0.0 (which are `==`) as equal.
+    return a[2] < b[2]
+end
+
 # Functions specifically for "large" bitonic steps (those that cannot use shmem)
 
 
-@inline function compare!(vals::AbstractArray{T}, i1::I, i2::I, dir::Bool, by, lt) where {T,I<:Integer}
+@inline function compare!(vals::AbstractArray{T}, i1::I, i2::I, dir::Bool, by, lt, rev) where {T,I}
     i1′, i2′ = i1 + one(I), i2 + one(I)
-    @inbounds if dir != lt(by(vals[i1′]), by(vals[i2′]))
+    @inbounds if dir != rev_lt(by(vals[i1′]), by(vals[i2′]), lt, rev)
         vals[i1′], vals[i2′] = vals[i2′], vals[i1′]
     end
 end
 
-@inline function compare!(vals_inds::Tuple, i1::I, i2::I, dir::Bool, by, lt) where I
+@inline function compare!(vals_inds::Tuple, i1::I, i2::I, dir::Bool, by, lt, rev) where I
     i1′, i2′ = i1 + one(I), i2 + one(I)
     vals, inds = vals_inds
     # comparing tuples of (value, index) guarantees stability of sort
-    @inbounds if dir != lt((by(vals[inds[i1′]]), inds[i1′]), (by(vals[inds[i2′]]), inds[i2′]))
+    @inbounds if dir != rev_lt((by(vals[inds[i1′]]), inds[i1′]), (by(vals[inds[i2′]]), inds[i2′]), lt, rev)
         inds[i1′], inds[i2′] = inds[i2′], inds[i1′]
     end
 end
 
 
-@inline function get_range_part1(n::I, index::I, k::I)::Tuple{I,I,Bool} where {I<:Integer}
+@inline function get_range_part1(n::I, index::I, k::I)::Tuple{I,I,Bool} where I
     lo = zero(I)
     dir = true
     for iter = one(I):k-one(I)
@@ -130,7 +150,7 @@ Note that to avoid synchronization issues, only one thread from each pair of
 indices being swapped will actually move data. This does mean half of the threads
 do nothing, but it works for non-power2 arrays while allowing direct indexing.
 """
-function comparator_kernel(vals, length_vals::I, k::I, j::I, by::F1, lt::F2) where {I,F1,F2}
+function comparator_kernel(vals, length_vals::I, k::I, j::I, by::F1, lt::F2, rev) where {I,F1,F2}
     index = (blockDim().x * (blockIdx().x - one(I))) + threadIdx().x - one(I)
 
     lo, n, dir = get_range(length_vals, index, k, j)
@@ -139,7 +159,7 @@ function comparator_kernel(vals, length_vals::I, k::I, j::I, by::F1, lt::F2) whe
         m = gp2lt(n)
         if lo <= index < lo + n - m
             i1, i2 = index, index + m
-            @inbounds compare!(vals, i1, i2, dir, by, lt)
+            @inbounds compare!(vals, i1, i2, dir, by, lt, rev)
         end
     end
     return
@@ -148,18 +168,18 @@ end
 
 # Functions for "small" bitonic steps (those that can use shmem)
 
-@inline function compare_small!(vals::AbstractArray{T}, i1::I, i2::I, dir::Bool, by, lt) where {T,I<:Integer}
+@inline function compare_small!(vals::AbstractArray{T}, i1::I, i2::I, dir::Bool, by, lt, rev) where {T,I}
     i1′, i2′ = i1 + one(I), i2 + one(I)
-    @inbounds if dir != lt(by(vals[i1′]), by(vals[i2′]))
+    @inbounds if dir != rev_lt(by(vals[i1′]), by(vals[i2′]), lt, rev)
         vals[i1′], vals[i2′] = vals[i2′], vals[i1′]
     end
 end
 
-@inline function compare_small!(vals_inds::Tuple, i1::I, i2::I, dir::Bool, by, lt) where I
+@inline function compare_small!(vals_inds::Tuple, i1::I, i2::I, dir::Bool, by, lt, rev) where I
     i1′, i2′ = i1 + one(I), i2 + one(I)
     vals, inds = vals_inds
     # comparing tuples of (value, index) guarantees stability of sort
-    @inbounds if dir != lt((by(vals[i1′]), inds[i1′]), (by(vals[i2′]), inds[i2′]))
+    @inbounds if dir != rev_lt((by(vals[i1′]), inds[i1′]), (by(vals[i2′]), inds[i2′]), lt, rev)
         vals[i1′], vals[i2′] = vals[i2′], vals[i1′]
         inds[i1′], inds[i2′] = inds[i2′], inds[i1′]
     end
@@ -172,7 +192,7 @@ all threads perform swaps accessible using shmem.
 
 Various negative exit values just for debugging.
 """
-@inline function block_range(n::I, block_index::I, k::I, j::I)::Tuple{I,I,Bool} where {I<:Integer}
+@inline function block_range(n::I, block_index::I, k::I, j::I)::Tuple{I,I,Bool} where I
     lo = zero(I)
     dir = true
     tmp = block_index * two(I)
@@ -236,7 +256,7 @@ array. Each view is indexed along block x dim: one view per pseudo-block
     vals_inds::Tuple{AbstractArray{T},AbstractArray{J}},
     index,
     in_range,
-) where {T,J<:Integer}
+) where {T,J}
     # NB: I tried creating both shmem arrays with `initialize_shmem!`
     # but the behavior changed - maybe it's necessary to alloc both before
     # writing to either?
@@ -284,7 +304,7 @@ This is captured by `pseudo_block_idx`.
 Note that this moves the array values copied within shmem, but doesn't copy them
 back to global the way it does for indices.
 """
-function comparator_small_kernel(c, length_c::I, k::I, j_0::I, j_f::I, by::F1, lt::F2) where {I,F1,F2}
+function comparator_small_kernel(c, length_c::I, k::I, j_0::I, j_f::I, by::F1, lt::F2, rev) where {I,F1,F2}
     pseudo_block_idx = (blockIdx().x - one(I)) * blockDim().y + threadIdx().y - one(I)
     # immutable info about the range used by this kernel
     _lo, _n, dir = block_range(length_c, pseudo_block_idx, k, j_0)
@@ -301,7 +321,7 @@ function comparator_small_kernel(c, length_c::I, k::I, j_0::I, j_f::I, by::F1, l
             m = gp2lt(n)
             if lo <= index < lo + n - m
                 i1, i2 = index - _lo, index - _lo + m
-                compare_small!(swap, i1, i2, dir, by, lt)
+                compare_small!(swap, i1, i2, dir, by, lt, rev)
             end
         end
         lo, n = bisect_range(index, lo, n)
@@ -322,7 +342,13 @@ function bitonic_shmem(c, threads)
     return prod(threads) * sum(map(a -> sizeof(eltype(a)), c))
 end
 
-function bitonic_sort!(c; by = identity, lt = isless) where {T}
+"""
+Run a bitonic sort on `c`, which is either a CuArray of values (as for `sort!`) or a
+tuple of a value array and an index array (as for `sortperm!`). The `sort!` form is
+not stable; the `sortperm!` form is. To reverse the order, pass `rev=true` rather
+than a negated `lt` such as `lt=!isless`, which breaks the stability of `sortperm!`.
+"""
+function bitonic_sort!(c; by = identity, lt = isless, rev=false) where {T}
     c_len = if typeof(c) <: Tuple
         length(c[1])
     else
@@ -341,12 +367,12 @@ function bitonic_sort!(c; by = identity, lt = isless) where {T}
         for j = 1:j_final
 
             # use Int32 args for indexing --> ~10% faster kernels
-            args1 = (c, map(Int32, (c_len, k, j, j_final))..., by, lt)
+            args1 = (c, map(Int32, (c_len, k, j, j_final))..., by, lt, Val(rev))
             kernel1 = @cuda launch = false comparator_small_kernel(args1...)
             config1 = launch_configuration(kernel1.fun, shmem = threads -> bitonic_shmem(c, threads))
             threads1 = prevpow(2, config1.threads)
 
-            args2 = (c, map(Int32, (c_len, k, j))..., by, lt)
+            args2 = (c, map(Int32, (c_len, k, j))..., by, lt, Val(rev))
             kernel2 = @cuda launch = false comparator_kernel(args2...)
             config2 = launch_configuration(kernel2.fun, shmem = threads -> bitonic_shmem(c, threads))
             threads2 = prevpow(2, config2.threads)
@@ -372,10 +398,3 @@ function bitonic_sort!(c; by = identity, lt = isless) where {T}
         end
     end
 end
-
-#a = rand(Float32, 1_000_000)
-#c = CuArray(a)
-#I = CuArray(collect(1:length(c)))
-#bitonic_sort!((c, I))
-#synchronize()
-#@assert c[I] |> Array == sort(a)