clean up initialize_shmem + docstrings

xaellison · xaellison · commit bf45a4b4592a · 2021-11-07T15:20:03.000-05:00
diff --git a/src/sorting.jl b/src/sorting.jl
@@ -503,10 +503,11 @@ shared memory. It provides a moderate speedup.
 Notation:
 `k`, `j` denote the level of the sorting network (equivalently, recursion depth).
 `vals` is the array of values of type `T` that is either being `sort`-ed or `sortperm`-ed.
-`inds` is an array of indices of type `J` that gets permuted in `sortperm!`.
+`inds` is an array of indices of type `J` that gets permuted in `sortperm!` (standard 1-indexing).
 `i1`, `i2` index either `vals` or `inds` depending on the operation.
 `lo`, `n`, and `m` are integers of type `I` used to denote/calculate ranges as
-    described in the recursive algorithm link above.
+    described in the recursive algorithm link above. Note these follow the 0-indexing
+    convention from the above source.
 """
 module BitonicSort
 export bitonic_sort!
@@ -729,6 +730,7 @@ end
 """
 For sort/sort! `c`, allocate and return shared memory view of `c`
 Each view is indexed along block x dim: one view per pseudo-block
+`index` is expected to be from a 0-indexing context
 """
 @inline function initialize_shmem!(vals::AbstractArray{T}, index::I, in_range, offset = zero(I)) where {T,I}
     swap = CuDynamicSharedArray(T, (blockDim().x, blockDim().y), offset)
@@ -741,26 +743,21 @@ end
 
 """
 For sortperm/sortperm!, allocate and return shared memory views of `c` and index
-array. Each view is indexed along block x dim: one view per pseudo-block
+array. Each view is indexed along block x dim: one view per pseudo-block.
+`index` is expected to be from a 0-indexing context, but the indices stored in
+`vals_inds` are expected to be 1-indexed.
 """
 @inline function initialize_shmem!(vals_inds::Tuple{AbstractArray{T},AbstractArray{J}}, index, in_range) where {T,J}
-    # NB: I tried creating both shmem arrays with `initialize_shmem!`
-    # but the behavior changed - maybe it's necessary to alloc both before
-    # writing to either?
-    offset = prod(blockDim()) * sizeof(T)
+    offset = prod(blockDim()) * sizeof(J)
     vals, inds = vals_inds
-    swap_vals = CuDynamicSharedArray(T, (blockDim().x, blockDim().y))
-    inds_view = initialize_shmem!(inds, index, in_range, offset)
-    vals_view = @view swap_vals[:, threadIdx().y]
-    if in_range
-        @inbounds vals_view[threadIdx().x] = vals[inds_view[threadIdx().x]]
-    end
-    sync_threads()
+    inds_view = initialize_shmem!(inds, index, in_range)
+    vals_view = initialize_shmem!(vals, inds_view[threadIdx().x] - one(J), in_range, offset)
     return vals_view, inds_view
 end
 
 """
 For sort/sort!, copy shmem view `swap` back into global array `c`
+`index` is expected to be from a 0-indexing context
 """
 @inline function finalize_shmem!(vals::AbstractArray, swap::AbstractArray, index::I, in_range::Bool) where {I}
     if in_range
@@ -770,6 +767,8 @@ end
 
 """
 For sortperm/sortperm!, copy shmem view `swap` back to global index array
+`index` is expected to be from a 0-indexing context, but the indices stored in
+`vals_inds` are expected to be 1-indexed.
 """
 @inline function finalize_shmem!(vals_inds::Tuple, swap::Tuple, index, in_range::Bool)
     vals, inds = vals_inds