JuliaGPU
diff --git a/‎Project.toml‎
Lines changed: 1 addition & 1 deletion b/‎Project.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 2 additions & 1 deletion b/‎README.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎docs/src/api/sort.md‎
Lines changed: 3 additions & 0 deletions b/‎docs/src/api/sort.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎docs/src/api/task_partition.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/src/api/task_partition.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/src/assets/banner.png‎
-105 KB b/‎docs/src/assets/banner.png‎
-105 KB
diff --git a/‎docs/src/assets/logo.png‎
535 KB b/‎docs/src/assets/logo.png‎
535 KB
diff --git a/‎prototype/parallel_sample_sort/Project.toml‎
Lines changed: 11 additions & 0 deletions b/‎prototype/parallel_sample_sort/Project.toml‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎prototype/parallel_sample_sort/ak_test.jl‎
Lines changed: 12 additions & 0 deletions b/‎prototype/parallel_sample_sort/ak_test.jl‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎prototype/parallel_sample_sort/mwe.jl‎
Lines changed: 64 additions & 0 deletions b/‎prototype/parallel_sample_sort/mwe.jl‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎prototype/parallel_sample_sort/mwe2.jl‎
Lines changed: 171 additions & 0 deletions b/‎prototype/parallel_sample_sort/mwe2.jl‎
Lines changed: 171 additions & 0 deletions
@@ -1,7 +1,7 @@
 name = "AcceleratedKernels"
 uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 authors = ["Andrei-Leonard Nicusan <[email protected]> and contributors"]
-version = "0.3.3"
+version = "0.3.4"
 
 [deps]
 ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
 
@@ -1,4 +1,4 @@
-[![AcceleratedKernels.jl](https://github.com/juliagpu/AcceleratedKernels.jl/blob/main/docs/src/assets/banner.png?raw=true)](https://juliagpu.github.io/AcceleratedKernels.jl)
+[![AcceleratedKernels.jl](https://github.com/juliagpu/AcceleratedKernels.jl/blob/main/docs/src/assets/logo.png?raw=true)](https://juliagpu.github.io/AcceleratedKernels.jl)
 
 *"We need more speed" - Lightning McQueen or Scarface, I don't know*
 
@@ -232,6 +232,7 @@ If you need other algorithms in your work that may be of general use, please ope
 | [General Looping](https://juliagpu.github.io/AcceleratedKernels.jl/stable/api/foreachindex/) | `foreachindex`, `foraxes`                        | `Kokkos::parallel_for` `RAJA::forall` `thrust::transform` |
 | [Mapping](https://juliagpu.github.io/AcceleratedKernels.jl/stable/api/map/) | `map` `map!`                                     | `thrust::transform`                                       |
 | [Sorting](https://juliagpu.github.io/AcceleratedKernels.jl/stable/api/sort/) | `sort` `sort!`                                   | `sort` `sort_team` `stable_sort`                          |
+|                                               | `sample_sort!` `sample_sortperm!`                |                                                           |
 |                                               | `merge_sort` `merge_sort!`                       |                                                           |
 |                                               | `merge_sort_by_key` `merge_sort_by_key!`         | `sort_team_by_key`                                        |
 |                                               | `sortperm` `sortperm!`                           | `sort_permutation` `index_permutation`                    |
 
@@ -14,12 +14,15 @@ AcceleratedKernels.sortperm
 ```
 
 Specific implementations that the interfaces above forward to:
+- `sample_sort!` - multithreaded CPU sample sort, deferring to `Base.sort!` on independent slices.
 - `merge_sort!` (in-place), `merge_sort` (out-of-place) - sort arbitrary objects with custom comparisons.
 - `merge_sort_by_key!`, `merge_sort_by_key` - sort a vector of keys along with a "payload", a vector of corresponding values.
 - `merge_sortperm!`, `merge_sortperm`, `merge_sortperm_lowmem!`, `merge_sortperm_lowmem` - compute a sorting index permutation. 
 
 Function signatures:
 ```@docs
+AcceleratedKernels.sample_sort!
+AcceleratedKernels.sample_sortperm!
 AcceleratedKernels.merge_sort!
 AcceleratedKernels.merge_sort
 AcceleratedKernels.merge_sort_by_key!
 
@@ -3,4 +3,5 @@
 ```@docs
 AcceleratedKernels.TaskPartitioner
 AcceleratedKernels.task_partition
+AcceleratedKernels.itask_partition
 ```
@@ -0,0 +1,11 @@
+[deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
+AllocCheck = "9b6a8646-10ed-4001-bbdc-1d2f46dfbb1a"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
+PProf = "e4faabce-9ead-11e9-39d9-4379958e3056"
+Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+SyncBarriers = "3986aa12-c984-439b-887a-f8545bea0e93"
@@ -0,0 +1,12 @@
+
+import AcceleratedKernels as AK
+using Random
+Random.seed!(0)
+
+# Smoke test: in-place sort of a large vector with many duplicate keys
+v = rand(1:100, 1_000_000)
+AK.sort!(v)
+# Explicit check rather than `@assert`, which Julia documents as possibly disabled at some
+# optimization levels
+issorted(v) || error("AK.sort! did not produce a sorted vector")
+
+# Smoke test: sorting permutation of a fresh vector
+v = rand(1:100, 1_000_000)
+ix = AK.sortperm(v)
+issorted(v[ix]) || error("AK.sortperm did not produce a sorting permutation")
@@ -0,0 +1,64 @@
+using BenchmarkTools
+
+
+# @check_allocs ignore_throw=false
+"""
+    sample_sort_histogram!(v, splitters, histograms, itask, irange)
+
+Count into which bucket each element `v[i]`, `i in irange`, falls and accumulate the counts
+in column `itask` of `histograms`. Buckets are delimited by `splitters`, which must be
+sorted for `searchsortedlast` to be meaningful. Returns `nothing`.
+
+Indexing is done under `@inbounds`, so `irange` must lie within `v` and `histograms` must
+have at least `length(splitters) + 1` rows and `itask` columns.
+"""
+function sample_sort_histogram!(
+    v::AbstractVector{T},
+    splitters::Vector{T},
+    histograms::Matrix{Int},
+    itask, irange,
+) where T
+    @inbounds for idx in irange
+        # Bucket `b` collects values with splitters[b - 1] <= value < splitters[b]
+        bucket = searchsortedlast(splitters, v[idx]) + 1
+        histograms[bucket, itask] += 1
+    end
+    return nothing
+end
+
+
+"""
+    sample_sort_parallel!(v, splitters, histograms, max_tasks)
+
+Split the indices of `v` into `max_tasks` contiguous chunks, spawn one task per chunk that
+runs `sample_sort_histogram!` on it, and block until every task has finished. Task `itask`
+writes only to column `itask` of `histograms`. Returns `nothing`.
+"""
+function sample_sort_parallel!(v, splitters, histograms, max_tasks)
+    n = length(v)
+
+    # Launch one histogram task per contiguous chunk of indices
+    tasks = map(1:max_tasks) do itask
+        first_index = div((itask - 1) * n, max_tasks) + 1
+        last_index = div(itask * n, max_tasks)
+        Threads.@spawn sample_sort_histogram!(
+            v,
+            splitters, histograms,
+            itask, first_index:last_index,
+        )
+    end
+
+    # Block until all of them are done
+    foreach(wait, tasks)
+
+    return nothing
+end
+
+
+"""
+    sample_sort!(v; max_tasks=Threads.nthreads())
+
+Minimal working example of the histogram stage of a sample sort: build `max_tasks - 1`
+evenly spaced splitters strictly inside `[0, 1]` and count, per task, how many elements of
+`v` land in each bucket. `v` is not reordered; the result of `sample_sort_parallel!`
+(`nothing`) is returned.
+
+The splitters are `Float64`, and `sample_sort_histogram!` requires `v` to share that
+element type.
+"""
+function sample_sort!(
+    v;
+    max_tasks=Threads.nthreads(),
+)
+    # Interior points of an even grid on [0, 1]: max_tasks - 1 splitters, max_tasks buckets
+    grid = range(0, 1, length=max_tasks + 1)
+    splitters = collect(grid[2:end-1])
+
+    # One column per task; the extra rows are padding to avoid false sharing
+    histograms = zeros(Int, max_tasks + 8, max_tasks)
+
+    return sample_sort_parallel!(v, splitters, histograms, max_tasks)
+end
+
+
+v = rand(1_000_000)
+
+# Interpolate the non-constant global with `$` so the benchmark times the call itself
+# rather than the dynamic dispatch caused by an untyped global variable
+@benchmark sample_sort!($v)
+
@@ -0,0 +1,171 @@
+
+using StaticArrays
+using SyncBarriers
+using BenchmarkTools
+import AcceleratedKernels as AK
+
+
+using AllocCheck
+
+using Random
+Random.seed!(0)
+
+
+
+
+# @check_allocs ignore_throw=false
+"""
+    _sample_sort_histogram!(v, splitters, histograms, itask, irange)
+
+Tally into column `itask` of `histograms` the bucket of every element `v[i]`, `i in irange`.
+The bucket is one plus the position returned by `AK._searchsortedlast` over the full extent
+of `splitters`, always compared with `isless`. Returns `nothing`.
+"""
+function _sample_sort_histogram!(v, splitters, histograms, itask, irange)
+    last_splitter = length(splitters)
+    for idx in irange
+        # NOTE(review): presumably behaves like `Base.searchsortedlast` restricted to
+        # `1:last_splitter` - confirm against the AcceleratedKernels implementation
+        pos = AK._searchsortedlast(splitters, v[idx], 1, last_splitter, isless)
+        histograms[pos + 1, itask] += 1
+    end
+    return nothing
+end
+
+
+# @check_allocs ignore_throw=false
+"""
+    _sample_sort_parallel!(v, dest, comp, splitters, histograms, max_tasks)
+
+Run the histogram stage of the sample sort: `AK.itask_partition` invokes a closure with
+`(itask, irange)` pairs, and the closure forwards each pair to `_sample_sort_histogram!`.
+`dest` and `comp` are accepted but not used by this stage yet. Returns `nothing`.
+"""
+function _sample_sort_parallel!(
+    v, dest, comp,
+    splitters, histograms,
+    max_tasks,
+)
+    # Work done for one partition: fill that task's histogram column
+    tally!(itask, irange) = _sample_sort_histogram!(
+        v,
+        splitters, histograms,
+        itask, irange,
+    )
+
+    # NOTE(review): the trailing `1` looks like a minimum number of elements per task -
+    # confirm against `AK.itask_partition`
+    AK.itask_partition(tally!, length(v), max_tasks, 1)
+    return nothing
+end
+
+
+
+"""
+    sample_sort!(
+        v;
+        max_tasks=Threads.nthreads(),
+        lt=isless, by=identity, rev=nothing, order=Base.Order.Forward,
+        temp=nothing,
+    )
+
+Prototype of a multithreaded sample sort over `v`.
+
+Vectors with fewer than two elements are returned as they are. If `max_tasks == 1` or `v`
+holds fewer than `4 * max_tasks` elements, the call defers to `sort!` and returns the
+sorted `v`.
+
+Otherwise only the first stages are implemented so far: `4 * max_tasks` evenly spaced
+samples are copied into the scratch buffer and sorted, `max_tasks - 1` splitters are picked
+from them, and the per-task bucket histograms are computed. The scratch buffer is then
+returned; `v` itself is not reordered on this path.
+
+# Keywords
+- `max_tasks`: number of tasks (and buckets) to split the work into.
+- `lt`, `by`, `rev`, `order`: ordering options, forwarded to `sort!` and `Base.Order.ord`.
+- `temp`: optional scratch buffer used instead of `similar(v)`; it must have the same
+  length as `v`.
+
+# Throws
+- `DimensionMismatch` if `temp` is given and `length(temp) != length(v)`.
+"""
+function sample_sort!(
+    v;
+    max_tasks=Threads.nthreads(),
+
+    lt=isless,
+    by=identity,
+    rev::Union{Bool, Nothing}=nothing,
+    order::Base.Order.Ordering=Base.Order.Forward,
+
+    temp=nothing
+)
+
+    oversampling_factor = 4
+    num_elements = length(v)
+
+    if num_elements < 2
+        return v
+    end
+
+    if max_tasks == 1 || num_elements < oversampling_factor * max_tasks
+        return sort!(v, lt=lt, by=by, rev=rev, order=order)
+    end
+
+    # Create a temporary buffer for the sorted output
+    if temp === nothing
+        dest = similar(v)
+    else
+        # The buffer is written under `@inbounds` below, so a short one must be rejected
+        # up front rather than silently written out of bounds
+        length(temp) == num_elements || throw(DimensionMismatch(
+            "`temp` must have the same length as `v` " *
+            "(got $(length(temp)), expected $num_elements)"
+        ))
+        dest = temp
+    end
+
+    # Construct comparator
+    ord = Base.Order.ord(lt, by, rev, order)
+    comp = (x, y) -> Base.Order.lt(ord, x, y)
+
+    # Take equally spaced samples, save them in dest (1-based indexing of `v` and `dest`
+    # is assumed here)
+    num_samples = oversampling_factor * max_tasks
+    isamples = IntLinSpace(1, num_elements, num_samples)
+    @inbounds for i in 1:num_samples
+        dest[i] = v[isamples[i]]
+    end
+
+    # Sort samples and choose splitters: every `oversampling_factor`-th sorted sample
+    sort!(view(dest, 1:num_samples), lt=lt, by=by, rev=rev, order=order)
+    splitters = Vector{eltype(v)}(undef, max_tasks - 1)
+    for i in 1:(max_tasks - 1)
+        splitters[i] = dest[div(i * num_samples, max_tasks)]
+    end
+
+    # Pre-allocate histogram for each task; each column is exclusive to the task
+    histograms = zeros(Int, max_tasks + 8, max_tasks)       # Add padding to avoid false sharing
+
+    # Run threaded region
+    _sample_sort_parallel!(
+        v, dest, comp,
+        splitters, histograms,
+        max_tasks,
+    )
+
+    return dest
+end
+
+
+
+
+
+# Utilities
+
+
+# Create an integer linear space between start and stop on demand
+"""
+    IntLinSpace(start::Integer, stop::Integer, length::Integer)
+
+Lazily represent `length` integers spread evenly from `start` to `stop` (both included).
+Elements are computed on demand by `getindex` rather than stored; interior elements are
+rounded up to the next integer.
+
+Throws an `ArgumentError` if `start > stop` or `length < 2`.
+"""
+struct IntLinSpace{T <: Integer}
+    start::T
+    stop::T
+    length::T
+
+    # Validate in an inner constructor: with only an outer constructor, calls whose three
+    # arguments share one type dispatch to the more specific default constructor and skip
+    # the checks entirely
+    function IntLinSpace{T}(
+        start::Integer, stop::Integer, length::Integer,
+    ) where {T <: Integer}
+        start <= stop || throw(ArgumentError("`start` must be <= `stop`"))
+        length >= 2 || throw(ArgumentError("`length` must be >= 2"))
+
+        return new{T}(start, stop, length)
+    end
+end
+
+# The element type is taken from `start`; `stop` and `length` are converted to it
+function IntLinSpace(start::Integer, stop::Integer, length::Integer)
+    return IntLinSpace{typeof(start)}(start, stop, length)
+end
+
+Base.IndexStyle(::IntLinSpace) = IndexLinear()
+Base.length(ils::IntLinSpace) = ils.length
+
+Base.firstindex(::IntLinSpace) = 1
+Base.lastindex(ils::IntLinSpace) = ils.length
+
+# Return the `i`-th point; the endpoints are returned exactly, without arithmetic
+function Base.getindex(ils::IntLinSpace, i)
+    @boundscheck 1 <= i <= ils.length || throw(BoundsError(ils, i))
+
+    if i == 1
+        return ils.start
+    elseif i == ils.length
+        # Compare against the field, not the `length` function; returning `stop` directly
+        # also avoids overflow in the product below for the last element
+        return ils.stop
+    else
+        return ils.start + cld((i - 1) * (ils.stop - ils.start), ils.length - 1)
+    end
+end
+
+
+
+
+
+
+
+
+v = rand(Float32, 100_000)
+
+# Assign the result of the `try` expression so `temp` becomes a global usable by the check
+# further down, instead of a variable local to the `try` block
+temp = try
+    sample_sort!(v)
+catch e
+    # Show the first wrapped error when the exception carries a collection of them; fall
+    # back to the exception itself so this handler cannot fail on a missing field and hide
+    # the real error
+    inner = if hasproperty(e, :errors)
+        e.errors[1]
+    elseif e isa CompositeException
+        first(e.exceptions)
+    else
+        e
+    end
+    display(inner)
+    # `rethrow()` without arguments keeps the original backtrace
+    rethrow()
+end
+
+
+t = @timed sample_sort!(v)
+
+
+# @assert issorted(temp)
+# println("sorted")
+
+
+# display(@benchmark sort!(v) setup=(v=rand(Float64, 10_000_000)))
+display(@benchmark sample_sort!(v) setup=(v=rand(Float64, 100_000)))