Skip to content

Commit afce0d2

Browse files
authored
Merge pull request #34 from JuliaGPU/sample_sort
Multithreaded CPU sample sort
2 parents 0b99fbf + 146fda8 commit afce0d2

22 files changed

+1334
-104
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "AcceleratedKernels"
22
uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
33
authors = ["Andrei-Leonard Nicusan <[email protected]> and contributors"]
4-
version = "0.3.3"
4+
version = "0.3.4"
55

66
[deps]
77
ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
[![AcceleratedKernels.jl](https://github.com/juliagpu/AcceleratedKernels.jl/blob/main/docs/src/assets/banner.png?raw=true)](https://juliagpu.github.io/AcceleratedKernels.jl)
1+
[![AcceleratedKernels.jl](https://github.com/juliagpu/AcceleratedKernels.jl/blob/main/docs/src/assets/logo.png?raw=true)](https://juliagpu.github.io/AcceleratedKernels.jl)
22

33
*"We need more speed" - Lightning McQueen or Scarface, I don't know*
44

@@ -232,6 +232,7 @@ If you need other algorithms in your work that may be of general use, please ope
232232
| [General Looping](https://juliagpu.github.io/AcceleratedKernels.jl/stable/api/foreachindex/) | `foreachindex`, `foraxes` | `Kokkos::parallel_for` `RAJA::forall` `thrust::transform` |
233233
| [Mapping](https://juliagpu.github.io/AcceleratedKernels.jl/stable/api/map/) | `map` `map!` | `thrust::transform` |
234234
| [Sorting](https://juliagpu.github.io/AcceleratedKernels.jl/stable/api/sort/) | `sort` `sort!` | `sort` `sort_team` `stable_sort` |
235+
| | `sample_sort!` `sample_sortperm!` | |
235236
| | `merge_sort` `merge_sort!` | |
236237
| | `merge_sort_by_key` `merge_sort_by_key!` | `sort_team_by_key` |
237238
| | `sortperm` `sortperm!` | `sort_permutation` `index_permutation` |

docs/src/api/sort.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,15 @@ AcceleratedKernels.sortperm
1414
```
1515

1616
Specific implementations that the interfaces above forward to:
17+
- `sample_sort!` - multithreaded CPU sample sort, deferring to `Base.sort!` on independent slices.
1718
- `merge_sort!` (in-place), `merge_sort` (out-of-place) - sort arbitrary objects with custom comparisons.
1819
- `merge_sort_by_key!`, `merge_sort_by_key` - sort a vector of keys along with a "payload", a vector of corresponding values.
1920
- `merge_sortperm!`, `merge_sortperm`, `merge_sortperm_lowmem!`, `merge_sortperm_lowmem` - compute a sorting index permutation.
2021

2122
Function signatures:
2223
```@docs
24+
AcceleratedKernels.sample_sort!
25+
AcceleratedKernels.sample_sortperm!
2326
AcceleratedKernels.merge_sort!
2427
AcceleratedKernels.merge_sort
2528
AcceleratedKernels.merge_sort_by_key!

docs/src/api/task_partition.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@
33
```@docs
44
AcceleratedKernels.TaskPartitioner
55
AcceleratedKernels.task_partition
6+
AcceleratedKernels.itask_partition
67
```

docs/src/assets/banner.png

-105 KB
Binary file not shown.

docs/src/assets/logo.png

535 KB
Loading
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[deps]
2+
AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
3+
AllocCheck = "9b6a8646-10ed-4001-bbdc-1d2f46dfbb1a"
4+
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
5+
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
6+
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
7+
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
8+
PProf = "e4faabce-9ead-11e9-39d9-4379958e3056"
9+
Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
10+
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
11+
SyncBarriers = "3986aa12-c984-439b-887a-f8545bea0e93"
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
2+
import AcceleratedKernels as AK
3+
using Random
4+
Random.seed!(0)
5+
6+
v = rand(1:100, 1_000_000)
7+
AK.sort!(v)
8+
@assert issorted(v)
9+
10+
v = rand(1:100, 1_000_000)
11+
ix = AK.sortperm(v)
12+
@assert issorted(v[ix])
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
using BenchmarkTools
2+
3+
4+
# @check_allocs ignore_throw=false
5+
"""
    sample_sort_histogram!(v, splitters, histograms, itask, irange)

Count how many of the elements `v[i]`, `i in irange`, fall into each bucket delimited by
`splitters`, adding the counts to column `itask` of `histograms`.

The bucket of an element is `1 + searchsortedlast(splitters, element)`, so `splitters` is
expected to be sorted and `histograms` needs at least `length(splitters) + 1` rows.
Indexing is done under `@inbounds`; the caller is responsible for passing valid indices.

Returns `nothing`.
"""
function sample_sort_histogram!(
    v::AbstractVector{T},
    splitters::Vector{T},
    histograms::Matrix{Int},
    itask, irange,
) where T
    @inbounds for idx in irange
        element = v[idx]

        # Bucket 1 holds everything below the first splitter, hence the offset of one
        bucket = searchsortedlast(splitters, element) + 1

        # Only column `itask` is touched by this call
        histograms[bucket, itask] += 1
    end

    return nothing
end
27+
28+
29+
"""
    sample_sort_parallel!(v, splitters, histograms, max_tasks)

Split the indices of `v` into `max_tasks` contiguous chunks and spawn one task per chunk,
each running `sample_sort_histogram!` on its chunk and writing to its own column `itask`
of `histograms`. Blocks until every spawned task has finished.

Returns `nothing`.
"""
function sample_sort_parallel!(v, splitters, histograms, max_tasks)
    num_elements = length(v)
    tasks = Vector{Task}(undef, max_tasks)

    # Launch one histogram task per contiguous chunk of `v`
    for itask in 1:max_tasks
        chunk_start = div((itask - 1) * num_elements, max_tasks) + 1
        chunk_stop = div(itask * num_elements, max_tasks)
        chunk = chunk_start:chunk_stop

        tasks[itask] = Threads.@spawn sample_sort_histogram!(
            v, splitters, histograms, itask, chunk,
        )
    end

    # Wait for the tasks in launch order
    foreach(wait, tasks)

    return nothing
end
49+
50+
51+
"""
    sample_sort!(v; max_tasks=Threads.nthreads())

Benchmark prototype of the histogram stage only: builds `max_tasks - 1` equally spaced
splitters strictly between 0 and 1, allocates a zeroed per-task histogram matrix and runs
`sample_sort_parallel!` on `v`. The histogram matrix is local to this call.

Returns whatever `sample_sort_parallel!` returns.
"""
function sample_sort!(
    v;
    max_tasks=Threads.nthreads(),
)
    # Drop the two end points (0 and 1) of the equally spaced grid; the rest are splitters
    grid = range(0, 1, length=max_tasks + 1)
    splitters = Vector(grid[2:end-1])

    # One column per task; the 8 extra rows are padding to avoid false sharing
    histograms = zeros(Int, max_tasks + 8, max_tasks)

    return sample_sort_parallel!(v, splitters, histograms, max_tasks)
end
59+
60+
61+
# Benchmark input: one million uniformly distributed Float64 values
v = rand(1_000_000)

# Interpolate the non-constant global with `$` so that the benchmark measures the call
# itself rather than the overhead of accessing an untyped global variable
@benchmark sample_sort!($v)
64+
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
2+
using StaticArrays
3+
using SyncBarriers
4+
using BenchmarkTools
5+
import AcceleratedKernels as AK
6+
7+
8+
using AllocCheck
9+
10+
using Random
11+
Random.seed!(0)
12+
13+
14+
15+
16+
# @check_allocs ignore_throw=false
17+
"""
    _sample_sort_histogram!(v, splitters, histograms, itask, irange)

For every index in `irange`, locate the bucket of `v[i]` among `splitters` via
`AK._searchsortedlast` (searching positions `1:length(splitters)` with `isless`) and
increment the matching entry in column `itask` of `histograms`.

Returns `nothing`.
"""
function _sample_sort_histogram!(v, splitters, histograms, itask, irange)
    num_splitters = length(splitters)

    for idx in irange
        # NOTE(review): presumably mirrors Base.searchsortedlast on the given index
        # window - confirm against the AK implementation
        pos = AK._searchsortedlast(splitters, v[idx], 1, num_splitters, isless)

        # Buckets are numbered one past the splitter position
        histograms[pos + 1, itask] += 1
    end

    return nothing
end
24+
25+
26+
# @check_allocs ignore_throw=false
27+
"""
    _sample_sort_parallel!(v, dest, comp, splitters, histograms, max_tasks)

Threaded region of the sample sort prototype. Currently only the histogram stage exists:
`AK.itask_partition` is called over `length(v)` elements with `max_tasks` tasks and invokes
`_sample_sort_histogram!` with each `(itask, irange)` pair it provides.

`dest` and `comp` are accepted but not used yet.

Returns `nothing`.
"""
function _sample_sort_parallel!(
    v, dest, comp,
    splitters, histograms,
    max_tasks,
)
    # Per-task work: accumulate the bucket counts of one index range into column `itask`
    count_chunk!(itask, irange) = _sample_sort_histogram!(
        v, splitters, histograms, itask, irange,
    )

    AK.itask_partition(count_chunk!, length(v), max_tasks, 1)

    return nothing
end
42+
43+
44+
45+
"""
    sample_sort!(v; max_tasks=Threads.nthreads(), lt=isless, by=identity,
                 rev=nothing, order=Base.Order.Forward, temp=nothing)

Work-in-progress multithreaded sample sort.

# Keywords
- `max_tasks`: number of tasks to partition the work into; must be >= 1.
- `lt`, `by`, `rev`, `order`: ordering options, forwarded to `sort!` and `Base.Order.ord`.
- `temp`: optional scratch buffer used instead of `similar(v)`; must hold at least
  `length(v)` elements.

# Returns
- `v` itself when it has fewer than two elements.
- The result of `sort!(v; ...)` when `max_tasks == 1` or `v` has fewer than
  `4 * max_tasks` elements.
- Otherwise the scratch buffer (`temp` or a fresh `similar(v)`). At this stage only the
  splitter selection and the per-task histograms are implemented, so the buffer holds the
  sorted samples in its leading entries rather than a sorted copy of `v`.

# Throws
- `ArgumentError` if `max_tasks < 1` or `temp` is too short.
"""
function sample_sort!(
    v;
    max_tasks=Threads.nthreads(),

    lt=isless,
    by=identity,
    rev::Union{Bool, Nothing}=nothing,
    order::Base.Order.Ordering=Base.Order.Forward,

    temp=nothing
)
    oversampling_factor = 4
    num_elements = length(v)

    # Nothing to sort
    if num_elements < 2
        return v
    end

    # Fail early with a clear message instead of a late array-dimension error
    max_tasks >= 1 || throw(ArgumentError("`max_tasks` must be >= 1, got $max_tasks"))

    # Too little work to be worth sampling and partitioning
    if max_tasks == 1 || num_elements < oversampling_factor * max_tasks
        return sort!(v, lt=lt, by=by, rev=rev, order=order)
    end

    # Create a temporary buffer for the sorted output
    if temp === nothing
        dest = similar(v)
    else
        # The buffer is written under `@inbounds` below, so its size must be validated
        length(temp) >= num_elements || throw(ArgumentError(
            "`temp` must hold at least $num_elements elements, got $(length(temp))"
        ))
        dest = temp
    end

    # Construct comparator
    ord = Base.Order.ord(lt, by, rev, order)
    comp = (x, y) -> Base.Order.lt(ord, x, y)

    # Take equally spaced samples, save them in dest
    num_samples = oversampling_factor * max_tasks
    isamples = IntLinSpace(1, num_elements, num_samples)
    @inbounds for i in 1:num_samples
        dest[i] = v[isamples[i]]
    end

    # Sort samples and choose every `oversampling_factor`-th one as a splitter
    sort!(view(dest, 1:num_samples), lt=lt, by=by, rev=rev, order=order)
    splitters = Vector{eltype(v)}(undef, max_tasks - 1)
    for i in 1:(max_tasks - 1)
        splitters[i] = dest[div(i * num_samples, max_tasks)]
    end

    # Pre-allocate histogram for each task; each column is exclusive to the task
    histograms = zeros(Int, max_tasks + 8, max_tasks)   # Add padding to avoid false sharing

    # Run threaded region
    _sample_sort_parallel!(
        v, dest, comp,
        splitters, histograms,
        max_tasks,
    )

    return dest
end
106+
107+
108+
109+
110+
111+
# Utilities
112+
113+
114+
# Create an integer linear space between start and stop on demand
115+
"""
    IntLinSpace(start::Integer, stop::Integer, length::Integer)

Lazy, indexable sequence of `length` integers going from `start` to `stop` (both
included); elements are computed on demand in `getindex`, nothing is stored.

Throws `ArgumentError` if `start > stop` or `length < 2`.
"""
struct IntLinSpace{T <: Integer}
    start::T
    stop::T
    length::T

    # Validation lives in an inner constructor: with only an outer
    # `(::Integer, ::Integer, ::Integer)` method, the auto-generated `(::T, ::T, ::T)`
    # constructor is more specific and would silently skip the checks.
    function IntLinSpace{T}(start, stop, length) where {T <: Integer}
        start <= stop || throw(ArgumentError("`start` must be <= `stop`"))
        length >= 2 || throw(ArgumentError("`length` must be >= 2"))

        return new{T}(start, stop, length)
    end
end

# The element type is taken from `start`; `stop` and `length` are converted to it
function IntLinSpace(start::Integer, stop::Integer, length::Integer)
    return IntLinSpace{typeof(start)}(start, stop, length)
end

Base.IndexStyle(::IntLinSpace) = IndexLinear()
Base.length(ils::IntLinSpace) = ils.length

Base.firstindex(::IntLinSpace) = 1
Base.lastindex(ils::IntLinSpace) = ils.length

"""
    getindex(ils::IntLinSpace, i)

Return the `i`-th element: `start` for `i == 1`, `stop` for `i == length(ils)`, and
otherwise `start` plus the proportional offset rounded up to the next integer.
Throws `BoundsError` when `i` is outside `1:length(ils)` (unless bounds checks are elided).
"""
function Base.getindex(ils::IntLinSpace, i)
    @boundscheck 1 <= i <= ils.length || throw(BoundsError(ils, i))

    if i == 1
        return ils.start
    elseif i == ils.length      # compare against the field, not the `length` function
        return ils.stop
    else
        return ils.start + div((i - 1) * (ils.stop - ils.start), ils.length - 1, RoundUp)
    end
end
145+
146+
147+
148+
149+
150+
151+
152+
153+
v = rand(Float32, 100_000)
154+
155+
# Run once up front so that a failure inside the threaded region is shown in readable form
try
    temp = sample_sort!(v)
catch e
    # Show the first wrapped error when the exception carries an `errors` collection;
    # otherwise show the exception itself, so that a missing field cannot mask the failure
    cause = (hasproperty(e, :errors) && !isempty(e.errors)) ? e.errors[1] : e
    display(cause)

    # Argument-less form re-raises the current exception with its original backtrace
    rethrow()
end
161+
162+
163+
t = @timed sample_sort!(v)
164+
165+
166+
# @assert issorted(temp)
167+
# println("sorted")
168+
169+
170+
# display(@benchmark sort!(v) setup=(v=rand(Float64, 10_000_000)))
171+
display(@benchmark sample_sort!(v) setup=(v=rand(Float64, 100_000)))

0 commit comments

Comments
 (0)