Commit 4cf8253

Copy-paste CUDA benchmarks
1 parent d6ca5c2 commit 4cf8253

File tree

10 files changed, +726 −0 lines changed


perf/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
results.json
reference.json

perf/Project.toml

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

perf/array.jl

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
group = addgroup!(SUITE, "array")

const m = 512
const n = 1000

# generate some arrays
cpu_mat = rand(rng, Float32, m, n)
gpu_mat = CuArray{Float32}(undef, size(cpu_mat))
gpu_vec = reshape(gpu_mat, length(gpu_mat))
gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
gpu_mat_ints = CuArray(rand(rng, Int, m, n))
gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
gpu_mat_bools = CuArray(rand(rng, Bool, m, n))
gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))

group["construct"] = @benchmarkable CuArray{Int}(undef, 1)

group["copy"] = @async_benchmarkable copy($gpu_mat)

gpu_mat2 = copy(gpu_mat)
let group = addgroup!(group, "copyto!")
    group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
    group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
    group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
end

let group = addgroup!(group, "iteration")
    group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10]

    group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]

    let group = addgroup!(group, "findall")
        group["bool"] = @benchmarkable findall($gpu_vec_bools)
        group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
    end

    let group = addgroup!(group, "findfirst")
        group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
        group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
    end

    let group = addgroup!(group, "findmin") # findmax
        group["1d"] = @async_benchmarkable findmin($gpu_vec)
        group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
    end
end

let group = addgroup!(group, "reverse")
    group["1d"] = @async_benchmarkable reverse($gpu_vec)
    group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
    group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
    group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
end

group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0

# no need to test the in-place version; it performs the same operation, just without the alloc
let group = addgroup!(group, "accumulate")
    group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
    group["2d"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
end

let group = addgroup!(group, "reductions")
    let group = addgroup!(group, "reduce")
        group["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
        group["2d"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
    end

    let group = addgroup!(group, "mapreduce")
        group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
        group["2d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
    end

    # used by sum, prod, minimum, maximum, all, any, count
end

let group = addgroup!(group, "random")
    let group = addgroup!(group, "rand")
        group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n)
        group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n)
    end

    let group = addgroup!(group, "rand!")
        group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec)
        group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints)
    end

    let group = addgroup!(group, "randn")
        group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n)
        #group["Int64"] = @async_benchmarkable CUDA.randn(Int64, m*n)
    end

    let group = addgroup!(group, "randn!")
        group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec)
        #group["Int64"] = @async_benchmarkable CUDA.randn!($gpu_vec_ints)
    end
end

let group = addgroup!(group, "sorting")
    group["1d"] = @async_benchmarkable sort($gpu_vec)
    group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
    group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
end

let group = addgroup!(group, "permutedims")
    group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
    group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
    group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
end
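
A note on two names used above but not defined in this diff: `rng` is presumably a StableRNG provided by the harness (StableRNGs is a declared dependency), and `@async_benchmarkable` must likewise come from the suite's driver. A plausible minimal definition of the latter, assuming it merely wraps the expression in CUDA.@sync so that asynchronous GPU work is included in the timing:

# Sketch only -- not part of this commit; the real definition lives in the harness.
macro async_benchmarkable(ex...)
    quote
        @benchmarkable CUDA.@sync $(ex...)
    end
end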

perf/byval.jl

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
module ByVal

using CUDA, BenchmarkTools, Random
using CUDA: i32

const threads = 256

# simple matrix-addition kernel
function kernel_add_mat(n, x1, x2, y)
    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
    if i <= n
        @inbounds y[i] = x1[i] + x2[i]
    end
    return
end

@inline get_inputs3(indx_y, a, b, c) = (a, b, c)
@inline get_inputs3(indx_y, a1, a2, b1, b2, c1, c2) = indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2)
@inline get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3) = indx_y == 1 ? (a1, b1, c1) : indx_y == 2 ? (a2, b2, c2) : (a3, b3, c3)

# kernel adding arrays of matrices
function kernel_add_mat_z_slices(n, vararg...)
    x1, x2, y = get_inputs3(blockIdx().y, vararg...)
    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
    if i <= n
        @inbounds y[i] = x1[i] + x2[i]
    end
    return
end

function add_z_slices!(y, x1, x2)
    m1, n1 = size(x1[1]) # get size of first slice
    blocks = (m1 * n1 + threads - 1) ÷ threads
    # launch length(x1) times the blocks needed for a single slice; blockIdx().y selects the slice
    @cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
end

function add!(y, x1, x2)
    m1, n1 = size(x1)
    blocks = (m1 * n1 + threads - 1) ÷ threads
    @cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
end

function main()
    results = BenchmarkGroup()

    num_z_slices = 3
    Random.seed!(1)

    #m, n = 7, 5       # tiny, to measure overhead
    #m, n = 521, 111
    #m, n = 1521, 1111
    #m, n = 3001, 1511 # prime numbers to test memory access correctness
    m, n = 3072, 1536  # multiple of 256
    #m, n = 6007, 3001 # prime numbers to test memory access correctness

    x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
    x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
    y1 = [similar(x1[1]) for i = 1:num_z_slices]

    # reference: a bare-bones add on the GPU
    results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])

    # adding arrays in an array
    for slices = 1:num_z_slices
        results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
    end

    # BenchmarkTools captures inputs (JuliaCI/BenchmarkTools.jl#127), so forcibly free them
    CUDA.unsafe_free!.(x1)
    CUDA.unsafe_free!.(x2)
    CUDA.unsafe_free!.(y1)

    return results
end

end

ByVal.main()
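
One detail worth spelling out: the `blocks` computation in `add_z_slices!` and `add!` is integer ceiling division, which Julia also provides as `cld`. A quick check with the sizes used above:

# (m1 * n1 + threads - 1) ÷ threads rounds up, guaranteeing blocks * threads >= m1 * n1
threads = 256
m1, n1 = 3072, 1536
@assert (m1 * n1 + threads - 1) ÷ threads == cld(m1 * n1, threads) == 18432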

perf/kernel.jl

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
using CUDA: i32

group = addgroup!(SUITE, "kernel")

group["launch"] = @benchmarkable @cuda identity(nothing)

group["occupancy"] = @benchmarkable begin
    kernel = @cuda launch=false identity(nothing)
    launch_configuration(kernel.fun)
end

src = CUDA.rand(Float32, 512, 1000)
dest = similar(src)
function indexing_kernel(dest, src)
    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
    @inbounds dest[i] = src[i]
    return
end
group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src)

function checked_indexing_kernel(dest, src)
    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
    dest[i] = src[i]
    return
end
group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src)

function rand_kernel(dest::AbstractArray{T}) where {T}
    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
    dest[i] = rand(T)
    return
end
group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest)

perf/latency.jl

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
module Latency

using CUDA
using BenchmarkTools

function main()
    results = BenchmarkGroup()

    base_cmd = Base.julia_cmd()
    if Base.JLOptions().project != C_NULL
        base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))`
    end
    # NOTE: we don't use Base.active_project() here because of how CI launches this
    # script, starting with --project in the main CUDA.jl project.

    # time to precompile the package and its dependencies
    precompile_cmd =
        `$base_cmd -e "pkg = Base.identify_package(\"CUDA\")
                       Base.compilecache(pkg)"`
    results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60

    # time to actually import the package
    import_cmd =
        `$base_cmd -e "using CUDA"`
    results["import"] = @benchmark run($import_cmd) evals=1 seconds=30

    # time to actually compile a kernel
    ttfp_cmd =
        `$base_cmd -e "using CUDA
                       kernel() = return
                       CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"`
    results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60

    results
end

end

Latency.main()
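
These benchmarks shell out because import and compilation latency can only be measured in a fresh process. `Base.julia_cmd()` reproduces the current executable together with flags such as the system image, so the child processes run under the same configuration as the parent; the exact value is machine-specific, for example:

julia> Base.julia_cmd()
`/usr/local/bin/julia -Cnative -J/usr/local/lib/julia/sys.so -g1`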

perf/metal.jl

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
group = addgroup!(SUITE, "cuda")

let group = addgroup!(group, "synchronization")
    let group = addgroup!(group, "stream")
        group["blocking"] = @benchmarkable synchronize(blocking=true)
        group["auto"] = @benchmarkable synchronize()
        group["nonblocking"] = @benchmarkable synchronize(spin=false)
    end
    let group = addgroup!(group, "context")
        group["blocking"] = @benchmarkable device_synchronize(blocking=true)
        group["auto"] = @benchmarkable device_synchronize()
        group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
    end
end

perf/metaldevrt.jl

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
module cudadevrt

using CUDA, BenchmarkTools, Random

const threads = 256

# simple kernel adding a matrix and a vector
function kernel_add_mat_vec(m, x1, x2, y)
    # one block per column
    offset = (blockIdx().x-1) * m
    @inbounds xtmp = x2[blockIdx().x]
    for i = threadIdx().x : blockDim().x : m
        @inbounds y[offset + i] = x1[offset + i] + xtmp
    end
    return
end

function add!(y, x1, x2)
    m, n = size(x1)
    @cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
end

function main()
    Random.seed!(1)
    m, n = 3072, 1536 # multiple of 256
    x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5))
    x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
    y1 = similar(x1)

    results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)

    # BenchmarkTools captures inputs (JuliaCI/BenchmarkTools.jl#127), so forcibly free them
    CUDA.unsafe_free!(x1)
    CUDA.unsafe_free!(x2)
    CUDA.unsafe_free!(y1)

    return results
end

end

cudadevrt.main()
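
The kernel's inner loop is a block-stride loop over one column: thread `t` of a block handles elements `t, t+256, t+512, …` of its column. A small host-side check of that arithmetic for the sizes used above:

# With m = 3072 rows and 256 threads per block, each thread covers 12 elements.
m, threads = 3072, 256
@assert length(collect(1:threads:m)) == cld(m, threads) == 12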
