Commit 4cf8253

Copy-paste CUDA benchmarks
1 parent d6ca5c2 commit 4cf8253

File tree

10 files changed, +726 −0 lines changed


perf/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
results.json
reference.json

perf/Project.toml

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

perf/array.jl

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
group = addgroup!(SUITE, "array")

const m = 512
const n = 1000

# generate some arrays
cpu_mat = rand(rng, Float32, m, n)
gpu_mat = CuArray{Float32}(undef, size(cpu_mat))
gpu_vec = reshape(gpu_mat, length(gpu_mat))
gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
gpu_mat_ints = CuArray(rand(rng, Int, m, n))
gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
gpu_mat_bools = CuArray(rand(rng, Bool, m, n))
gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))

group["construct"] = @benchmarkable CuArray{Int}(undef, 1)

group["copy"] = @async_benchmarkable copy($gpu_mat)

gpu_mat2 = copy(gpu_mat)
let group = addgroup!(group, "copyto!")
    group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
    group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
    group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
end

let group = addgroup!(group, "iteration")
    group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10]

    group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]

    let group = addgroup!(group, "findall")
        group["bool"] = @benchmarkable findall($gpu_vec_bools)
        group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
    end

    let group = addgroup!(group, "findfirst")
        group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
        group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
    end

    let group = addgroup!(group, "findmin") # findmax
        group["1d"] = @async_benchmarkable findmin($gpu_vec)
        group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
    end
end

let group = addgroup!(group, "reverse")
    group["1d"] = @async_benchmarkable reverse($gpu_vec)
    group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
    group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
    group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
end

group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0

# no need to test the in-place version; it performs the same operation, just without the alloc
let group = addgroup!(group, "accumulate")
    group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
    group["2d"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
end

let group = addgroup!(group, "reductions")
    let group = addgroup!(group, "reduce")
        group["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
        group["2d"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
    end

    let group = addgroup!(group, "mapreduce")
        group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
        group["2d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
    end

    # used by sum, prod, minimum, maximum, all, any, count
end

let group = addgroup!(group, "random")
    let group = addgroup!(group, "rand")
        group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n)
        group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n)
    end

    let group = addgroup!(group, "rand!")
        group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec)
        group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints)
    end

    let group = addgroup!(group, "randn")
        group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n)
        #group["Int64"] = @async_benchmarkable CUDA.randn(Int64, m*n)
    end

    let group = addgroup!(group, "randn!")
        group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec)
        #group["Int64"] = @async_benchmarkable CUDA.randn!($gpu_vec_ints)
    end
end

let group = addgroup!(group, "sorting")
    group["1d"] = @async_benchmarkable sort($gpu_vec)
    group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
    group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
end

let group = addgroup!(group, "permutedims")
    group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
    group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
    group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
end
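
A note on two names used above but not defined in this diff: `rng` is presumably a StableRNG provided by the harness (StableRNGs is a declared dependency), and `@async_benchmarkable` must likewise come from the suite's driver. A plausible minimal definition of the latter, assuming it merely wraps the expression in CUDA.@sync so that asynchronous GPU work is included in the timing:

# Sketch only -- not part of this commit; the real definition lives in the harness.
macro async_benchmarkable(ex...)
    quote
        @benchmarkable CUDA.@sync $(ex...)
    end
end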

perf/byval.jl

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
module ByVal

using CUDA, BenchmarkTools, Random
using CUDA: i32

const threads = 256

# simple matrix-addition kernel
function kernel_add_mat(n, x1, x2, y)
    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
    if i <= n
        @inbounds y[i] = x1[i] + x2[i]
    end
    return
end

@inline get_inputs3(indx_y, a, b, c) = (a, b, c)
@inline get_inputs3(indx_y, a1, a2, b1, b2, c1, c2) = indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2)
@inline get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3) = indx_y == 1 ? (a1, b1, c1) : indx_y == 2 ? (a2, b2, c2) : (a3, b3, c3)

# kernel adding arrays of matrices
function kernel_add_mat_z_slices(n, vararg...)
    x1, x2, y = get_inputs3(blockIdx().y, vararg...)
    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
    if i <= n
        @inbounds y[i] = x1[i] + x2[i]
    end
    return
end

function add_z_slices!(y, x1, x2)
    m1, n1 = size(x1[1]) # get size of first slice
    blocks = (m1 * n1 + threads - 1) ÷ threads
    # launch length(x1) times the blocks needed for a single slice; blockIdx().y selects the slice
    @cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
end

function add!(y, x1, x2)
    m1, n1 = size(x1)
    blocks = (m1 * n1 + threads - 1) ÷ threads
    @cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
end

function main()
    results = BenchmarkGroup()

    num_z_slices = 3
    Random.seed!(1)

    #m, n = 7, 5       # tiny, to measure overhead
    #m, n = 521, 111
    #m, n = 1521, 1111
    #m, n = 3001, 1511 # prime numbers to test memory access correctness
    m, n = 3072, 1536  # multiple of 256
    #m, n = 6007, 3001 # prime numbers to test memory access correctness

    x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
    x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
    y1 = [similar(x1[1]) for i = 1:num_z_slices]

    # reference: a bare-bones add on the GPU
    results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])

    # adding arrays in an array
    for slices = 1:num_z_slices
        results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
    end

    # BenchmarkTools captures inputs (JuliaCI/BenchmarkTools.jl#127), so forcibly free them
    CUDA.unsafe_free!.(x1)
    CUDA.unsafe_free!.(x2)
    CUDA.unsafe_free!.(y1)

    return results
end

end

ByVal.main()
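
One detail worth spelling out: the `blocks` computation in `add_z_slices!` and `add!` is integer ceiling division, which Julia also provides as `cld`. A quick check with the sizes used above:

# (m1 * n1 + threads - 1) ÷ threads rounds up, guaranteeing blocks * threads >= m1 * n1
threads = 256
m1, n1 = 3072, 1536
@assert (m1 * n1 + threads - 1) ÷ threads == cld(m1 * n1, threads) == 18432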

perf/kernel.jl

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
using CUDA: i32

group = addgroup!(SUITE, "kernel")

group["launch"] = @benchmarkable @cuda identity(nothing)

group["occupancy"] = @benchmarkable begin
    kernel = @cuda launch=false identity(nothing)
    launch_configuration(kernel.fun)
end

src = CUDA.rand(Float32, 512, 1000)
dest = similar(src)
function indexing_kernel(dest, src)
    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
    @inbounds dest[i] = src[i]
    return
end
group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src)

function checked_indexing_kernel(dest, src)
    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
    dest[i] = src[i]
    return
end
group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src)

function rand_kernel(dest::AbstractArray{T}) where {T}
    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
    dest[i] = rand(T)
    return
end
group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest)

perf/latency.jl

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
module Latency

using CUDA
using BenchmarkTools

function main()
    results = BenchmarkGroup()

    base_cmd = Base.julia_cmd()
    if Base.JLOptions().project != C_NULL
        base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))`
    end
    # NOTE: we don't use Base.active_project() here because of how CI launches this
    # script, starting with --project in the main CUDA.jl project.

    # time to precompile the package and its dependencies
    precompile_cmd =
        `$base_cmd -e "pkg = Base.identify_package(\"CUDA\")
                       Base.compilecache(pkg)"`
    results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60

    # time to actually import the package
    import_cmd =
        `$base_cmd -e "using CUDA"`
    results["import"] = @benchmark run($import_cmd) evals=1 seconds=30

    # time to actually compile a kernel
    ttfp_cmd =
        `$base_cmd -e "using CUDA
                       kernel() = return
                       CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"`
    results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60

    results
end

end

Latency.main()
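
These benchmarks shell out because import and compilation latency can only be measured in a fresh process. `Base.julia_cmd()` reproduces the current executable together with flags such as the system image, so the child processes run under the same configuration as the parent; the exact value is machine-specific, for example:

julia> Base.julia_cmd()
`/usr/local/bin/julia -Cnative -J/usr/local/lib/julia/sys.so -g1`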

perf/metal.jl

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
group = addgroup!(SUITE, "cuda")

let group = addgroup!(group, "synchronization")
    let group = addgroup!(group, "stream")
        group["blocking"] = @benchmarkable synchronize(blocking=true)
        group["auto"] = @benchmarkable synchronize()
        group["nonblocking"] = @benchmarkable synchronize(spin=false)
    end
    let group = addgroup!(group, "context")
        group["blocking"] = @benchmarkable device_synchronize(blocking=true)
        group["auto"] = @benchmarkable device_synchronize()
        group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
    end
end

perf/metaldevrt.jl

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
module cudadevrt

using CUDA, BenchmarkTools, Random

const threads = 256

# simple kernel adding a matrix and a vector
function kernel_add_mat_vec(m, x1, x2, y)
    # one block per column
    offset = (blockIdx().x-1) * m
    @inbounds xtmp = x2[blockIdx().x]
    for i = threadIdx().x : blockDim().x : m
        @inbounds y[offset + i] = x1[offset + i] + xtmp
    end
    return
end

function add!(y, x1, x2)
    m, n = size(x1)
    @cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
end

function main()
    Random.seed!(1)
    m, n = 3072, 1536 # multiple of 256
    x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5))
    x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
    y1 = similar(x1)

    results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)

    # BenchmarkTools captures inputs (JuliaCI/BenchmarkTools.jl#127), so forcibly free them
    CUDA.unsafe_free!(x1)
    CUDA.unsafe_free!(x2)
    CUDA.unsafe_free!(y1)

    return results
end

end

cudadevrt.main()
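
The kernel's inner loop is a block-stride loop over one column: thread `t` of a block handles elements `t, t+256, t+512, …` of its column. A small host-side check of that arithmetic for the sizes used above:

# With m = 3072 rows and 256 threads per block, each thread covers 12 elements.
m, threads = 3072, 256
@assert length(collect(1:threads:m)) == cld(m, threads) == 12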
