|
| 1 | +using LinearAlgebra, CUDA, Dagger, BenchmarkTools |
| 2 | + |
| 3 | +# SPD test matrix of order N: G*G' is symmetric positive semidefinite, adding N*I makes it positive definite |
| 4 | +N = 10_000 |
| 5 | +A = let G = randn(N, N) |
| 6 | + G*G' + N*I |
| 7 | +end |
| 8 | +# Right-hand side (standard normal entries) |
| 9 | +b = randn(N) |
| 10 | + |
| 11 | + |
| 12 | +# Cholesky: CUDA ############################################################# |
| 13 | + |
| 14 | +# SPD Matrix and right-hand side on GPU (CUDA) |
| 15 | +A_cuda = CUDA.CuArray(A) |
| 16 | +b_cuda = CUDA.CuArray(b) |
| 17 | + |
| 18 | +# Warm-up (compiles the code paths; x_cuda is kept for the error check below) |
| 19 | +x_cuda = cholesky(A_cuda) \ b_cuda |
| 20 | + |
| 21 | +# Benchmark time and memory (CUDA.@sync waits for pending GPU work so it is included in the timing) |
| 22 | +@time CUDA.@sync cholesky(A_cuda) # 0.895192 seconds (660 allocations: 12.094 KiB) |
| 23 | +#@benchmark CUDA.@sync cholesky($A_cuda) |
| 24 | +@time CUDA.@sync cholesky(A_cuda) \ b_cuda # 0.882263 seconds (953 allocations: 16.844 KiB) |
| 25 | +#@benchmark CUDA.@sync cholesky($A_cuda) \ $b_cuda |
| 26 | + |
| 27 | +# Errors (relative residual of the warm-up solution) |
| 28 | +e_cuda = norm(A_cuda*x_cuda - b_cuda)/norm(b_cuda) |
| 29 | + |
| 30 | +# Free memory: drop the references, collect, then hand cached GPU memory back |
| 31 | +A_cuda = nothing |
| 32 | +b_cuda = nothing |
| 33 | +GC.gc() |
| 34 | +CUDA.reclaim() |
| 35 | + |
| 36 | + |
| 37 | +# Cholesky: Dagger ########################################################### |
| 38 | + |
| 39 | +# SPD Matrix and right-hand side on GPU (Dagger Distributed), split into blocks of size N÷4 per dimension |
| 40 | +A_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do |
| 41 | + distribute(A, Blocks(N÷4, N÷4)) |
| 42 | +end |
| 43 | +b_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do |
| 44 | + distribute(b, Blocks(N÷4)) |
| 45 | +end |
| 46 | + |
| 47 | +# Warm-up: factorization alone, then factorization + solve (x_d keeps the solution for the error check below) |
| 48 | +Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do |
| 49 | + cholesky(A_d) |
| 50 | +end |
| 51 | +x_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do |
| 52 | + cholesky(A_d) \ b_d |
| 53 | +end |
| 54 | + |
| 55 | +CUDA.reclaim() |
| 56 | + |
| 57 | +# Benchmark time and memory |
| 58 | +@time Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do |
| 59 | + cholesky(A_d) |
| 60 | +end # 1.039517 seconds (88.80 k allocations: 4.146 MiB, 7.74% gc time, 18 lock conflicts, 5.70% compilation time: <1% of which was recompilation) |
| 61 | +@time Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do |
| 62 | + cholesky(A_d) \ b_d |
| 63 | +end # 1.046506 seconds (88.80 k allocations: 4.146 MiB, 7.74% gc time, 18 lock conflicts, 5.70% compilation time: <1% of which was recompilation) |
| 64 | +# @benchmark Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do |
| 65 | +# cholesky($A_d) |
| 66 | +# end samples = 1 |
| 67 | +# @benchmark Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do |
| 68 | +# cholesky($A_d) \ $b_d |
| 69 | +# end samples = 1 |
| 70 | + |
| 71 | + |
| 72 | +# Errors (relative residual of the warm-up solution, same metric as e_cuda) |
| 73 | +e_d = norm(A_d*x_d - b_d)/norm(b_d) |
| 74 | + |
| 75 | +# Free memory |
| 76 | +A_d = nothing |
| 77 | +b_d = nothing |
| 78 | +GC.gc() |
| 79 | + |
0 commit comments