Skip to content

Commit fe652e1

Browse files
committed
add transpose performance measurements
1 parent ff71015 commit fe652e1

File tree

1 file changed

+111
-0
lines changed

1 file changed

+111
-0
lines changed

examples/performance.jl

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
using KernelAbstractions
2+
using CUDAapi
3+
4+
# Terminate the script right away when `CUDAapi.has_cuda_gpu()` reports no
# CUDA GPU; nothing below this line runs in that case.
CUDAapi.has_cuda_gpu() || exit()
5+
6+
using CuArrays
7+
using CUDAdrv
8+
using CUDAnative
9+
using CUDAnative.NVTX
10+
11+
# Naive transpose kernel: one work-item per element.
#
# Each work-item reads its global Cartesian index `(row, col)` and stores
# `a[col, row]` into `b[row, col]`. Bounds checking is switched off with
# `@inbounds`, so the launch range must not exceed the extents of `b`
# (and, mirrored, of `a`).
@kernel function transpose_kernel_naive!(b, a)
    idx = @index(Global, Cartesian)
    row, col = Tuple(idx)
    @inbounds b[row, col] = a[col, row]
end
16+
17+
# Launch geometry shared by the kernels and benchmarks in this file:
# `block_dim` is the edge length of a square tile of work-items, and
# `grid_dim` is the number of such tiles along each matrix dimension
# (the matrix edge is defined further down as `grid_dim * block_dim`).
const block_dim = 32
18+
const grid_dim = 256
19+
20+
# Baseline transpose kernel driven by linear indices.
#
# The kernel is meant to be launched over a 1-D range: it takes the linear
# global and local indices and rebuilds a 2-D (block, thread) layout from them
# using the file-level constants `block_dim` and `grid_dim`:
#
#   * the local index is split into `(thread_idx_x, thread_idx_y)` inside a
#     `block_dim` x `block_dim` tile,
#   * the workgroup number is split into `(block_idx_x, block_idx_y)` inside a
#     grid that is `grid_dim` tiles wide.
#
# The arithmetic is only meaningful when the workgroup holds exactly
# `block_dim^2` work-items and the launch range covers
# `(grid_dim * block_dim)^2` elements; `@inbounds` removes the bounds checks
# that would otherwise catch a mismatched launch.
@kernel function transpose_kernel!(b, a)
    block_dim_x, block_dim_y = block_dim, block_dim
    grid_dim_x = grid_dim

    wgsize = prod(groupsize())

    I = @index(Global)
    L = @index(Local)
    # 1-based number of the workgroup this work-item belongs to.
    G = div(I - 1, wgsize) + 1

    # Position of the work-item inside its tile (x varies fastest).
    thread_idx_x = (L - 1) % block_dim_x + 1
    thread_idx_y = div(L - 1, block_dim_x) + 1

    # Position of the tile inside the grid (x varies fastest).
    block_idx_x = (G - 1) % grid_dim_x + 1
    block_idx_y = div(G - 1, grid_dim_x) + 1

    # Matrix coordinates handled by this work-item.
    i = (block_idx_x - 1) * block_dim_x + thread_idx_x
    j = (block_idx_y - 1) * block_dim_y + thread_idx_y

    # Column-major linear indexing: this is b[i, j] = a[j, i].
    @inbounds b[i + size(b, 1) * (j - 1)] = a[j + size(a, 1) * (i - 1)]
end
41+
42+
# Benchmark problem definition:
#   T     - element type of the matrices passed to `rand`
#   N     - matrix edge length, `grid_dim * block_dim`
#   shape - dimensions of the (square) input matrix
#   nreps - number of kernel launches chained inside each profiled region
const T = Float32
43+
const N = grid_dim * block_dim
44+
const shape = N, N
45+
const nreps = 10
46+
47+
# Naive kernel with a square `block_dim` x `block_dim` workgroup.
NVTX.@range "Naive transpose $block_dim, $block_dim" let
    src = CuArray(rand(T, shape))
    dst = similar(src, shape[2], shape[1])
    workgroup = (block_dim, block_dim)
    launch! = transpose_kernel_naive!(CUDA(), workgroup, size(dst))

    # One launch outside the profiled region, checked against the host-side
    # adjoint of the input.
    done = launch!(dst, src)
    wait(done)
    @assert Array(dst) == Array(src)'

    # Profiled region: `nreps` launches, each depending on the previous event,
    # followed by a single wait on the last one.
    @CUDAdrv.profile begin
        for _ in 1:nreps
            done = launch!(dst, src; dependencies=(done,))
        end
        wait(done)
    end
end
62+
63+
# Naive kernel with a `block_dim^2` x 1 workgroup (all work-items of a group
# along the first dimension).
NVTX.@range "Naive transpose $(block_dim^2), 1" let
    src = CuArray(rand(T, shape))
    dst = similar(src, shape[2], shape[1])
    workgroup = (block_dim * block_dim, 1)
    launch! = transpose_kernel_naive!(CUDA(), workgroup, size(dst))

    # One launch outside the profiled region, checked against the host-side
    # adjoint of the input.
    done = launch!(dst, src)
    wait(done)
    @assert Array(dst) == Array(src)'

    # Profiled region: `nreps` launches, each depending on the previous event,
    # followed by a single wait on the last one.
    @CUDAdrv.profile begin
        for _ in 1:nreps
            done = launch!(dst, src; dependencies=(done,))
        end
        wait(done)
    end
end
78+
79+
# Naive kernel with a 1 x `block_dim^2` workgroup (all work-items of a group
# along the second dimension).
NVTX.@range "Naive transpose 1, $(block_dim^2)" let
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    # The workgroup size is built from the file-level constant `block_dim`;
    # there is no `blockdim` binding in this file.
    kernel! = transpose_kernel_naive!(CUDA(), (1, block_dim * block_dim), size(b))

    # One launch outside the profiled region, checked against the host-side
    # adjoint of the input.
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'

    # Profiled region: `nreps` launches, each depending on the previous event,
    # followed by a single wait on the last one.
    @CUDAdrv.profile begin
        for rep in 1:nreps
            event = kernel!(b, a, dependencies=(event,))
        end
        wait(event)
    end
end
94+
95+
# Baseline kernel: launched over a 1-D range of `length(dst)` work-items with
# `block_dim * block_dim` work-items per workgroup.
NVTX.@range "Baseline transpose" let
    src = CuArray(rand(T, shape))
    dst = similar(src, shape[2], shape[1])

    workgroup = block_dim * block_dim
    launch! = transpose_kernel!(CUDA(), workgroup, length(dst))

    # One launch outside the profiled region, checked against the host-side
    # adjoint of the input.
    done = launch!(dst, src)
    wait(done)
    @assert Array(dst) == Array(src)'

    # Profiled region: `nreps` launches, each depending on the previous event,
    # followed by a single wait on the last one.
    @CUDAdrv.profile begin
        for _ in 1:nreps
            done = launch!(dst, src; dependencies=(done,))
        end
        wait(done)
    end
end
111+

0 commit comments

Comments
 (0)