1
1
function transpose_kernel! (
2
- state, At, A, width, height, A_local , :: Val{BLOCK }
3
- ) where BLOCK
2
+ state, At, A:: AbstractArray{T} , width, height, :: Val{BLOCK} , :: Val{LMem }
3
+ ) where { BLOCK, LMem, T}
4
4
5
5
ui1 = UInt32 (1 )
6
6
bidx_x = blockidx_x (state) - ui1
7
7
bidx_y = blockidx_y (state) - ui1
8
8
tidx_x = threadidx_x (state) - ui1
9
9
tidx_y = threadidx_y (state) - ui1
10
10
11
+ A_local = LocalMemory (state, T, LMem)
12
+
11
13
base_idx_a = bidx_x * BLOCK + bidx_y * (BLOCK * width)
12
14
base_idx_a_t = bidx_y * BLOCK + bidx_x * (BLOCK * height)
13
15
@@ -31,8 +33,8 @@ function Base.transpose!{T}(At::GPUArray{T, 2}, A::GPUArray{T, 2})
31
33
dev = GPUArrays. device (A)
32
34
block_size = max_block_size (dev, size (A)... )
33
35
outsize = UInt32 .(size (At))
34
- lmem = GPUArrays . LocalMemory {T} ( block_size * (block_size + 1 ) )
35
- args = (At, A, outsize... , lmem , Val {block_size } ())
36
+ lmem = block_size * (block_size + 1 )
37
+ args = (At, A, outsize... , Val {block_size} () , Val {lmem } ())
36
38
gpu_call (transpose_kernel!, At, args, (block_size, block_size))
37
39
At
38
40
end
0 commit comments