21
21
22
22
@kernel function simple_transpose_kernel! (output, @Const (input))
23
23
I, J = @index (Global, NTuple)
24
- @inbounds output[I, J ] = input[I, J]
24
+ @inbounds output[J, I ] = input[I, J]
25
25
end
26
26
27
27
# Local memory variants
@@ -141,8 +141,10 @@ for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM*TILE_DIM, 1), (1, TILE_DIM*TI
141
141
output = similar (input)
142
142
143
143
# compile kernel
144
- kernel (input, output , ndrange= size (output))
144
+ kernel (output, input , ndrange= size (output))
145
145
for rep in 1 : nreps
146
+ kernel (output, input, ndrange= size (output))
147
+ end
146
148
KernelAbstractions. synchronize (backend)
147
149
end
148
150
end
@@ -159,9 +161,9 @@ for (name, kernel) in (
159
161
output = similar (input)
160
162
161
163
# compile kernel
162
- kernel (input, output , Val (Int (bank)), ndrange= size (output))
164
+ kernel (output, input , Val (Int (bank)), ndrange= size (output))
163
165
for rep in 1 : nreps
164
- kernel (input, output , Val (Int (bank)), ndrange= size (output))
166
+ kernel (output, input , Val (Int (bank)), ndrange= size (output))
165
167
end
166
168
KernelAbstractions. synchronize (backend)
167
169
end
@@ -185,9 +187,9 @@ for (name, kernel) in (
185
187
ndrange = (N, div (N, block_factor))
186
188
187
189
# compile kernel
188
- kernel (input, output , Val (Int (bank)), ndrange= ndrange)
190
+ kernel (output, input , Val (Int (bank)), ndrange= ndrange)
189
191
for rep in 1 : nreps
190
- kernel (input, output , Val (Int (bank)), ndrange= ndrange)
192
+ kernel (output, input , Val (Int (bank)), ndrange= ndrange)
191
193
end
192
194
KernelAbstractions. synchronize (backend)
193
195
end
0 commit comments