2121
2222@kernel function simple_transpose_kernel! (output, @Const (input))
2323 I, J = @index (Global, NTuple)
24- @inbounds output[I, J ] = input[I, J]
24+ @inbounds output[J, I ] = input[I, J]
2525end
2626
2727# Local memory variants
@@ -141,8 +141,10 @@ for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM*TILE_DIM, 1), (1, TILE_DIM*TI
141141 output = similar (input)
142142
143143 # compile kernel
144- kernel (input, output , ndrange= size (output))
144+ kernel (output, input , ndrange= size (output))
145145 for rep in 1 : nreps
146+ kernel (output, input, ndrange= size (output))
147+ end
146148 KernelAbstractions. synchronize (backend)
147149 end
148150 end
@@ -159,9 +161,9 @@ for (name, kernel) in (
159161 output = similar (input)
160162
161163 # compile kernel
162- kernel (input, output , Val (Int (bank)), ndrange= size (output))
164+ kernel (output, input , Val (Int (bank)), ndrange= size (output))
163165 for rep in 1 : nreps
164- kernel (input, output , Val (Int (bank)), ndrange= size (output))
166+ kernel (output, input , Val (Int (bank)), ndrange= size (output))
165167 end
166168 KernelAbstractions. synchronize (backend)
167169 end
@@ -185,9 +187,9 @@ for (name, kernel) in (
185187 ndrange = (N, div (N, block_factor))
186188
187189 # compile kernel
188- kernel (input, output , Val (Int (bank)), ndrange= ndrange)
190+ kernel (output, input , Val (Int (bank)), ndrange= ndrange)
189191 for rep in 1 : nreps
190- kernel (input, output , Val (Int (bank)), ndrange= ndrange)
192+ kernel (output, input , Val (Int (bank)), ndrange= ndrange)
191193 end
192194 KernelAbstractions. synchronize (backend)
193195 end
0 commit comments