@@ -56,7 +56,7 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int
5656 const int tx = blockIdx .y * CUDA_CPY_TILE_DIM_2D + threadIdx .x ; // transpose block offset
5757 const int ty = blockIdx .x * CUDA_CPY_TILE_DIM_2D + threadIdx .y ;
5858
59- __shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D];
59+ __shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+ 1 ];
6060
6161#pragma unroll
6262 for (int i = 0 ; i < CUDA_CPY_BLOCK_NM; ++i) {
@@ -69,7 +69,7 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int
6969 for (int j = 0 ; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
7070 if (x < ne01 && y + j < ne00){
7171 const int row = threadIdx .y +j;
72- const int col = ( threadIdx .x ^ row)* sizeof (float )/sizeof (T); // swizzling to avoid bank conflicts
72+ const int col = threadIdx .x * sizeof (float )/sizeof (T);
7373 T *tile2 = reinterpret_cast <T*>(tile[row]);
7474 tile2[col] = src[imat*n + (y+j)*ne01 + x];
7575 }
@@ -80,7 +80,7 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int
8080#pragma unroll
8181 for (int j = 0 ; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
8282 if (ty + j < ne01 && tx < ne00) {
83- const int col = (( threadIdx .y +j) ^ threadIdx . x ) *sizeof (float )/sizeof (T); // swizzling to avoid bank conflicts
83+ const int col = (threadIdx .y +j)*sizeof (float )/sizeof (T);
8484 const T *tile2 = reinterpret_cast <const T*>(tile[threadIdx .x ]);
8585 dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];
8686 }
0 commit comments