Skip to content

Commit 2eb5117

Browse files
author
bssrdf
committed
now bank-conflict free
1 parent 51a2590 commit 2eb5117

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

ggml/src/ggml-cuda/cpy.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -69,7 +69,7 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int
6969
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
7070
if(x < ne01 && y + j < ne00){
7171
const int row = threadIdx.y+j;
72-
const int col = (threadIdx.x*sizeof(float)/sizeof(T)) ^ row; //swizzling to avoid bank conflicts
72+
const int col = (threadIdx.x ^ row)*sizeof(float)/sizeof(T); //swizzling to avoid bank conflicts
7373
T *tile2 = reinterpret_cast<T*>(tile[row]);
7474
tile2[col] = src[imat*n + (y+j)*ne01 + x];
7575
}
@@ -80,7 +80,7 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int
8080
#pragma unroll
8181
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
8282
if (ty + j < ne01 && tx < ne00) {
83-
const int col = ((threadIdx.y+j)*sizeof(float)/sizeof(T)) ^ threadIdx.x; //swizzling to avoid bank conflicts
83+
const int col = ((threadIdx.y+j) ^ threadIdx.x)*sizeof(float)/sizeof(T); //swizzling to avoid bank conflicts
8484
const T *tile2 = reinterpret_cast<const T*>(tile[threadIdx.x]);
8585
dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];
8686
}

0 commit comments

Comments
 (0)