reduced bank conflicts for fp16 and bf16

bssrdf · bssrdf · commit 3b8100c7bff1 · 2025-11-04T09:45:35.000-05:00
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
@@ -56,7 +56,7 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int
     const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x;  // transpose block offset
     const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
 
-    __shared__ T tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D];
+    __shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D];
 
 #pragma unroll
     for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {
@@ -69,8 +69,9 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int
         for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
             if(x < ne01 && y + j < ne00){
                 const int row = threadIdx.y+j;
-                const int col = threadIdx.x ^ row;  //swizzling to avoid bank conflicts
-                tile[row][col] = src[imat*n + (y+j)*ne01 + x];
+                const int col = (threadIdx.x*sizeof(float)/sizeof(T)) ^ row;  //swizzling to avoid bank conflicts
+                T *tile2 = reinterpret_cast<T*>(tile[row]);
+                tile2[col] = src[imat*n + (y+j)*ne01 + x];
             }
         }
 
@@ -79,8 +80,9 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int
 #pragma unroll
         for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
             if (ty + j < ne01 && tx < ne00) {
-                const int col = (threadIdx.y+j) ^ threadIdx.x; //swizzling to avoid bank conflicts
-                dst[imat*n + (ty+j)*ne00 + tx] = tile[threadIdx.x][col];
+                const int col = ((threadIdx.y+j)*sizeof(float)/sizeof(T)) ^ threadIdx.x; //swizzling to avoid bank conflicts
+                T *tile2 = reinterpret_cast<T*>(tile[threadIdx.x]);
+                dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];
             }
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -56,7 +56,7 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int`
`56`	`56`	`const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset`
`57`	`57`	`const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;`
`58`	`58`
`59`		`- __shared__ T tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D];`
	`59`	`+ __shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D];`
`60`	`60`
`61`	`61`	`#pragma unroll`
`62`	`62`	`for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {`
`@@ -69,8 +69,9 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int`
`69`	`69`	`for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {`
`70`	`70`	`if(x < ne01 && y + j < ne00){`
`71`	`71`	`const int row = threadIdx.y+j;`
`72`		`- const int col = threadIdx.x ^ row; //swizzling to avoid bank conflicts`
`73`		`- tile[row][col] = src[imat*n + (y+j)*ne01 + x];`
	`72`	`+ const int col = (threadIdx.x*sizeof(float)/sizeof(T)) ^ row; //swizzling to avoid bank conflicts`
	`73`	`+ T *tile2 = reinterpret_cast<T*>(tile[row]);`
	`74`	`+ tile2[col] = src[imat*n + (y+j)*ne01 + x];`
`74`	`75`	`}`
`75`	`76`	`}`
`76`	`77`
`@@ -79,8 +80,9 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int`
`79`	`80`	`#pragma unroll`
`80`	`81`	`for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {`
`81`	`82`	`if (ty + j < ne01 && tx < ne00) {`
`82`		`- const int col = (threadIdx.y+j) ^ threadIdx.x; //swizzling to avoid bank conflicts`
`83`		`- dst[imat*n + (ty+j)*ne00 + tx] = tile[threadIdx.x][col];`
	`83`	`+ const int col = ((threadIdx.y+j)*sizeof(float)/sizeof(T)) ^ threadIdx.x; //swizzling to avoid bank conflicts`
	`84`	`+ T *tile2 = reinterpret_cast<T*>(tile[threadIdx.x]);`
	`85`	`+ dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];`
`84`	`86`	`}`
`85`	`87`	`}`
`86`	`88`	`}`