fix excessive KQ_b loads

JohannesGaessler · ggerganov · commit bb0d51accd7e · 2024-04-02T13:48:13.000+03:00
diff --git a/ggml-cuda/fattn.cu b/ggml-cuda/fattn.cu
@@ -387,12 +387,16 @@ static __global__ void flash_attn_ext_f16(
 
         __syncthreads();
 
-        frag_b KQ_b[FATTN_KQ_STRIDE/16][ncols/frag_n];
+        frag_b KQ_b[FATTN_KQ_STRIDE/(VKQ_ratio*16)][ncols/frag_n];
 #pragma unroll
         for (int j0 = 0; j0 < ncols; j0 += frag_n) {
 #pragma unroll
-            for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += 16) {
-                nvcuda::wmma::load_matrix_sync(KQ_b[k0/16][j0/frag_n], KQ + j0*kqs_padded + k0, kqs_padded);
+            for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) {
+                const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
+                nvcuda::wmma::load_matrix_sync(
+                    KQ_b[k0/(VKQ_ratio*16)][j0/frag_n],
+                    KQ + j0*kqs_padded + k,
+                    kqs_padded);
             }
         }
 
@@ -412,7 +416,7 @@ static __global__ void flash_attn_ext_f16(
                 nvcuda::wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
 #pragma unroll
                 for (int j = 0; j < ncols/frag_n; ++j) {
-                    nvcuda::wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k/16][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);
+                    nvcuda::wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);
                 }
             }
         }

Original file line number	Diff line number	Diff line change
`@@ -387,12 +387,16 @@ static __global__ void flash_attn_ext_f16(`
`387`	`387`
`388`	`388`	`__syncthreads();`
`389`	`389`
`390`		`- frag_b KQ_b[FATTN_KQ_STRIDE/16][ncols/frag_n];`
	`390`	`+ frag_b KQ_b[FATTN_KQ_STRIDE/(VKQ_ratio*16)][ncols/frag_n];`
`391`	`391`	`#pragma unroll`
`392`	`392`	`for (int j0 = 0; j0 < ncols; j0 += frag_n) {`
`393`	`393`	`#pragma unroll`
`394`		`- for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += 16) {`
`395`		`- nvcuda::wmma::load_matrix_sync(KQ_b[k0/16][j0/frag_n], KQ + j0*kqs_padded + k0, kqs_padded);`
	`394`	`+ for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) {`
	`395`	`+ const int k = k0 + (threadIdx.y % VKQ_ratio)*16;`
	`396`	`+ nvcuda::wmma::load_matrix_sync(`
	`397`	`+ KQ_b[k0/(VKQ_ratio*16)][j0/frag_n],`
	`398`	`+ KQ + j0*kqs_padded + k,`
	`399`	`+ kqs_padded);`
`396`	`400`	`}`
`397`	`401`	`}`
`398`	`402`
`@@ -412,7 +416,7 @@ static __global__ void flash_attn_ext_f16(`
`412`	`416`	`nvcuda::wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);`
`413`	`417`	`#pragma unroll`
`414`	`418`	`for (int j = 0; j < ncols/frag_n; ++j) {`
`415`		`- nvcuda::wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k/16][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);`
	`419`	`+ nvcuda::wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);`
`416`	`420`	`}`
`417`	`421`	`}`
`418`	`422`	`}`