
Commit e95d0bc

CUDA: fix FA VKQ accumulator overflow (#17746)
1 parent: 668ed76 · commit: e95d0bc

5 files changed, +11 -5 lines changed


ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 6 additions & 0 deletions
@@ -10,6 +10,12 @@
 #define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
 #define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
 
+// log(2) = 0.6931, by adding this to the KQ maximum used for the softmax the numerical range representable
+// by the VKQ accumulators is effectively being shifted up by a factor of 8.
+// This reduces issues with numerical overflow but also causes larger values to be flushed to zero.
+// However, as the output from FlashAttention will usually be used as an input for a matrix multiplication this should be negligible.
+#define FATTN_KQ_MAX_OFFSET 0.6931f
+
 typedef void (* fattn_kernel_t)(
         const char * __restrict__ Q,
         const char * __restrict__ K,
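For intuition, here is a minimal plain-C++ sketch, not part of the commit and not the fp16 CUDA kernels; the row size, scores, and values are made up for illustration. It shows the effect the offset is aimed at: folding log(2) into the KQ maximum scales every softmax weight by exp(-FATTN_KQ_MAX_OFFSET), so the unnormalized VKQ accumulator stays further below the fp16 limit of 65504, while the final normalization by the weight sum cancels the factor exactly.

// Minimal numeric sketch (plain C++, not the fp16 CUDA kernels; all sizes and
// values below are made up). With identical KQ scores every softmax weight is
// exp(0) = 1 without the offset and exp(-log(2)) = 0.5 with it, so the
// unnormalized VKQ accumulator is halved while the normalized output is unchanged.
#include <cmath>
#include <cstdio>

int main() {
    const float FATTN_KQ_MAX_OFFSET = 0.6931f;  // log(2), as in fattn-common.cuh
    const float HALF_MAX            = 65504.0f; // largest finite fp16 value (the real accumulators are fp16)
    const int   n_kv                = 4096;     // hypothetical number of KV entries in one row
    const float kq_score            = 12.0f;    // hypothetical KQ value, identical for every entry
    const float v_value             = 20.0f;    // hypothetical V value, identical for every entry

    const float offsets[] = {0.0f, FATTN_KQ_MAX_OFFSET};
    for (float offset : offsets) {
        const float kq_max = kq_score + offset; // the patched kernels fold the offset into KQ_max
        float vkq = 0.0f; // unnormalized V*softmax accumulator, fp16 in the real kernels
        float sum = 0.0f; // sum of softmax weights, used for the final normalization
        for (int i = 0; i < n_kv; ++i) {
            const float w = expf(kq_score - kq_max);
            vkq += w*v_value;
            sum += w;
        }
        printf("offset=%.4f  vkq=%.1f (%s fp16 max)  normalized output=%.3f\n",
               offset, vkq, vkq > HALF_MAX ? "exceeds" : "below", vkq/sum);
    }
    return 0;
}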

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 2 additions & 2 deletions
@@ -532,7 +532,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #pragma unroll
     for (int l = 0; l < T_C_KQ::ne; ++l) {
         if (!oob_check || k0 + T_C_KQ::get_i(l) < k_VKQ_sup) {
-            KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l]);
+            KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
         }
     }
 }
@@ -585,7 +585,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     for (int l = 0; l < T_C_KQ::ne; ++l) {
         if (!oob_check || k0 + T_C_KQ::get_j(l) < k_VKQ_sup) {
             // Turing + Volta:
-            KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l]);
+            KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
         }
     }
 }

ggml/src/ggml-cuda/fattn-tile.cuh

Lines changed: 1 addition & 1 deletion
@@ -572,7 +572,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
             KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0] += (ncols2 > 1 || mask) ?
                 slope*__half2float(mask[j*stride_mask + k_VKQ_0 + i_KQ]) : 0.0f;
 
-            KQ_max_new[jc0] = fmaxf(KQ_max_new[jc0], KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0]);
+            KQ_max_new[jc0] = fmaxf(KQ_max_new[jc0], KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0] + FATTN_KQ_MAX_OFFSET);
         }
     }

ggml/src/ggml-cuda/fattn-vec.cuh

Lines changed: 1 addition & 1 deletion
@@ -270,7 +270,7 @@ static __global__ void flash_attn_ext_vec(
             sum += slope*__half2float(maskh[j*ne11 + i_KQ]);
         }
 
-        KQ_max_new[j] = fmaxf(KQ_max_new[j], sum);
+        KQ_max_new[j] = fmaxf(KQ_max_new[j], sum + FATTN_KQ_MAX_OFFSET);
 
         if ((nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ) == uint32_t(i_KQ_0)) {
             KQ_reg[j] = sum;

ggml/src/ggml-cuda/fattn-wmma-f16.cu

Lines changed: 1 addition & 1 deletion
@@ -220,7 +220,7 @@ static __global__ void flash_attn_ext_f16(
 
             KQ_f_tmp[k0/warp_size] += mask && ic0 + j < int(ne01.z) ?
                 __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
-            KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/warp_size]);
+            KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/warp_size] + FATTN_KQ_MAX_OFFSET);
         }
         KQ_max_new = warp_reduce_max<warp_size>(KQ_max_new);
 
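All five kernels apply the same change: the offset is folded into KQ_max_new at the point where the running maximum is updated. As a rough illustration of where that value is then used, here is a hedged sketch of the generic streaming ("online") softmax update that FlashAttention-style kernels implement, written in plain C++ rather than the actual llama.cpp CUDA code, with made-up scores and values. Because the running maximum carries FATTN_KQ_MAX_OFFSET, the largest possible new weight is exp(-log(2)) = 0.5 instead of 1.0.

// Hedged sketch of the generic online-softmax update (plain C++, not the
// llama.cpp CUDA kernels; scores and values are made up). The running maximum
// m carries FATTN_KQ_MAX_OFFSET, shifting every exp() argument down by the offset.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const float FATTN_KQ_MAX_OFFSET = 0.6931f;               // log(2)
    const std::vector<float> KQ = {3.0f, 7.0f, 5.0f, 9.0f};  // hypothetical KQ scores, processed one at a time
    const std::vector<float> V  = {1.0f, 2.0f, 3.0f, 4.0f};  // hypothetical V values (scalar for brevity)

    float m   = -INFINITY; // running (shifted) maximum, analogous to KQ_max / KQ_max_new
    float vkq = 0.0f;      // running unnormalized V*softmax accumulator
    float sum = 0.0f;      // running sum of softmax weights

    for (size_t i = 0; i < KQ.size(); ++i) {
        const float m_new = fmaxf(m, KQ[i] + FATTN_KQ_MAX_OFFSET); // same pattern as the patched fmaxf calls
        const float scale = expf(m - m_new);                       // rescale old accumulators when the maximum grows
        vkq = vkq*scale + expf(KQ[i] - m_new)*V[i];                // largest possible new weight is exp(-log(2)) = 0.5
        sum = sum*scale + expf(KQ[i] - m_new);
        m   = m_new;
    }
    printf("output = %.6f\n", vkq/sum); // normalization cancels the offset exactly
    return 0;
}

The printed output matches a direct softmax because the numerator and the weight sum shrink by the same factor; the trade-off, as the new comment in fattn-common.cuh notes, is that values far below the (shifted) maximum are flushed to zero slightly earlier.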

0 commit comments
