We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 611f419, commit 710dfc4 (Copy full SHA for 710dfc4)
ggml/src/ggml-cuda/fattn-tile-f16.cu
@@ -258,7 +258,7 @@ static __global__ void flash_attn_tile_ext_f16(
258
const half val = hexp(sink - kqmax[j0/nwarps]);
259
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
260
if (threadIdx.x == 0) {
261
- kqsum[j0/nwarps].x = __hadd(kqsum[j0/nwarps].x, val);
+ kqsum[j0/nwarps].x = __hadd(__low2half(kqsum[j0/nwarps]), val);
262
}
263
264
#pragma unroll
0 commit comments