2 files changed, +6 −0 lines changed

@@ -198,6 +198,8 @@ static __global__ void flash_attn_vec_ext_f16(
 
         // When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
         // In such cases, skip the KV slice.
+        // On AMD __all_sync would not work correctly because it assumes a warp size of 64.
+#ifndef GGML_USE_HIP
         bool skip = true;
 #pragma unroll
         for (int j = 0; j < ncols; ++j) {
@@ -212,6 +214,7 @@ static __global__ void flash_attn_vec_ext_f16(
         if (__all_sync(0xFFFFFFFF, skip)) {
             continue;
         }
+#endif // GGML_USE_HIP
     }
 
     // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,

@@ -204,6 +204,8 @@ static __global__ void flash_attn_vec_ext_f32(
 
         // When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
         // In such cases, skip the KV slice.
+        // On AMD __all_sync would not work correctly because it assumes a warp size of 64.
+#ifndef GGML_USE_HIP
         bool skip = true;
 #pragma unroll
         for (int j = 0; j < ncols; ++j) {
@@ -217,6 +219,7 @@ static __global__ void flash_attn_vec_ext_f32(
         if (__all_sync(0xFFFFFFFF, skip)) {
             continue;
         }
+#endif // GGML_USE_HIP
     }
 
     float kqmax_new_arr[ncols];
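
Both hunks take the same approach: rather than making the all-lanes reduction wavefront-aware, the early exit is compiled out entirely on HIP. For illustration only, a minimal sketch of a backend-aware check is shown below; the helper name is hypothetical and not part of this patch. On HIP it dispatches to the native __all intrinsic, which reduces over the full 64-lane wavefront, while on CUDA it keeps the full-mask __all_sync over the 32-lane warp:

// Hypothetical sketch, not part of this patch: a backend-aware
// "do all lanes agree" check for CUDA warps and AMD wavefronts.
static __device__ __forceinline__ bool all_lanes_true(const bool pred) {
#ifdef GGML_USE_HIP
    return __all(pred);                   // HIP: reduces over the full 64-lane wavefront
#else
    return __all_sync(0xFFFFFFFF, pred);  // CUDA: full mask over the 32-lane warp
#endif
}

Note that this sketch is not semantically equivalent to the CUDA path: on AMD it would reduce over 64 lanes instead of 32, so two 32-wide logical warps would have to agree before a slice is skipped. Compiling the early exit out on HIP, as the patch does, sidesteps that subtlety at the cost of a missed optimization on AMD hardware.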