File tree Expand file tree Collapse file tree 1 file changed +1
-1
lines changed Expand file tree Collapse file tree 1 file changed +1
-1
lines changed Original file line number Diff line number Diff line change @@ -252,7 +252,6 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
252252 const int cc = ggml_cuda_info ().devices [ggml_cuda_get_device ()].cc ;
253253 const enum ggml_prec prec = ggml_flash_attn_ext_get_prec (KQV);
254254
255- // On AMD the tile kernels perform poorly, use the vec kernel instead:
256255 if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
257256#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
258257 if (fp16_mma_available (cc) && dst->src [0 ]->ne [1 ] > 8 ) {
@@ -261,6 +260,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
261260 }
262261#endif // defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
263262
263+ // On AMD the tile kernels perform poorly, use the vec kernel instead:
264264 if (prec == GGML_PREC_DEFAULT && fast_fp16_available (cc)) {
265265 ggml_cuda_flash_attn_ext_vec_f16 (ctx, dst);
266266 } else {
You can’t perform that action at this time.
0 commit comments