Fix #772 (#790)

ikawrakow · Iwan Kawrakow · web-flow · commit 45afaf33916a · 2025-09-23T16:43:02.000+02:00
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -1408,7 +1408,7 @@ void launch_fattn_mma(
 
         //const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75;
         //  On my RTX-4080 the above is slightly slower for PP. It would be useful to try and see what happens on Blackwell
-        const bool use_stream_k = tiles_efficiency_percent < 75;
+        const bool use_stream_k = tiles_efficiency_percent < 75 || Q->ne[1] > 2048;
 
         blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
         blocks_num.y = 1;