We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 8cd2d7c, commit 45afaf3 (Copy full SHA for 45afaf3)
ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -1408,7 +1408,7 @@ void launch_fattn_mma(
1408
1409
//const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75;
1410
// On my RTX-4080 the above is slightly slower for PP. It would be useful to try and see what happens on Blackwell
1411
- const bool use_stream_k = tiles_efficiency_percent < 75;
+ const bool use_stream_k = tiles_efficiency_percent < 75 || Q->ne[1] > 2048;
1412
1413
blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
1414
blocks_num.y = 1;
0 commit comments