From 8aa6cc41e9637649b16d29b2a0f77ce44e839e5b Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 1 Oct 2025 07:26:16 -0700 Subject: [PATCH] revert max split heuristics Signed-off-by: Lucas Wilkinson --- csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp index 297d94dcc063..77e1c9351f97 100644 --- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp +++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp @@ -134,13 +134,6 @@ class MLA { int max_splits = ceil_div(K, 128); max_splits = min(16, max_splits); - // TODO: This avoids a hang when the batch size larger than 1 and - // there is more than 1 kv_splits. - // Discuss with NVIDIA how this can be fixed. - if (B > 1) { - max_splits = min(1, max_splits); - } - // printf(" max_splits = %d\n", max_splits); int sms_per_batch = max(1, sm_count / B); // printf(" sms_per_batch = %d\n", sms_per_batch);