Skip to content

Commit a34bb8d

Browse files
committed
cp change-block-thread
1 parent 3a74536 commit a34bb8d

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1457,7 +1457,7 @@ __host__ __device__ constexpr static U arrayConvert(T const& input) {
14571457
// (k-1)*rows_in_input all map to row 0 in the original matrix. Thus, to know where to read in the
14581458
// source matrix, we simply take the modulus of the expanded index.
14591459

1460-
constexpr static int EXPAND_THREADS_PER_BLOCK = 256;
1460+
constexpr static int EXPAND_THREADS_PER_BLOCK = 128;
14611461

14621462
template <class InputActivationsType, class ExpandedActivationsType,
14631463
TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType BlockScalingType,
@@ -1697,7 +1697,7 @@ void expandInputRowsKernelLauncher(
16971697

16981698
static int64_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
16991699
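// Note: Launching 16 blocks of 128 threads per SM keeps the same thread count per SM as the earlier
// 8 blocks of 256 threads, which could fully leverage the memory bandwidth (tested on B200).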
1700-
int64_t const blocks = std::min(smCount * 8, std::max(num_rows * k, num_padding_tokens));
1700+
int64_t const blocks = std::min(smCount * 16, std::max(num_rows * k, num_padding_tokens));
17011701
int64_t const threads = EXPAND_THREADS_PER_BLOCK;
17021702

17031703
auto func = [&]() {

0 commit comments

Comments
 (0)