File tree Expand file tree Collapse file tree 1 file changed +2
-2
lines changed
csrc/fused_moe/cutlass_backend Expand file tree Collapse file tree 1 file changed +2
-2
lines changed Original file line number Diff line number Diff line change @@ -1457,7 +1457,7 @@ __host__ __device__ constexpr static U arrayConvert(T const& input) {
1457
1457
// (k-1)*rows_in_input all map to row 0 in the original matrix. Thus, to know where to read in the
1458
1458
// source matrix, we simply take the modulus of the expanded index.
1459
1459
1460
- constexpr static int EXPAND_THREADS_PER_BLOCK = 256 ;
1460
+ constexpr static int EXPAND_THREADS_PER_BLOCK = 128 ;
1461
1461
1462
1462
template <class InputActivationsType , class ExpandedActivationsType ,
1463
1463
TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType BlockScalingType,
@@ -1697,7 +1697,7 @@ void expandInputRowsKernelLauncher(
1697
1697
1698
1698
static int64_t const smCount = tensorrt_llm::common::getMultiProcessorCount ();
1699
1699
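// Note: Launching 8 blocks of 256 threads per SM can fully leverage the memory bandwidth (tested on B200).
// The cap below uses 16 blocks per SM with EXPAND_THREADS_PER_BLOCK = 128, i.e. the same number of threads per SM.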
1700
- int64_t const blocks = std::min (smCount * 8 , std::max (num_rows * k, num_padding_tokens));
1700
+ int64_t const blocks = std::min (smCount * 16 , std::max (num_rows * k, num_padding_tokens));
1701
1701
int64_t const threads = EXPAND_THREADS_PER_BLOCK;
1702
1702
1703
1703
auto func = [&]() {
You can’t perform that action at this time.
0 commit comments