Commit eeb555e

chore: memoize weight shuffle index to speed up weight preproc in moe_backend=TRTLLM (NVIDIA#4826)
Signed-off-by: Anthony Chang <[email protected]>
1 parent 1b963c1 commit eeb555e
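
The core of the change is a shape-keyed memoization of the row-permute indices used to shuffle MoE weights for the TRTLLM backend. The sketch below is a minimal, hypothetical illustration of that pattern, not TensorRT-LLM code: compute_permute_indices and get_permute_indices are stand-ins for the real helpers (_maybe_get_cached_w3_w1_permute_indices and _maybe_get_cached_w2_permute_indices in quantization.py, shown in the diff below).

from typing import Dict

import torch

# Simplified sketch of the shape-keyed memoization this commit introduces.
# compute_permute_indices is a stand-in for the costly index-building
# helpers in fp4_utils; the real cache is likewise keyed by the
# destination tensor's shape.
_cache_permute_indices: Dict[torch.Size, torch.Tensor] = {}


def compute_permute_indices(weight: torch.Tensor) -> torch.Tensor:
    # Placeholder for an expensive, shape-dependent index computation
    # (deterministic per shape, as the memoization assumes).
    return torch.arange(weight.shape[0] - 1, -1, -1)


def get_permute_indices(weight: torch.Tensor) -> torch.Tensor:
    # All experts share the same weight shape, so the expensive step runs
    # once per distinct shape and becomes a dict lookup afterwards.
    if weight.shape not in _cache_permute_indices:
        _cache_permute_indices[weight.shape] = compute_permute_indices(
            weight).to(weight.device)
    return _cache_permute_indices[weight.shape]


# The first expert pays the cost; the remaining experts hit the cache.
experts = [torch.empty(8, 16) for _ in range(4)]
indices = [get_permute_indices(w) for w in experts]
assert all(idx is indices[0] for idx in indices)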

File tree

7 files changed: +99 -55 lines changed

cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ namespace Routing
 {
 
 // The type of method in top-K routing, for use in torch custom op
-// Please keep this in sync with the counterpart defined in tensorrt_llm/_torch/modules/fused_moe.py
+// Please keep this in sync with the counterpart defined in tensorrt_llm/_torch/modules/fused_moe/routing.py
 enum class RoutingMethodType : int64_t
 {
     // Default: Softmax -> TopK

tensorrt_llm/_torch/modules/fused_moe/quantization.py

Lines changed: 91 additions & 44 deletions
@@ -1,15 +1,13 @@
-import threading
 from abc import ABC, abstractmethod
-from typing import Dict, List, NamedTuple
+from typing import Dict, List, NamedTuple, Union
 
 import torch
 from torch import nn
 
 from tensorrt_llm._utils import get_sm_version
 from tensorrt_llm.quantization.utils.fp4_utils import (
     float4_sf_dtype, get_reorder_rows_for_gated_act_gemm_row_indices,
-    get_shuffle_matrix_a_row_indices, get_shuffle_matrix_sf_a_row_indices,
-    shuffle_matrix_a, shuffle_matrix_sf_a)
+    get_shuffle_matrix_a_row_indices, get_shuffle_matrix_sf_a_row_indices)
 
 from ..linear import TensorParallelMode, load_weight_shard
 from .interface import MoEWeightLoadingMode
@@ -80,12 +78,8 @@ def create_weights(self, module: torch.nn.Module, weight_dtype: torch.dtype,
 
     def load_weights(self, module: torch.nn.Module, weights: List[Dict],
                      weight_loading_mode: MoEWeightLoadingMode):
-        # Use multi-threading to load expert weights in parallel.
-        # Even though CPython has global interpreter lock (GIL),
-        # it's still faster to load weights in parallel because it can utilize
-        # CPU memory bandwidth better.
-        threads = []
-
+        # Multithread weight load is superseded by prefetch_files() in model_engine.py
+        # Also, threading adds overhead in order to protect shuffle index cache with critical section.
         for local_slot_id, expert_id in enumerate(
                 module.initial_local_expert_ids):
             # expert_idx is the local slot index of current rank
@@ -106,21 +100,11 @@ def load_weights(self, module: torch.nn.Module, weights: List[Dict],
                     f"Unknown weight loading mode in MoE: {weight_loading_mode}"
                 )
 
-            thread = threading.Thread(
-                target=self.load_expert_w3_w1_weight,
-                args=(module, w1_weight, w3_weight,
-                      module.w3_w1_weight.data[expert_idx]))
-            thread.start()
-            threads.append(thread)
-
-            thread = threading.Thread(target=self.load_expert_w2_weight,
-                                      args=(module, w2_weight,
-                                            module.w2_weight.data[expert_idx]))
-            thread.start()
-            threads.append(thread)
+            self.load_expert_w3_w1_weight(module, w1_weight, w3_weight,
+                                          module.w3_w1_weight.data[expert_idx])
 
-        for thread in threads:
-            thread.join()
+            self.load_expert_w2_weight(module, w2_weight,
+                                       module.w2_weight.data[expert_idx])
 
         self.load_quant_scales(module, weights)
         # Re-setup quant scales after loading weights as the tensors may have been modified.
@@ -1011,6 +995,53 @@ class NVFP4TRTLLMGenFusedMoEMethod(NVFP4FusedMoEMethod):
     weight_dtype = float4_sf_dtype
     block_scales_dtype = torch.float8_e4m3fn
 
+    # Cache the permute indices during weight loading to avoid recompute
+    # This assumes the same input shape always results in the same permute indices
+    _cache_permute_indices: Dict[torch.Size, torch.Tensor] = {}
+
+    def _maybe_get_cached_w3_w1_permute_indices(
+            self,
+            dst_w3_w1_weight: torch.Tensor,
+            epilogue_tile_m: int,
+            num_elts_per_sf: Union[None, int] = None) -> torch.Tensor:
+        if dst_w3_w1_weight.shape not in self._cache_permute_indices:
+            # Get permute indices and chain them together
+            permute0 = get_reorder_rows_for_gated_act_gemm_row_indices(
+                dst_w3_w1_weight)
+            if num_elts_per_sf is None:
+                permute1 = get_shuffle_matrix_a_row_indices(
+                    dst_w3_w1_weight, epilogue_tile_m=epilogue_tile_m)
+            else:
+                permute1 = get_shuffle_matrix_sf_a_row_indices(
+                    dst_w3_w1_weight,
+                    epilogue_tile_m=epilogue_tile_m,
+                    num_elts_per_sf=num_elts_per_sf)
+            # Memoize permute indices as recompute is **very** costly
+            self._cache_permute_indices[
+                dst_w3_w1_weight.shape] = permute0[permute1].to(
+                    dst_w3_w1_weight.device)
+        permute_indices = self._cache_permute_indices[dst_w3_w1_weight.shape]
+        return permute_indices
+
+    def _maybe_get_cached_w2_permute_indices(
+            self,
+            dst_w2_weight: torch.Tensor,
+            epilogue_tile_m: int,
+            num_elts_per_sf: Union[None, int] = None) -> torch.Tensor:
+        if dst_w2_weight.shape not in self._cache_permute_indices:
+            if num_elts_per_sf is None:
+                permute_indices = (get_shuffle_matrix_a_row_indices(
+                    dst_w2_weight, epilogue_tile_m).to(dst_w2_weight.device))
+            else:
+                permute_indices = (get_shuffle_matrix_sf_a_row_indices(
+                    dst_w2_weight,
+                    epilogue_tile_m=epilogue_tile_m,
+                    num_elts_per_sf=num_elts_per_sf).to(dst_w2_weight.device))
+            # Memoize permute indices as recompute is **very** costly
+            self._cache_permute_indices[dst_w2_weight.shape] = permute_indices
+        permute_indices = self._cache_permute_indices[dst_w2_weight.shape]
+        return permute_indices
+
     def create_weights(self, module: torch.nn.Module):
         weight_vec_size = torch.iinfo(self.weight_dtype).bits // 4
         block_scales_vec_size = 1
@@ -1056,16 +1087,13 @@ def load_expert_w3_w1_weight(self, module: torch.nn.Module,
         dst_w3_weight.copy_(w3_weight_shard.view(dst_w3_weight.dtype))
         dst_w1_weight.copy_(w1_weight_shard.view(dst_w1_weight.dtype))
 
-        # Get permute indices and chain them together
-        permute0 = get_reorder_rows_for_gated_act_gemm_row_indices(
-            dst_w3_w1_weight)
-        permute1 = get_shuffle_matrix_a_row_indices(dst_w3_w1_weight,
-                                                    epilogue_tile_m)
-        permute = permute0[permute1]
+        # Get permute indices
+        permute_indices = self._maybe_get_cached_w3_w1_permute_indices(
+            dst_w3_w1_weight, epilogue_tile_m)
 
         # Shuffle the weight according to permute indices
         processed_w31_weight_shard = torch.ops.trtllm.shuffle_matrix(
-            dst_w3_w1_weight, permute.to(dst_w3_w1_weight.device))
+            dst_w3_w1_weight, permute_indices.to(dst_w3_w1_weight.device))
 
         # Copy the result into device buffer
         dst_w3_w1_weight.copy_(processed_w31_weight_shard.view(
@@ -1085,8 +1113,14 @@ def load_expert_w2_weight(self, module: torch.nn.Module,
         # Keep weights in device buffer
         dst_w2_weight.copy_(w2_weight_shard.view(dst_w2_weight.dtype),
                             non_blocking=True)
-        # Get permuted result
-        processed_w2_weight = shuffle_matrix_a(dst_w2_weight, epilogue_tile_m)
+        # Get permuted indices
+        permute_indices = self._maybe_get_cached_w2_permute_indices(
+            dst_w2_weight, epilogue_tile_m)
+
+        # Shuffle the weight according to permute indices
+        processed_w2_weight = torch.ops.trtllm.shuffle_matrix(
+            dst_w2_weight, permute_indices.to(dst_w2_weight.device))
+
         # Copy the result into device buffer
         dst_w2_weight.copy_(processed_w2_weight.view(dst_w2_weight.dtype),
                             non_blocking=True)
@@ -1121,16 +1155,16 @@ def load_expert_w3_w1_weight_scale_nvfp4(
         # trtllm-gen specific block scales preprocessing logics
         epilogue_tile_m = 128  # FIXME
 
-        # Get permute indices and chain them together
-        permute0 = get_reorder_rows_for_gated_act_gemm_row_indices(
-            dst_w3_w1_weight_scale)
-        permute1 = get_shuffle_matrix_sf_a_row_indices(
-            dst_w3_w1_weight_scale.view(float4_sf_dtype), epilogue_tile_m, 16)
-        permute = permute0[permute1]
+        # Get permute indices
+        permute_indices = self._maybe_get_cached_w3_w1_permute_indices(
+            dst_w3_w1_weight_scale.view(float4_sf_dtype),
+            epilogue_tile_m,
+            num_elts_per_sf=16)
 
         # Shuffle the weight according to permute indices
         w3_w1_weight_scale = torch.ops.trtllm.shuffle_matrix(
-            dst_w3_w1_weight_scale.view(float4_sf_dtype), permute.cuda())
+            dst_w3_w1_weight_scale.view(float4_sf_dtype), permute_indices)
+
         # Assert should only be removed during debugging
         assert w3_w1_weight_scale.is_cuda, "w3_w1_weight_scale.is_cuda should be true or suffer from slow speed"
         # Interleave the weight.
@@ -1155,13 +1189,26 @@ def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,
 
         # trtllm-gen specific block scales preprocessing logics
         epilogue_tile_m = 128  # FIXME: read from kernel
+
         # Assert should only be removed during debugging
         assert dst_w2_weight_scale.is_cuda, "dst_w2_weight_scale.is_cuda should be true or suffer from slow speed"
-        # Interleave the weight and copy
+
+        # Get permute indices
+        permute_indices = self._maybe_get_cached_w2_permute_indices(
+            dst_w2_weight_scale.view(float4_sf_dtype),
+            epilogue_tile_m,
+            num_elts_per_sf=16)
+
+        # Shuffle the weight according to permute indices
+        w_shuffled = torch.ops.trtllm.shuffle_matrix(
+            dst_w2_weight_scale.view(dtype=float4_sf_dtype), permute_indices)
+        # Interleave the weight.
+        processed_w2_weight_scale = torch.ops.tensorrt_llm.nvfp4_block_scale_interleave(
+            w_shuffled)
+        # Copy the result into device buffer
         dst_w2_weight_scale.copy_(
-            shuffle_matrix_sf_a(
-                dst_w2_weight_scale.view(float4_sf_dtype), epilogue_tile_m,
-                16).view(self.block_scales_dtype).reshape(orig_shape))
+            processed_w2_weight_scale.view(
+                self.block_scales_dtype).reshape(orig_shape))
 
     def load_quant_scales(self, module: torch.nn.Module, weights: Dict):
         super().load_quant_scales(module, weights)
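
Dropping the threads ties into the new cache: if expert weights were still loaded from multiple worker threads, the shared _cache_permute_indices dict would need a critical section around its check-and-insert, which is the overhead the new comment in load_weights calls out. A hypothetical sketch of that protection, for comparison only (not part of this commit; serial loading plus prefetch_files() in model_engine.py makes it unnecessary):

import threading
from typing import Dict

import torch

# Hypothetical: what the shared cache would need if expert loads still ran
# on worker threads. The commit avoids this lock by loading serially.
_cache_lock = threading.Lock()
_cache: Dict[torch.Size, torch.Tensor] = {}


def get_indices_thread_safe(weight: torch.Tensor) -> torch.Tensor:
    with _cache_lock:
        if weight.shape not in _cache:
            # Stand-in for the expensive index computation.
            _cache[weight.shape] = torch.arange(weight.shape[0] - 1, -1, -1)
        return _cache[weight.shape]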

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 3 additions & 9 deletions
@@ -1221,15 +1221,9 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     @skip_pre_blackwell
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend",
-        [
-            (1, 1, 1, True, True, True, "CUTLASS"),
-            # TODO: enable TRTLLM backend
-            # (1, 1, 1, True, True, True, "TRTLLM"),
-        ],
-        ids=[
-            "latency_moe_cutlass",
-            # "latency_moe_trtllm",
-        ],
+        [(1, 1, 1, True, True, True, "CUTLASS"),
+         (1, 1, 1, False, True, True, "TRTLLM")],
+        ids=["latency_moe_cutlass", "latency_moe_trtllm"],
     )
     def test_nvfp4(
         self,
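
The strings passed as ids become the bracketed suffix of the pytest node ID, which is why latency_moe_trtllm appears verbatim in the test lists updated below. A small, self-contained illustration of that naming behavior (not tied to the TensorRT-LLM suite):

import pytest


# Collected as test_moe_backend[latency_moe_cutlass] and
# test_moe_backend[latency_moe_trtllm], mirroring how the QA lists
# reference test_nvfp4[latency_moe_trtllm].
@pytest.mark.parametrize(
    "moe_backend",
    ["CUTLASS", "TRTLLM"],
    ids=["latency_moe_cutlass", "latency_moe_trtllm"],
)
def test_moe_backend(moe_backend):
    assert moe_backend in ("CUTLASS", "TRTLLM")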

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 1 addition & 0 deletions
@@ -472,6 +472,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[throughput_tp
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass]
+accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]

tests/integration/test_lists/qa/llm_sanity_test.txt

Lines changed: 1 addition & 0 deletions
@@ -135,6 +135,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-att
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass]
+accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm]
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 
 # Pivot to Pytorch test cases.

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ l0_b200:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass]
+- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm]
 - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
 - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
 - test_e2e.py::test_ptq_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]

tests/unittest/_torch/thop/test_moe.py

Lines changed: 1 addition & 1 deletion
@@ -705,7 +705,7 @@ def check_accuracy(a, b, atol, rtol, percent):
                 "has_routing_bias": False,
                 "routing_method_type": RoutingMethodType.Qwen3
             },
-            id="Qwen3"),
+            id="RoutingQwen3"),
     ],
 )
 def test_moe_fp4(num_tokens, hidden_size, intermediate_size, routing_info):
