File tree Expand file tree Collapse file tree 1 file changed +7
-6
lines changed
tensorrt_llm/_torch/auto_deploy/models/patches Expand file tree Collapse file tree 1 file changed +7
-6
lines changed Original file line number Diff line number Diff line change @@ -93,14 +93,15 @@ def _nemotron_h_topk_router_forward(self, hidden_states):
9393 Forward pass for NemotronHTopkRouter using the optimized noaux_tc_op kernel.
9494
9595 This replaces the original forward method which used pure PyTorch operations
96- with a fused CUDA kernel that performs:
97- 1. Sigmoid activation of logits
98- 2. Group-based expert selection
99- 3. Top-k selection within selected groups
100- 4. Normalized weight computation
96+ with optimized CUDA kernels.
10197 """
10298 hidden_states = hidden_states .view (- 1 , self .config .hidden_size )
103- router_logits = F .linear (hidden_states .type (torch .float32 ), self .weight .type (torch .float32 ))
99+ if self .weight .dtype == torch .float32 :
100+ router_logits = F .linear (hidden_states .type (torch .float32 ), self .weight )
101+ else :
102+ router_logits = torch .ops .trtllm .dsv3_router_gemm_op (
103+ hidden_states , self .weight .t (), bias = None , out_dtype = torch .float32
104+ )
104105
105106 # Use the fused noaux_tc_op kernel which applies sigmoid internally
106107 # and performs group-based top-k selection with normalization
You can’t perform that action at this time.
0 commit comments