@@ -794,16 +794,19 @@ def load_nvfp4_weights(self, weights: Dict):
         gate_up_bias = module_weights.get('gate_up_proj_bias', None)
         down_bias = module_weights.get('down_proj_bias', None)
 
-        # Optional deinterleave for checkpoints that interleave gate/up
-        if gate_up is not None and gate_up.dim() == 3:
-            try:
-                g, u = gate_up[:, :, ::2], gate_up[:, :, 1::2]
-                gate_up = torch.cat([g, u], dim=-1)
-                if gate_up_bias is not None:
-                    gb, ub = gate_up_bias[:, ::2], gate_up_bias[:, 1::2]
-                    gate_up_bias = torch.cat([gb, ub], dim=-1)
-            except Exception:
-                pass
+        def deinterleave(tensor):
+            g, u = tensor[..., ::2], tensor[..., 1::2]
+            return torch.cat([g, u], dim=-1)
+
+        print("up projection shape before deinterleave:", gate_up.shape)
+        gate_up = deinterleave(gate_up)
+        print("up projection shape after deinterleave:", gate_up.shape)
+
+        print("up projection bias shape before deinterleave:",
+              gate_up_bias.shape)
+        gate_up_bias = deinterleave(gate_up_bias)
+        print("up projection bias shape after deinterleave:",
+              gate_up_bias.shape)
 
         # Only fp32 bias is supported for NVFP4 MoE.
         if gate_up_bias.dtype != torch.float32:
@@ -832,6 +835,13 @@ def load_nvfp4_weights(self, weights: Dict):
         # Per-expert block scales (transpose to expected layout)
         if 'gate_up_proj_weight_scale' in module_weights:
             gu_ws = module_weights['gate_up_proj_weight_scale']
+            print(
+                "up projection weight scale shape before deinterleave:",
+                gu_ws.shape)
+            gu_ws = deinterleave(gu_ws)
+            print(
+                "up projection weight scale shape after deinterleave:",
+                gu_ws.shape)
             moe_weights['gate_up_proj_weight_scale'] = [
                 gu_ws[i, :, :] for i in range(num_expert)
             ]
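
For reference, a minimal standalone sketch of the even/odd deinterleave the new helper performs, assuming a checkpoint that stores gate/up interleaved along the last dimension (even columns = gate, odd = up); the toy tensor and its values below are illustrative, not taken from an actual checkpoint:

import torch

def deinterleave(tensor: torch.Tensor) -> torch.Tensor:
    # Split even/odd positions along the last dim, then concat as [gate | up].
    g, u = tensor[..., ::2], tensor[..., 1::2]
    return torch.cat([g, u], dim=-1)

# Toy example: 1 expert, 2 rows, 4 interleaved columns g0, u0, g1, u1.
t = torch.tensor([[[0., 10., 1., 11.],
                   [2., 12., 3., 13.]]])
print(deinterleave(t))
# tensor([[[ 0.,  1., 10., 11.],
#          [ 2.,  3., 12., 13.]]])

Because the helper slices with `[..., ::2]`, the same function works for the 3-D weights, the 2-D biases, and the block-scale tensors, which is why the patch applies it to all three.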