
Commit 521185a

save initial changes for hack
1 parent e5c2de4 commit 521185a

2 files changed: +42 additions, -16 deletions


tensorrt_llm/_torch/modules/attention.py

Lines changed: 11 additions & 9 deletions
@@ -899,23 +899,25 @@ def __init__(
             requires_grad=False,
         )
 
-        # Compute the correct rank for the combined TP*CP mapping.
-        # The attention heads are split first by TP, then by CP within each TP group.
-        # Original rank order: pp_rank * tp_size * cp_size + cp_rank * tp_size + tp_rank
-        # For o_proj, we need: pp_rank * tp_size * cp_size + tp_rank * cp_size + cp_rank
-        # This ensures weight slices align with the actual head partitions.
-        new_rank_for_o = (self.mapping.pp_rank * tp_size * cp_size +
-                          self.mapping.tp_rank * cp_size + self.mapping.cp_rank)
-        print(f"[MLA::create_weights][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}][tp_rank {self.mapping.tp_rank}]: new_rank_for_o: {new_rank_for_o}")
+        # # Compute the correct rank for the combined TP*CP mapping.
+        # # The attention heads are split first by TP, then by CP within each TP group.
+        # # Original rank order: pp_rank * tp_size * cp_size + cp_rank * tp_size + tp_rank
+        # # For o_proj, we need: pp_rank * tp_size * cp_size + tp_rank * cp_size + cp_rank
+        # # This ensures weight slices align with the actual head partitions.
+        # new_rank_for_o = (self.mapping.pp_rank * tp_size * cp_size +
+        #                   self.mapping.tp_rank * cp_size + self.mapping.cp_rank)
+        # print(f"[MLA::create_weights][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}][tp_rank {self.mapping.tp_rank}]: new_rank_for_o: {new_rank_for_o}")
         mapping_o = Mapping(
             world_size=tp_size * pp_size * cp_size,
             tp_size=tp_size * cp_size,
             pp_size=pp_size,
             cp_size=1,
-            rank=new_rank_for_o,
+            rank=self.mapping.rank,
             gpus_per_node=self.mapping.gpus_per_node,
             enable_attention_dp=self.mapping.enable_attention_dp,
         )
+        # TODO: Update this for all layers.
+        weight_name = "o_proj_with_cp" if self.mapping.has_cp_helix() and self.layer_idx == 0 else None
         self.o_proj = Linear(
             self.num_key_value_heads * self.v_head_dim,
             self.hidden_size,
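Note: the commented-out block above encodes a rank reordering (heads split first by TP, then by CP within each TP group), which this hunk now defers to the weight_name plumbing in linear.py below. A minimal sketch of what that reordering computes, using a hypothetical helper name, not part of the commit:

    # Hypothetical helper illustrating the rank reordering described in the
    # commented-out block; not part of the commit.
    def remap_rank_for_o(pp_rank: int, tp_rank: int, cp_rank: int,
                         tp_size: int, cp_size: int) -> int:
        # Original rank order: pp_rank * tp_size * cp_size + cp_rank * tp_size + tp_rank
        # o_proj rank order:   pp_rank * tp_size * cp_size + tp_rank * cp_size + cp_rank
        return pp_rank * tp_size * cp_size + tp_rank * cp_size + cp_rank

    # Example with tp_size=2, cp_size=2, pp_rank=0: the ranks holding
    # (cp_rank, tp_rank) = (0, 1) and (1, 0) are global ranks 1 and 2 in the
    # original order, and they map to o_proj ranks 2 and 1 respectively,
    # i.e. ranks 1 and 2 swap. This is what the "SWAP RANK 1 / RANK 2" prints
    # in linear.py below point at.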

tensorrt_llm/_torch/modules/linear.py

Lines changed: 31 additions & 7 deletions
@@ -68,7 +68,10 @@ def load_weight_shard(
     tensor_parallel_mode: Optional[TensorParallelMode] = None,
     device: torch.device = torch.device('cpu'),
     return_slice_indices: bool = False,
+    weight_name: Optional[str] = None,
 ) -> torch.Tensor:
+    if weight_name is not None:
+        print(f"[load_weight_shard] weight_name: {weight_name}")
     # Skip device transfers on integrated GPUs to conserve shared memory
     if weight.device.type != device.type and is_device_integrated():
         # For integrated GPU systems (e.g., DGX Spark), CPU and GPU share limited physical memory.
@@ -112,6 +115,10 @@ def maybe_convert_to_torch_tensor(
     if width == 1:
         return maybe_convert_to_torch_tensor(weight)
 
+    if weight_name is not None and tensor_parallel_rank == 1:
+        print(f"[load_weight_shard] THIS IS WHERE YOU SWAP RANK 1.")
+    if weight_name is not None and tensor_parallel_rank == 2:
+        print(f"[load_weight_shard] THIS IS WHERE YOU SWAP RANK 2.")
     slice_width = math.ceil(width / tensor_parallel_size)
     slice_start = tensor_parallel_rank * slice_width
     slice_end = min((tensor_parallel_rank + 1) * slice_width, width)
@@ -140,7 +147,10 @@ def load_weights_vanilla_helper(module: Linear,
                                 weights: List[Dict],
                                 weight_transform=lambda x: x,
                                 bias_transform=lambda x: x,
-                                allow_partial_loading: bool = False):
+                                allow_partial_loading: bool = False,
+                                weight_name: Optional[str] = None):
+    if weight_name is not None:
+        print(f"[load_weights_vanilla_helper] weight_name: {weight_name}")
     assert len(weights) == 1
     if not allow_partial_loading:
         assert "weight" in weights[0]
@@ -150,7 +160,7 @@ def load_weights_vanilla_helper(module: Linear,
 
     weight = load_weight_shard(weights[0]['weight'], module.tp_size,
                                module.tp_rank, module.tp_mode,
-                               device) if "weight" in weights[0] else None
+                               device, weight_name=weight_name) if "weight" in weights[0] else None
 
     if weight is not None:
         if module.has_weight_only_quant:
@@ -167,7 +177,7 @@ def load_weights_vanilla_helper(module: Linear,
     if module.bias is not None:
         bias = load_weight_shard(weights[0]['bias'], module.tp_size,
                                  module.tp_rank, module.tp_mode,
-                                 device) if "bias" in weights[0] else None
+                                 device, weight_name=weight_name) if "bias" in weights[0] else None
         if bias is not None:
             copy_weight(module.bias, bias_transform(bias))
 
@@ -311,7 +321,8 @@ def load_weights(self,
                      module: Linear,
                      weights: List[Dict],
                      weight_mode: WeightMode,
-                     allow_partial_loading: bool = False):
+                     allow_partial_loading: bool = False,
+                     weight_name: Optional[str] = None):
         """
         Load weights from the checkpoint.
         """
@@ -396,10 +407,14 @@ def apply(self, module: Linear, input: torch.Tensor,
     def load_weights_vanilla(self,
                              module: Linear,
                              weights: List[Dict],
-                             allow_partial_loading: bool = False) -> None:
+                             allow_partial_loading: bool = False,
+                             weight_name: Optional[str] = None) -> None:
+        if weight_name is not None:
+            print(f"[UnquantizedLinearMethod::load_weights_vanilla] weight_name: {weight_name}")
         load_weights_vanilla_helper(module,
                                     weights,
-                                    allow_partial_loading=allow_partial_loading)
+                                    allow_partial_loading=allow_partial_loading,
+                                    weight_name=weight_name)
 
     def load_weights_fused_qkv_linear(
         self,
@@ -2058,6 +2073,8 @@ def __init__(
         disable_deep_gemm: bool = False,
         fused_weight_shard_indices_mapping: Optional[dict] = None,
         nvfp4_allowed_backends: Optional[List[str]] = None,
+        weight_name: Optional[str] = None,
+        mapping_with_cp: Optional[Mapping] = None,
     ):
         """
         Args:
@@ -2098,6 +2115,12 @@ def __init__(
             'cutlass', 'cublaslt', 'cuda_core'
         ]
 
+        if mapping_with_cp is not None and weight_name == "o_proj":
+            print("[Linear::__init__] Found o_proj with CP mapping. Setting weight_name to o_proj_with_cp.")
+            self.weight_name = "o_proj_with_cp"
+        else:
+            self.weight_name = None
+
         local_in_features = in_features
         local_out_features = out_features
 
@@ -2284,7 +2307,8 @@ def load_weights(self,
             self,
             weights,
             weight_mode,
-            allow_partial_loading=allow_partial_loading)
+            allow_partial_loading=allow_partial_loading,
+            weight_name=self.weight_name)
 
     def post_load_weights(self):
         self.quant_method.post_load_weights(self)
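For reference, the sharding these hooks wrap is the even TP split shown in the second hunk. A small sketch of that arithmetic, plus where the rank swap hinted at by the debug prints could plug in; the swap itself is an assumption and is not implemented in this commit:

    import math
    from typing import Optional

    def shard_slice(width: int, tp_size: int, tp_rank: int) -> tuple:
        # Same arithmetic as load_weight_shard: ceil-divide the sharded width,
        # then clamp the last shard so trailing ranks do not run past the end.
        slice_width = math.ceil(width / tp_size)
        slice_start = tp_rank * slice_width
        slice_end = min((tp_rank + 1) * slice_width, width)
        return slice_start, slice_end

    def effective_rank(tp_rank: int, weight_name: Optional[str]) -> int:
        # Hypothetical version of the swap the "RANK 1 / RANK 2" prints mark:
        # for the o_proj_with_cp case (tp_size * cp_size == 4 assumed), load
        # rank 1's shard on rank 2 and vice versa before slicing.
        if weight_name == "o_proj_with_cp" and tp_rank in (1, 2):
            return 3 - tp_rank
        return tp_rank

    # Example: width=10, tp_size=4 gives shards [0:3], [3:6], [6:9], [9:10].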
