Skip to content

Commit 6528e9d

Browse files
committed
fix loading error
1 parent e7f8607 commit 6528e9d

File tree

2 files changed

+8
-60
lines changed

2 files changed

+8
-60
lines changed

torchtitan/components/checkpoint.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -418,16 +418,7 @@ def dcp_load(
418418
)
419419

420420
state_dict = self.sd_adapter.from_hf(hf_state_dict)
421-
422-
# [rank0]:after sd converter, placement is DeviceMesh((dp_shard_mod_ep=2, dp_shard_in_ep=2, tp=2), device: 'cuda', stride: (4, 2, 1))
423-
print(
424-
f"after sd converter, placement is {state_dict['layers.3.moe.experts.w3'].device_mesh}, type {type(state_dict['layers.3.moe.experts.w3'])}, placement {state_dict['layers.3.moe.experts.w3'].placements}"
425-
)
426-
427-
# [rank0]:after sd converter, model placement is DeviceMesh((dp_shard_mod_ep=2, ep=2, tp=2), device: 'cuda', stride: (4, 2, 1))
428-
# model_state_dict = self.states[MODEL].state_dict()
429-
# print(f"after sd converter, model placement is {model_state_dict['layers.3.moe.experts.w3'].device_mesh}")
430-
421+
431422
self.states[MODEL].load_state_dict(state_dict)
432423
else:
433424
dcp.load(state_dict, checkpoint_id=checkpoint_id)

torchtitan/models/deepseek_v3/model/state_dict_adapter.py

Lines changed: 7 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77

88
import re
9+
from threading import local
910
from typing import Any, Dict
1011

1112
import torch
@@ -276,54 +277,15 @@ def _get_local_experts_weights(
276277
local_expert_tensors[expert_key] = expert_dtensor
277278

278279
return local_expert_tensors
279-
280-
def _chunk_local_expert_weights(
281-
self,
282-
local_tensor: torch.Tensor,
283-
dtensor_placements: tuple,
284-
dtensor_shape: tuple,
285-
device_mesh: DeviceMesh,
286-
):
287-
"""
288-
Chunk the local individual experts weight, assemble back to GroupedExperts weights DTensor.
289-
290-
This method is a placeholder for future implementation of expert weight concatenation.
291-
292-
Args:
293-
local_tensor: Concatenated local individual expert weights
294-
"""
295-
296-
# Calculate the index range on dim-i to chunk
297-
for i in range(1, len(dtensor_placements)):
298-
dim_size = dtensor_shape[i]
299-
start_index, end_index = self._caculate_indices_from_placements(
300-
dim=i,
301-
dim_size=dim_size,
302-
dtensor_placements=dtensor_placements,
303-
device_mesh=device_mesh,
304-
)
305-
# No need to chunk on current dimension
306-
if start_index is None or end_index is None:
307-
continue
308-
309-
# Chunk local_tensor on dim-i
310-
local_tensor = local_tensor.narrow(i, start_index, end_index - start_index)
311-
312-
# Assemble DTensor
313-
grouped_expert_weights = DTensor.from_local(
314-
local_tensor, device_mesh, dtensor_placements, run_check=False
315-
)
316-
317-
return grouped_expert_weights
318-
280+
319281
def _concatenate_local_expert_weights(
320282
self,
321283
expert_weights_by_layer: dict[str, Any],
322284
abstract_key: str,
323285
device_mesh: DeviceMesh,
324286
) -> torch.Tensor:
325287
"""
326-
Concatenate the weights of separate experts into GroupedExperts weights.
288+
Try to concatenate the weights of separate experts into GroupedExperts weights.
327289
"""
328290
for layer in expert_weights_by_layer.keys():
329291
# If we have all the experts for this abstract_key, concatenate them
@@ -335,20 +297,15 @@ def _concatenate_local_expert_weights(
335297
if len(experts) == expected_n_experts:
336298
sorted_expert_ids = sorted(experts.keys())
337299
sorted_experts = [experts[i] for i in sorted_expert_ids]
338-
local_tensor = torch.stack(sorted_experts, dim=0)
339-
300+
local_tensor = torch.stack(sorted_experts, dim=0)._local_tensor
301+
340302
assert (
341303
abstract_key in self.grouped_expert_weight_placements
342304
and abstract_key in self.grouped_expert_weight_shape
343305
), f"GroupedExperts weight metadata {self.grouped_expert_weight_placements} {self.grouped_expert_weight_shape} can not be None!"
344306

345-
stacked_dtensor = self._chunk_local_expert_weights(
346-
local_tensor,
347-
dtensor_placements=self.grouped_expert_weight_placements[
348-
abstract_key
349-
],
350-
dtensor_shape=self.grouped_expert_weight_shape[abstract_key],
351-
device_mesh=device_mesh,
307+
stacked_dtensor = DTensor.from_local(
308+
local_tensor, device_mesh, self.grouped_expert_weight_placements[abstract_key], run_check=False
352309
)
353310

354311
# Remove these experts from the tracking dict to free memory

0 commit comments

Comments (0)