pytorch
diff --git a/torchtitan/components/checkpoint.py b/torchtitan/components/checkpoint.py
Lines changed: 10 additions & 0 deletions
diff --git a/torchtitan/models/deepseek_v3/infra/parallelize.py b/torchtitan/models/deepseek_v3/infra/parallelize.py
Lines changed: 1 addition & 0 deletions
@@ -418,6 +418,16 @@ def dcp_load(
             )
 
             state_dict = self.sd_adapter.from_hf(hf_state_dict)
+
+            # [rank0]:after sd converter, placement is DeviceMesh((dp_shard_mod_ep=2, dp_shard_in_ep=2, tp=2), device: 'cuda', stride: (4, 2, 1))
+            print(
+                f"after sd converter, placement is {state_dict['layers.3.moe.experts.w3'].device_mesh}, type {type(state_dict['layers.3.moe.experts.w3'])}, placement {state_dict['layers.3.moe.experts.w3'].placements}"
+            )
+
+            # [rank0]:after sd converter, model placement is DeviceMesh((dp_shard_mod_ep=2, ep=2, tp=2), device: 'cuda', stride: (4, 2, 1))
+            # model_state_dict = self.states[MODEL].state_dict()
+            # print(f"after sd converter, model placement is {model_state_dict['layers.3.moe.experts.w3'].device_mesh}")
+
             self.states[MODEL].load_state_dict(state_dict)
         else:
             dcp.load(state_dict, checkpoint_id=checkpoint_id)
 
@@ -36,6 +36,7 @@ def parallelize_deepseekv3(
     job_config: JobConfig,
 ):
     world_mesh = parallel_dims.world_mesh
+    print(f"In parallelize_deepseekv3, world mesh is {world_mesh}")
     # TODO: TP currently cannot handle uneven seq_len because we set
     #       `use_local_output=True` to use plain Tensors for legacy reasons.
     #       Need to revisit this.