Skip to content

Commit 7e331ab

Browse files
committed
clean up
1 parent b4d614d commit 7e331ab

File tree

11 files changed

+106
-335
lines changed

11 files changed

+106
-335
lines changed

scripts/checkpoint_conversion/convert_to_hf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def convert_to_hf(input_dir, output_dir, model_name, model_flavor, hf_assets_pat
4444
storage_writer = HuggingFaceStorageWriter(
4545
path=output_dir,
4646
save_distributed=True,
47-
fqn_to_index_mapping=sd_adapter.fqn_to_index_mapping,
47+
fqn_to_index_mapping=None,
4848
enable_consolidation=True,
4949
thread_count_consolidation=5,
5050
)

torchtitan/components/checkpoint.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ def dcp_load(
418418
)
419419

420420
state_dict = self.sd_adapter.from_hf(hf_state_dict)
421-
421+
422422
self.states[MODEL].load_state_dict(state_dict)
423423
else:
424424
dcp.load(state_dict, checkpoint_id=checkpoint_id)

torchtitan/models/deepseek_v3/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
route_norm=True,
4747
score_before_experts=False,
4848
),
49-
q_lora_rank=0,
49+
q_lora_rank=256, # for test, original is 0
5050
kv_lora_rank=512,
5151
qk_nope_head_dim=128,
5252
qk_rope_head_dim=64,

torchtitan/models/deepseek_v3/hf_implementation.py

Lines changed: 0 additions & 177 deletions
This file was deleted.

torchtitan/models/deepseek_v3/infra/parallelize.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ def parallelize_deepseekv3(
3636
job_config: JobConfig,
3737
):
3838
world_mesh = parallel_dims.world_mesh
39-
print(f"In parallelize_deepseekv3, world mesh is {world_mesh}")
4039
# TODO: TP currently cannot handle uneven seq_len because we set
4140
# `use_local_output=True` to use plain Tensors for legacy reasons.
4241
# Need to revisit this.

torchtitan/models/deepseek_v3/model/model.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from torch import nn
1212

1313
from torchtitan.models.attention import build_attention
14-
from torchtitan.models.moe import FeedForward, MoE, print_tensor_stats
14+
from torchtitan.models.moe import FeedForward, MoE
1515
from torchtitan.protocols.train_spec import ModelProtocol
1616

1717
from .args import DeepSeekV3ModelArgs
@@ -295,12 +295,9 @@ def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor):
295295
Returns:
296296
torch.Tensor: Output tensor with the same shape as the input.
297297
"""
298-
print_tensor_stats(f"input of TransformerBlock {self.layer_id}: ", x)
299298
x = x + self.attention(self.attention_norm(x), freqs_cis)
300299
if self.moe_enabled:
301-
x = self.ffn_norm(x)
302-
print_tensor_stats(f"After ffn_norm : ", x)
303-
x = x + self.moe(x)
300+
x = x + self.moe(self.ffn_norm(x))
304301
else:
305302
x = x + self.feed_forward(self.ffn_norm(x))
306303
return x
@@ -388,11 +385,8 @@ def forward(
388385

389386
h = self.tok_embeddings(tokens) if self.tok_embeddings is not None else tokens
390387

391-
392-
token_inputs = h
393388
for layer in self.layers.values():
394-
# reset before each layer
395-
h = layer(token_inputs, self.freqs_cis)
389+
h = layer(h, self.freqs_cis)
396390
h = self.norm(h) if self.norm is not None else h
397391
output = self.output(h) if self.output is not None else h
398392
return output

0 commit comments

Comments (0)