Skip to content

Commit 7e331ab

Browse files
committed
clean up
1 parent b4d614d commit 7e331ab

File tree

11 files changed

+106
-335
lines changed

11 files changed

+106
-335
lines changed

scripts/checkpoint_conversion/convert_to_hf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def convert_to_hf(input_dir, output_dir, model_name, model_flavor, hf_assets_pat
4444
storage_writer = HuggingFaceStorageWriter(
4545
path=output_dir,
4646
save_distributed=True,
47-
fqn_to_index_mapping=sd_adapter.fqn_to_index_mapping,
47+
fqn_to_index_mapping=None,
4848
enable_consolidation=True,
4949
thread_count_consolidation=5,
5050
)

torchtitan/components/checkpoint.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ def dcp_load(
418418
)
419419

420420
state_dict = self.sd_adapter.from_hf(hf_state_dict)
421-
421+
422422
self.states[MODEL].load_state_dict(state_dict)
423423
else:
424424
dcp.load(state_dict, checkpoint_id=checkpoint_id)

torchtitan/models/deepseek_v3/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
route_norm=True,
4747
score_before_experts=False,
4848
),
49-
q_lora_rank=0,
49+
q_lora_rank=256, # for test, original is 0
5050
kv_lora_rank=512,
5151
qk_nope_head_dim=128,
5252
qk_rope_head_dim=64,

torchtitan/models/deepseek_v3/hf_implementation.py

Lines changed: 0 additions & 177 deletions
This file was deleted.

torchtitan/models/deepseek_v3/infra/parallelize.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ def parallelize_deepseekv3(
3636
job_config: JobConfig,
3737
):
3838
world_mesh = parallel_dims.world_mesh
39-
print(f"In parallelize_deepseekv3, world mesh is {world_mesh}")
4039
# TODO: TP currently cannot handle uneven seq_len because we set
4140
# `use_local_output=True` to use plain Tensors for legacy reasons.
4241
# Need to revisit this.

torchtitan/models/deepseek_v3/model/model.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from torch import nn
1212

1313
from torchtitan.models.attention import build_attention
14-
from torchtitan.models.moe import FeedForward, MoE, print_tensor_stats
14+
from torchtitan.models.moe import FeedForward, MoE
1515
from torchtitan.protocols.train_spec import ModelProtocol
1616

1717
from .args import DeepSeekV3ModelArgs
@@ -295,12 +295,9 @@ def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor):
295295
Returns:
296296
torch.Tensor: Output tensor with the same shape as the input.
297297
"""
298-
print_tensor_stats(f"input of TransformerBlock {self.layer_id}: ", x)
299298
x = x + self.attention(self.attention_norm(x), freqs_cis)
300299
if self.moe_enabled:
301-
x = self.ffn_norm(x)
302-
print_tensor_stats(f"After ffn_norm : ", x)
303-
x = x + self.moe(x)
300+
x = x + self.moe(self.ffn_norm(x))
304301
else:
305302
x = x + self.feed_forward(self.ffn_norm(x))
306303
return x
@@ -388,11 +385,8 @@ def forward(
388385

389386
h = self.tok_embeddings(tokens) if self.tok_embeddings is not None else tokens
390387

391-
392-
token_inputs = h
393388
for layer in self.layers.values():
394-
# reset before each layer
395-
h = layer(token_inputs, self.freqs_cis)
389+
h = layer(h, self.freqs_cis)
396390
h = self.norm(h) if self.norm is not None else h
397391
output = self.output(h) if self.output is not None else h
398392
return output

0 commit comments

Comments (0)