
Commit 85a7043: "further clean up"

Parent: 7e331ab

5 files changed: +10, -11 lines


scripts/checkpoint_conversion/convert_to_hf.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -44,7 +44,7 @@ def convert_to_hf(input_dir, output_dir, model_name, model_flavor, hf_assets_pat
     storage_writer = HuggingFaceStorageWriter(
         path=output_dir,
         save_distributed=True,
-        fqn_to_index_mapping=None,
+        fqn_to_index_mapping=sd_adapter.fqn_to_index_mapping,
         enable_consolidation=True,
         thread_count_consolidation=5,
     )
```
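Passing the adapter's fqn_to_index_mapping (rather than None) lets the writer place each tensor in the same safetensors shard as the reference HuggingFace checkpoint. As a minimal sketch of where such a mapping can come from (illustrative only, not torchtitan's adapter code), assuming the standard model.safetensors.index.json layout and the conventional model-XXXXX-of-YYYYY.safetensors shard names:

```python
# Illustrative sketch: derive an fqn -> shard-index mapping from a
# HuggingFace model.safetensors.index.json. Assumes the conventional
# "model-XXXXX-of-YYYYY.safetensors" shard naming.
import json
import re

def build_fqn_to_index_mapping(index_json_path: str) -> dict[str, int]:
    with open(index_json_path) as f:
        weight_map = json.load(f)["weight_map"]
    # e.g. "model-00007-of-00055.safetensors" -> 7
    return {
        fqn: int(re.search(r"-(\d+)-of-", filename).group(1))
        for fqn, filename in weight_map.items()
    }
```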

torchtitan/components/checkpoint.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -418,7 +418,6 @@ def dcp_load(
             )
 
             state_dict = self.sd_adapter.from_hf(hf_state_dict)
-
             self.states[MODEL].load_state_dict(state_dict)
         else:
             dcp.load(state_dict, checkpoint_id=checkpoint_id)
```
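The dropped blank line is purely cosmetic; the surrounding logic converts a HuggingFace state dict to the model's native format before loading. A minimal sketch of what a from_hf-style hook typically does, with a hypothetical key table (this is not torchtitan's actual StateDictAdapter):

```python
# Sketch of a from_hf() hook: rename HuggingFace checkpoint keys to the
# model's native fully-qualified names before load_state_dict().
import torch

HF_TO_NATIVE = {  # hypothetical key mapping, for illustration only
    "model.embed_tokens.weight": "tok_embeddings.weight",
    "lm_head.weight": "output.weight",
}

def from_hf(hf_state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    return {HF_TO_NATIVE.get(k, k): v for k, v in hf_state_dict.items()}
```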

torchtitan/models/deepseek_v3/__init__.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -46,7 +46,7 @@
         route_norm=True,
         score_before_experts=False,
     ),
-    q_lora_rank=256,  # for test, original is 0
+    q_lora_rank=0,
     kv_lora_rank=512,
     qk_nope_head_dim=128,
     qk_rope_head_dim=64,
@@ -135,7 +135,7 @@
     dim=7168,
     inter_dim=18432,
     moe_inter_dim=2048,
-    n_layers=4,
+    n_layers=61,
     n_dense_layers=3,
     n_heads=128,
     moe_args=MoEArgs(
```
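q_lora_rank = 0 restores the flavor's intended multi-head latent attention setup: with a rank of 0, queries use a full projection instead of a low-rank factorization, and n_layers = 61 is the full DeepSeek-V3 depth rather than the 4-layer test value. A sketch of how the rank toggle is commonly interpreted in MLA implementations (modeled on the public DeepSeek-V3 reference code; names are illustrative, not torchtitan's exact attention module):

```python
# Illustrative MLA query-projection toggle; requires a recent PyTorch
# for nn.RMSNorm. qk_head_dim = qk_nope_head_dim + qk_rope_head_dim.
import torch.nn as nn

dim, n_heads, qk_head_dim, q_lora_rank = 7168, 128, 128 + 64, 0

if q_lora_rank == 0:
    # full-rank query projection
    wq = nn.Linear(dim, n_heads * qk_head_dim, bias=False)
else:
    # low-rank factorization: dim -> q_lora_rank -> n_heads * qk_head_dim
    wq_a = nn.Linear(dim, q_lora_rank, bias=False)
    q_norm = nn.RMSNorm(q_lora_rank)
    wq_b = nn.Linear(q_lora_rank, n_heads * qk_head_dim, bias=False)
```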

torchtitan/models/deepseek_v3/train_configs/debug_model.toml

Lines changed: 3 additions & 3 deletions
```diff
@@ -47,13 +47,13 @@ dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M)
 data_parallel_replicate_degree = 1
 data_parallel_shard_degree = -1
 fsdp_reshard_after_forward = "default" # default / never / always
-tensor_parallel_degree = 4
+tensor_parallel_degree = 1
 enable_async_tensor_parallel = false
 pipeline_parallel_degree = 1
 pipeline_parallel_schedule = "1F1B"
 context_parallel_degree = 1
-expert_parallel_degree = 2
-expert_tensor_parallel_degree = 4
+expert_parallel_degree = 1
+expert_tensor_parallel_degree = 1
 
 [checkpoint]
 enable = false
```
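With every explicit degree set to 1 and data_parallel_shard_degree = -1, the debug config no longer assumes a multi-GPU topology: FSDP simply absorbs whatever ranks are available. A rough consistency check for how the degrees compose (an illustrative sketch; torchtitan performs its own validation):

```python
# The product of all explicit parallelism degrees must divide the world
# size; data_parallel_shard_degree = -1 means "absorb whatever is left".
world_size = 8  # e.g. one 8-GPU node
tp, pp, cp, dp_replicate = 1, 1, 1, 1
dp_shard = world_size // (tp * pp * cp * dp_replicate)  # what -1 resolves to
assert dp_shard * tp * pp * cp * dp_replicate == world_size
print(f"FSDP shard degree resolves to {dp_shard}")
```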

torchtitan/models/deepseek_v3/train_configs/deepseek_v3_671b.toml

Lines changed: 4 additions & 4 deletions
```diff
@@ -35,10 +35,10 @@ decay_type = "cosine"
 min_lr_factor = 0.1
 
 [training]
-local_batch_size = 2
-seq_len = 2048
+local_batch_size = 4
+seq_len = 4096
 max_norm = 1.0 # grad norm clipping
-steps = 10
+steps = 10_000
 compile = false
 dataset = "c4" # supported datasets: c4_test (2K), c4 (177M)
 
@@ -56,7 +56,7 @@ expert_tensor_parallel_degree = 1
 [checkpoint]
 enable = false
 folder = "checkpoint"
-interval = 10
+interval = 500
 last_save_model_only = true
 export_dtype = "float32"
 async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem]"
```
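The new training settings double both the per-rank batch size and the sequence length (4x the tokens per step per rank) and extend the run from a 10-step smoke test to 10,000 steps, with checkpoints every 500 steps instead of every 10. Back-of-the-envelope token counts under these settings (per rank; global throughput also depends on the data-parallel degree):

```python
# Tokens processed per rank under the updated 671B config.
local_batch_size, seq_len, steps = 4, 4096, 10_000
tokens_per_rank_per_step = local_batch_size * seq_len  # 16,384
print(f"{tokens_per_rank_per_step:,} tokens/rank/step")
print(f"{tokens_per_rank_per_step * steps:,} tokens/rank over the run")
```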
