File tree Expand file tree Collapse file tree 4 files changed +22
-6
lines changed
Expand file tree Collapse file tree 4 files changed +22
-6
lines changed Original file line number Diff line number Diff line change 11{
2- "model_name_or_path" : " deepseek-ai/DeepSeek-V2-Lite " ,
3- "tokenizer_name_or_path" : " deepseek-ai/DeepSeek-V2-Lite " ,
2+ "model_name_or_path" : " deepseek-ai/DeepSeek-V3 " ,
3+ "tokenizer_name_or_path" : " deepseek-ai/DeepSeek-V3 " ,
44 "input_dir" : " ./data" ,
55 "output_dir" : " ./checkpoints/pretrain_ckpts" ,
66 "per_device_train_batch_size" : 1 ,
Original file line number Diff line number Diff line change 11{
2- "model_name_or_path" : " deepseek-ai/DeepSeek-V2-Lite " ,
2+ "model_name_or_path" : " deepseek-ai/DeepSeek-V3 " ,
33 "dataset_name_or_path" : " ./data" ,
44 "output_dir" : " ./checkpoints/sft_ckpts" ,
55 "per_device_train_batch_size" : 1 ,
6- "gradient_accumulation_steps" : 4 ,
7- "per_device_eval_batch_size" : 8 ,
8- "eval_accumulation_steps" :16 ,
6+ "gradient_accumulation_steps" : 1 ,
7+ "per_device_eval_batch_size" : 1 ,
8+ "eval_accumulation_steps" :1 ,
99 "num_train_epochs" : 3 ,
1010 "learning_rate" : 3e-05 ,
1111 "warmup_steps" : 30 ,
2727 "tensor_parallel_degree" : 1 ,
2828 "pipeline_parallel_degree" : 1 ,
2929 "sharding" : " stage2" ,
30+ "sharding_parallel_degree" : 1 ,
3031 "zero_padding" : false ,
3132 "unified_checkpoint" : true ,
3233 "use_flash_attention" : true
Original file line number Diff line number Diff line change @@ -226,6 +226,13 @@ def main():
226226 )
227227 else :
228228 # NOTE(gongenlei): new add autotuner_benchmark
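229+ # Reduce the number of model layers here; in DeepSeek the first 3 layers are dense, and sparse layers only appear after them
230+ model_config .num_hidden_layers = 2 # v3 uses 61
231+ model_config .first_k_dense_replace = 1 # v3 uses 3
232+ # Reduce the number of experts here; to use expert parallelism (EP), the number of experts must be divisible by the parallel degree
233+ model_config .n_routed_experts = 16 # v3 uses 256
234+ model_config .num_experts_per_tok = 4 # v3 uses 8
235+ model_config .topk_group = 2 # v3 uses 4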
229236 model = model_class .from_config (model_config , dtype = dtype )
230237
231238 if model_args .flash_mask and (not data_args .zero_padding or not model .config .use_flash_attention ):
Original file line number Diff line number Diff line change @@ -496,6 +496,14 @@ def main():
496496 dtype = dtype ,
497497 )
498498 else :
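499+ # Reduce the number of model layers here; in DeepSeek the first 3 layers are dense, and sparse layers only appear after them
500+ config .num_hidden_layers = 2 # v3 uses 61
501+ config .first_k_dense_replace = 1 # v3 uses 3
502+ # Reduce the number of experts here; to use expert parallelism (EP), the number of experts must be divisible by the parallel degree
503+ config .n_routed_experts = 16 # v3 uses 256
504+ config .num_experts_per_tok = 4 # v3 uses 8
505+ config .topk_group = 2 # v3 uses 4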
506+
499507 model = model_class .from_config (config , dtype = dtype )
500508
501509 if training_args .recompute :
You can’t perform that action at this time.
0 commit comments