
Commit 2651521

modify model size for dev

1 parent 47628e4

4 files changed: +22 -6 lines changed

llm/config/deepseek-v2/pretrain_argument.json

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 {
-    "model_name_or_path": "deepseek-ai/DeepSeek-V2-Lite",
-    "tokenizer_name_or_path": "deepseek-ai/DeepSeek-V2-Lite",
+    "model_name_or_path": "deepseek-ai/DeepSeek-V3",
+    "tokenizer_name_or_path": "deepseek-ai/DeepSeek-V3",
     "input_dir": "./data",
     "output_dir": "./checkpoints/pretrain_ckpts",
     "per_device_train_batch_size": 1,

llm/config/deepseek-v2/sft_argument.json

Lines changed: 5 additions & 4 deletions
@@ -1,11 +1,11 @@
 {
-    "model_name_or_path": "deepseek-ai/DeepSeek-V2-Lite",
+    "model_name_or_path": "deepseek-ai/DeepSeek-V3",
     "dataset_name_or_path": "./data",
     "output_dir": "./checkpoints/sft_ckpts",
     "per_device_train_batch_size": 1,
-    "gradient_accumulation_steps": 4,
-    "per_device_eval_batch_size": 8,
-    "eval_accumulation_steps":16,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 1,
+    "eval_accumulation_steps":1,
     "num_train_epochs": 3,
     "learning_rate": 3e-05,
     "warmup_steps": 30,
@@ -27,6 +27,7 @@
     "tensor_parallel_degree": 1,
     "pipeline_parallel_degree": 1,
     "sharding": "stage2",
+    "sharding_parallel_degree": 1,
     "zero_padding": false,
     "unified_checkpoint": true,
     "use_flash_attention": true

llm/run_finetune.py

Lines changed: 7 additions & 0 deletions
@@ -226,6 +226,13 @@ def main():
         )
     else:
         # NOTE(gongenlei): new add autotuner_benchmark
+        # Reduce the layer count here for dev: DeepSeek's first 3 layers are dense, and the sparse (MoE) layers only come after them
+        model_config.num_hidden_layers = 2  # 61 in V3
+        model_config.first_k_dense_replace = 1  # 3 in V3
+        # Reduce the expert count here; for expert-parallel (EP) runs the expert count must be divisible by the parallel degree
+        model_config.n_routed_experts = 16  # 256 in V3
+        model_config.num_experts_per_tok = 4  # 8 in V3
+        model_config.topk_group = 2  # 4 in V3
         model = model_class.from_config(model_config, dtype=dtype)
 
     if model_args.flash_mask and (not data_args.zero_padding or not model.config.use_flash_attention):
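
The expert-parallelism comment above is easy to verify: with n_routed_experts reduced to 16, the experts still shard evenly across the usual EP degrees. A quick sketch of that divisibility check (the candidate degrees are illustrative, not from this commit):

n_routed_experts = 16  # dev value set above; 256 in V3
for ep_degree in (1, 2, 4, 8, 16):
    # Each EP rank must own an equal, whole number of experts.
    assert n_routed_experts % ep_degree == 0
    print(ep_degree, "->", n_routed_experts // ep_degree, "experts per rank")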

llm/run_pretrain.py

Lines changed: 8 additions & 0 deletions
@@ -496,6 +496,14 @@ def main():
             dtype=dtype,
         )
     else:
+        # Reduce the layer count here for dev: DeepSeek's first 3 layers are dense, and the sparse (MoE) layers only come after them
+        config.num_hidden_layers = 2  # 61 in V3
+        config.first_k_dense_replace = 1  # 3 in V3
+        # Reduce the expert count here; for expert-parallel (EP) runs the expert count must be divisible by the parallel degree
+        config.n_routed_experts = 16  # 256 in V3
+        config.num_experts_per_tok = 4  # 8 in V3
+        config.topk_group = 2  # 4 in V3
+
         model = model_class.from_config(config, dtype=dtype)
 
     if training_args.recompute:
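
Reducing topk_group alongside the expert count keeps DeepSeek's group-limited routing consistent. A sketch of that constraint, assuming the default n_group of 8 is left untouched (this commit does not set it) and that the router picks num_experts_per_tok experts from within the topk_group selected groups:

n_routed_experts = 16    # set above; 256 in V3
num_experts_per_tok = 4  # set above; 8 in V3
topk_group = 2           # set above; 4 in V3
n_group = 8              # assumed V3 default, unchanged by this commit

experts_per_group = n_routed_experts // n_group     # 16 / 8 = 2
candidate_experts = topk_group * experts_per_group  # 2 * 2 = 4
# The selected groups must expose at least num_experts_per_tok candidate experts.
assert candidate_experts >= num_experts_per_tok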
