
Commit 879a853

[AutoConfig] add benchmark scripts (PaddlePaddle#7897)
* add auto_tuner
* fix
* update log_file
* update json
* close eval/predict
* fix run_mode
* update
* fix
* Revert "fix". This reverts commit e526c86.
* Revert "update". This reverts commit 9cbd773.
* update prepare
* Revert "Revert "update"". This reverts commit 811b6a4.
* Revert "Revert "fix"". This reverts commit 32cc005.
* update finetune prepare
* update
* add
* update sft/lora steps
* update json
* update
* add benchmark
* update years
* update a100
1 parent 95c0dd4 commit 879a853

14 files changed: +799 -0 lines changed
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
#!/usr/bin/env bash

# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

autoconfig_json_file=$(basename "$1") # autoconfig/llama7b_pretrain.json
model_name=${autoconfig_json_file%.*}
auto_log_file=./autoconfig/${model_name}_auto_tuner.log

if [ -f "$auto_log_file" ] && grep -q "Launch best cfg:" "$auto_log_file"; then
    echo "autotuner found the best config"
    if [ -d "./autoconfig/best_cfg" ]; then
        echo "autotuner ran the best config"
        exit 0
    else
        echo "autotuner did not run the best config"
        exit -1
    fi
else
    echo "autotuner run failed; check that the log file exists and contains the expected text!"
    exit -1
fi
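A minimal usage sketch for the check script above (the script file name below is an assumption, not part of this diff): it takes the auto-tuner JSON that drove the run and derives the expected log path from its base name.

# hypothetical invocation; "check_auto_tuner.sh" is an illustrative name for the script above
bash ./autoconfig/check_auto_tuner.sh autoconfig/llama7b_pretrain.json
# with this argument the script looks for ./autoconfig/llama7b_pretrain_auto_tuner.log
# and for the ./autoconfig/best_cfg directory, and exits 0 only if both checks pass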
Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
{
    "dp_degree": "auto",
    "invalid_strategy": [
        "stage3_mp*"
    ],
    "max_search_time": 900,
    "max_time_per_task": 300,
    "metric_cfg": {
        "OptimizationDirection": "Maximize",
        "name": "interval_samples_per_second"
    },
    "micro_batch_size": "auto",
    "mode": "LoRA",
    "model_cfg": {
        "global_batch_size": 8,
        "hidden_size": 4096,
        "num_attention_heads": 32,
        "num_layers": 28,
        "vocab_size": 65024
    },
    "mp_degree": [
        1
    ],
    "need_baseline": true,
    "pp_degree": [
        1
    ],
    "run_cmd": {
        "gradient_accumulation_steps": [
            "./autoconfig/llama7b_lora_params.json",
            "gradient_accumulation_steps"
        ],
        "micro_batch_size": [
            "./autoconfig/llama7b_lora_params.json",
            "per_device_train_batch_size"
        ],
        "mp_degree": [
            "./autoconfig/llama7b_lora_params.json",
            "tensor_parallel_degree"
        ],
        "pp_degree": [
            "./autoconfig/llama7b_lora_params.json",
            "pipeline_parallel_degree"
        ],
        "run_best_stage": {
            "autotuner_benchmark": [
                "./autoconfig/llama7b_lora_params.json",
                "autotuner_benchmark",
                0
            ]
        },
        "search_stage": {
            "autotuner_benchmark": [
                "./autoconfig/llama7b_lora_params.json",
                "autotuner_benchmark",
                1
            ]
        },
        "sharding_degree": [
            "./autoconfig/llama7b_lora_params.json",
            "sharding_parallel_degree"
        ],
        "sharding_stage": [
            "./autoconfig/llama7b_lora_params.json",
            "sharding",
            "stage"
        ],
        "use_recompute": [
            "./autoconfig/llama7b_lora_params.json",
            "recompute"
        ]
    },
    "schedule_prior": [
        "mp4"
    ],
    "sharding_degree": "auto",
    "sharding_stage": "auto",
    "task_limit": 2000,
    "use_recompute": "auto"
}
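Each run_cmd entry above maps an auto-tuner knob to the key it controls in the referenced params file: a candidate mp_degree is written to tensor_parallel_degree in ./autoconfig/llama7b_lora_params.json, micro_batch_size to per_device_train_batch_size, and so on, while run_best_stage and search_stage appear to pin autotuner_benchmark to 0 for the final run and 1 for the search trials. A rough sketch of what one such override amounts to, using jq purely for illustration (jq is not part of this PR; the tuner applies the change itself):

# illustrative only: a trial with mp_degree=2 corresponds to running the LoRA job
# with tensor_parallel_degree set to 2 in the params file
jq '.tensor_parallel_degree = 2' ./autoconfig/llama7b_lora_params.json > trial_params.json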
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
{
    "model_name_or_path": "facebook/llama-7b",
    "dataset_name_or_path": "./data",
    "output_dir": "./checkpoints/llama_lora_ckpts",
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 1,
    "per_device_eval_batch_size": 8,
    "eval_accumulation_steps": 16,
    "num_train_epochs": 1,
    "max_steps": 100,
    "learning_rate": 0.0003,
    "warmup_steps": 30,
    "logging_steps": 1,
    "evaluation_strategy": "no",
    "save_strategy": "steps",
    "src_length": 1024,
    "max_length": 2048,
    "bf16": true,
    "fp16_opt_level": "O2",
    "do_train": true,
    "do_eval": false,
    "disable_tqdm": true,
    "load_best_model_at_end": false,
    "eval_with_do_generation": false,
    "metric_for_best_model": "accuracy",
    "recompute": true,
    "save_total_limit": 1,
    "tensor_parallel_degree": 1,
    "pipeline_parallel_degree": 1,
    "lora": true,
    "zero_padding": false,
    "use_flash_attention": true,
    "sharding_parallel_degree": 8,
    "sharding": "stage3",
    "recompute_granularity": "full_attn",
    "autotuner_benchmark": 1,
    "benchmark": 1
}
Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
{
    "dp_degree": "auto",
    "max_search_time": 900,
    "max_time_per_task": 400,
    "metric_cfg": {
        "OptimizationDirection": "Maximize",
        "name": "interval_samples_per_second"
    },
    "micro_batch_size": "auto",
    "model_cfg": {
        "global_batch_size": 8,
        "hidden_size": 5120,
        "num_attention_heads": 40,
        "num_layers": 40,
        "vocab_size": 32000
    },
    "mp_degree": "auto",
    "pp_degree": "auto",
    "run_cmd": {
        "gradient_accumulation_steps": [
            "./autoconfig/llama7b_pretrain_params.json",
            "gradient_accumulation_steps"
        ],
        "micro_batch_size": [
            "./autoconfig/llama7b_pretrain_params.json",
            "per_device_train_batch_size"
        ],
        "mp_degree": [
            "./autoconfig/llama7b_pretrain_params.json",
            "tensor_parallel_degree"
        ],
        "pp_degree": [
            "./autoconfig/llama7b_pretrain_params.json",
            "pipeline_parallel_degree"
        ],
        "run_best_stage": {
            "continue_training": [
                "./autoconfig/llama7b_pretrain_params.json",
                "continue_training",
                0
            ],
            "autotuner_benchmark": [
                "./autoconfig/llama7b_pretrain_params.json",
                "autotuner_benchmark",
                0
            ]
        },
        "search_stage": {
            "continue_training": [
                "./autoconfig/llama7b_pretrain_params.json",
                "continue_training",
                0
            ],
            "autotuner_benchmark": [
                "./autoconfig/llama7b_pretrain_params.json",
                "autotuner_benchmark",
                1
            ]
        },
        "sharding_degree": [
            "./autoconfig/llama7b_pretrain_params.json",
            "sharding_parallel_degree"
        ],
        "sharding_stage": [
            "./autoconfig/llama7b_pretrain_params.json",
            "sharding",
            "stage"
        ],
        "use_recompute": [
            "./autoconfig/llama7b_pretrain_params.json",
            "recompute"
        ],
        "recompute_granularity": [
            "./autoconfig/llama7b_pretrain_params.json",
            "recompute_granularity"
        ]
    },
    "sharding_degree": "auto",
    "sharding_stage": "auto",
    "task_limit": 2000,
    "use_recompute": "auto",
    "recompute_granularity": "auto",
    "invalid_strategy": ["stage3_mp*"],
    "schedule_prior": ["mp4"],
    "need_baseline": true,
    "mode": "Pretrain"
}
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
{
    "model_name_or_path": "facebook/llama-7b",
    "tokenizer_name_or_path": "facebook/llama-7b",
    "input_dir": "./data",
    "output_dir": "./checkpoints/llama_pretrain_ckpts",
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "per_device_eval_batch_size": 2,
    "tensor_parallel_degree": 8,
    "pipeline_parallel_degree": 1,
    "sharding": "stage3",
    "virtual_pp_degree": 1,
    "sequence_parallel": 0,
    "use_flash_attention": true,
    "use_fused_rms_norm": true,
    "use_fused_rope": true,
    "max_seq_length": 4096,
    "learning_rate": 3e-05,
    "min_learning_rate": 3e-06,
    "warmup_steps": 30,
    "logging_steps": 1,
    "max_steps": 100,
    "save_steps": 5000,
    "eval_steps": 1000,
    "weight_decay": 0.01,
    "bf16": true,
    "fp16_opt_level": "O2",
    "warmup_ratio": 0.01,
    "max_grad_norm": 1.0,
    "dataloader_num_workers": 1,
    "continue_training": 0,
    "do_train": true,
    "do_eval": false,
    "do_predict": false,
    "disable_tqdm": true,
    "recompute": true,
    "distributed_dataloader": 1,
    "recompute_granularity": "full",
    "save_total_limit": 2,
    "sharding_parallel_degree": 1,
    "autotuner_benchmark": 1
}
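For context, the pretrain pair above (the tuner config and the params file it references) would typically be driven through paddle.distributed.launch with the auto tuner enabled. The command below is only a sketch: the --auto_tuner_json flag wiring and the run_pretrain.py entry script are assumptions, not something this diff adds.

# assumed launch command; flag names and the entry script are illustrative
python -m paddle.distributed.launch \
    --auto_tuner_json ./autoconfig/llama7b_pretrain.json \
    run_pretrain.py ./autoconfig/llama7b_pretrain_params.json
# the check script at the top of this diff then expects
# ./autoconfig/llama7b_pretrain_auto_tuner.log to contain "Launch best cfg:"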
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
{
    "dp_degree": "auto",
    "invalid_strategy": [
        "stage3_mp*"
    ],
    "max_search_time": 900,
    "max_time_per_task": 300,
    "metric_cfg": {
        "OptimizationDirection": "Maximize",
        "name": "interval_samples_per_second"
    },
    "micro_batch_size": "auto",
    "mode": "SFT",
    "model_cfg": {
        "global_batch_size": 8,
        "hidden_size": 4096,
        "num_attention_heads": 32,
        "num_layers": 28,
        "vocab_size": 65024
    },
    "mp_degree": "auto",
    "need_baseline": true,
    "pp_degree": [
        1
    ],
    "run_cmd": {
        "gradient_accumulation_steps": [
            "./autoconfig/llama7b_sft_params.json",
            "gradient_accumulation_steps"
        ],
        "micro_batch_size": [
            "./autoconfig/llama7b_sft_params.json",
            "per_device_train_batch_size"
        ],
        "mp_degree": [
            "./autoconfig/llama7b_sft_params.json",
            "tensor_parallel_degree"
        ],
        "pp_degree": [
            "./autoconfig/llama7b_sft_params.json",
            "pipeline_parallel_degree"
        ],
        "run_best_stage": {
            "autotuner_benchmark": [
                "./autoconfig/llama7b_sft_params.json",
                "autotuner_benchmark",
                0
            ]
        },
        "search_stage": {
            "autotuner_benchmark": [
                "./autoconfig/llama7b_sft_params.json",
                "autotuner_benchmark",
                1
            ]
        },
        "sharding_degree": [
            "./autoconfig/llama7b_sft_params.json",
            "sharding_parallel_degree"
        ],
        "sharding_stage": [
            "./autoconfig/llama7b_sft_params.json",
            "sharding",
            "stage"
        ],
        "use_recompute": [
            "./autoconfig/llama7b_sft_params.json",
            "recompute"
        ]
    },
    "schedule_prior": [
        "mp4"
    ],
    "sharding_degree": "auto",
    "sharding_stage": "auto",
    "task_limit": 2000,
    "use_recompute": "auto"
}
