Commit e8ff04d

Support zero3 (#353)
1 parent 12e731a · commit e8ff04d

File tree

8 files changed: +170 -4 lines changed

README.md

Lines changed: 1 addition & 0 deletions

@@ -62,6 +62,7 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用
 
 
 ## 🎉 News
+- 2024.1.30: Support ZeRO-3, just need to specify `--deepspeed_config_path default-zero3`.
 - 2024.1.29: Support internlm2-math series: internlm2-math-7b, internlm2-math-7b-chat, internlm2-math-20b, internlm2-math-20b-chat.
 - 🔥2024.1.26: Support [yi-vl-6b-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/yi_vl_6b_chat), yi-vl-34b-chat.
 - 2024.1.24: Support codefuse-codegeex2-6b-chat, codefuse-qwen-14b-chat.

README_CN.md

Lines changed: 1 addition & 0 deletions

@@ -60,6 +60,7 @@ SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning) is a scalable
 Users can check the [official SWIFT documentation](docs/source/GetStarted/快速使用.md) for details.
 
 ## 🎉 News
+- 2024.1.30: Support ZeRO-3; just specify `--deepspeed_config_path default-zero3`.
 - 2024.1.29: Support the internlm2-math series: internlm2-math-7b, internlm2-math-7b-chat, internlm2-math-20b, internlm2-math-20b-chat.
 - 2024.1.26: Support [yi-vl-6b-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/yi_vl_6b_chat), yi-vl-34b-chat.
 - 2024.1.24: Support codefuse-codegeex2-6b-chat, codefuse-qwen-14b-chat.

docs/source/LLM/命令行参数.md

Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@
 - `--lora_dtype`: Default is `'fp32'`; specifies the dtype of the LoRA modules. If set to `AUTO`, the dtype of the original module is used. Available values: 'fp16', 'bf16', 'fp32', 'AUTO'.
 - `--neftune_noise_alpha`: Noise coefficient added by `NEFTune`, which can improve performance in instruction fine-tuning. Default is `None`. Typical values are 5, 10, 15. See the [related paper](https://arxiv.org/abs/2310.05914).
 - `--gradient_checkpointing`: Whether to enable gradient checkpointing. Default is `True`. This saves GPU memory at the cost of slightly slower training, and is most effective when max_length and batch_size are large.
-- `--deepspeed_config_path`: Path to the DeepSpeed configuration file. Default is `None`, i.e. DeepSpeed is not enabled. DeepSpeed can save GPU memory. We provide a default [ZeRO-2 configuration file](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/ds_config/zero2.json); just specify 'default-zero2' to use it.
+- `--deepspeed_config_path`: Path to the DeepSpeed configuration file. Default is `None`, i.e. DeepSpeed is not enabled. DeepSpeed can save GPU memory. We provide default [ZeRO-2](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/ds_config/zero2.json) and [ZeRO-3](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/ds_config/zero3.json) configuration files; specify 'default-zero2' to use the default ZeRO-2 config, or 'default-zero3' to use the default ZeRO-3 config.
 - `--batch_size`: Batch size during training. Default is `1`. Increasing the batch size raises GPU utilization but does not necessarily speed up training, because shorter sentences in a batch are padded to the length of the longest sentence in that batch, introducing wasted computation.
 - `--eval_batch_size`: Batch size during evaluation. Default is `None`: it is set to 1 when `predict_with_generate` is True, and to `batch_size` when it is False.
 - `--num_train_epochs`: Number of training epochs. Default is `1`. If `max_steps >= 0`, it overrides `num_train_epochs`. Usually set to 3 ~ 5.
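
The `--deepspeed_config_path` flag documented above maps onto the `deepspeed_config_path` field of `SftArguments` (see the swift/llm/utils/argument.py change at the end of this commit), so the bundled ZeRO-3 config can also be selected programmatically. A minimal sketch, assuming the `SftArguments`/`sft_main` entry points exported by `swift.llm`; the model, dataset, and output directory are illustrative placeholders, not part of this commit:

```python
# Hypothetical usage sketch: fine-tune with the bundled ZeRO-3 DeepSpeed config.
# Only deepspeed_config_path='default-zero3' reflects the behavior added here;
# model_type/dataset/output_dir are placeholders.
from swift.llm import SftArguments, sft_main

sft_args = SftArguments(
    model_type='qwen-7b-chat',              # placeholder model
    dataset='blossom-math-zh',              # placeholder dataset
    output_dir='output',
    deepspeed_config_path='default-zero3',  # resolved to ds_config/zero3.json
)
result = sft_main(sft_args)
```

Launched with multiple processes (e.g. via torchrun), this lets DeepSpeed ZeRO-3 partition optimizer states, gradients, and parameters across the GPUs.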

examples/pytorch/llm/ds_config/zero2.json

Lines changed: 20 additions & 1 deletion

@@ -12,10 +12,29 @@
         "enabled": "auto"
     },
 
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+
     "zero_optimization": {
         "stage": 2,
         "offload_optimizer": {
-            "device": "cpu",
+            "device": "none",
             "pin_memory": true
         },
         "allgather_partitions": true,
examples/pytorch/llm/ds_config/zero3.json

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "bf16": {
        "enabled": "auto"
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "offload_param": {
            "device": "none",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
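
The many "auto" placeholders in these config files are intentional: when the config is handed to the HF Trainer's DeepSpeed integration, they are filled in from the training arguments (learning rate, scheduler warmup, batch sizes, gradient accumulation), so the JSON does not have to be kept in sync with the CLI flags. A small sketch of how one might derive a customized copy, for example re-enabling CPU offload on memory-constrained GPUs; the output filename is arbitrary and the relative input path assumes the script runs from the repository root:

```python
# Sketch: build a custom ZeRO-3 config from the bundled one, re-enabling CPU
# offload of optimizer state and parameters. "auto" values are left as-is;
# the HF Trainer's DeepSpeed integration resolves them at runtime.
import json

with open('examples/pytorch/llm/ds_config/zero3.json') as f:
    ds_config = json.load(f)

ds_config['zero_optimization']['offload_optimizer']['device'] = 'cpu'
ds_config['zero_optimization']['offload_param']['device'] = 'cpu'

with open('my_zero3_offload.json', 'w') as f:
    json.dump(ds_config, f, indent=4)

# Then pass the file instead of the 'default-zero3' alias:
#   --deepspeed_config_path my_zero3_offload.json
```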

swift/llm/ds_config/zero2.json

Lines changed: 20 additions & 1 deletion

Same change as in examples/pytorch/llm/ds_config/zero2.json above: the "optimizer" (AdamW) and "scheduler" (WarmupLR) blocks with "auto" parameters are added, and "offload_optimizer.device" is changed from "cpu" to "none".

swift/llm/ds_config/zero3.json

Lines changed: 61 additions & 0 deletions

New file, identical in content to examples/pytorch/llm/ds_config/zero3.json above.

swift/llm/utils/argument.py

Lines changed: 5 additions & 1 deletion

@@ -175,9 +175,13 @@ class SftArguments:
 
     def __post_init__(self) -> None:
         handle_compatibility(self)
+        ds_config_folder = os.path.join(__file__, '..', '..', 'ds_config')
         if self.deepspeed_config_path == 'default-zero2':
             self.deepspeed_config_path = os.path.abspath(
-                os.path.join(__file__, '..', '..', 'ds_config', 'zero2.json'))
+                os.path.join(ds_config_folder, 'zero2.json'))
+        elif self.deepspeed_config_path == 'default-zero3':
+            self.deepspeed_config_path = os.path.abspath(
+                os.path.join(ds_config_folder, 'zero3.json'))
         handle_path(self)
         set_model_type(self)
         if isinstance(self.dataset, str):
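
For clarity, here is the alias-resolution logic from the diff above pulled out as a standalone function. This is an editorial sketch rather than code from the commit, and it assumes the module sits two directories above the bundled `ds_config` folder, as swift/llm/utils/argument.py does:

```python
import os

# The only aliases introduced by this commit.
_DS_ALIASES = {'default-zero2': 'zero2.json', 'default-zero3': 'zero3.json'}


def resolve_deepspeed_config(deepspeed_config_path: str) -> str:
    """Map a 'default-*' alias to a bundled ds_config file; return other values unchanged."""
    if deepspeed_config_path in _DS_ALIASES:
        # Same construction as __post_init__: start from this module's path,
        # go up two levels, then into ds_config; abspath normalizes the '..'s.
        ds_config_folder = os.path.join(__file__, '..', '..', 'ds_config')
        return os.path.abspath(
            os.path.join(ds_config_folder, _DS_ALIASES[deepspeed_config_path]))
    return deepspeed_config_path


print(resolve_deepspeed_config('default-zero3'))          # .../ds_config/zero3.json
print(resolve_deepspeed_config('my_zero3_offload.json'))  # returned unchanged
```

In the sketch, any value other than the two aliases is returned unchanged, mirroring how the real `__post_init__` only rewrites 'default-zero2' and 'default-zero3' and leaves user-supplied config paths to the subsequent `handle_path(self)` call.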
