Commit e8ff04d

Support zero3 (#353)
1 parent 12e731a · commit e8ff04d

File tree

8 files changed: +170 -4 lines changed

README.md

Lines changed: 1 addition & 0 deletions

@@ -62,6 +62,7 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用
 
 
 ## 🎉 News
+- 2024.1.30: Support ZeRO-3, just need to specify `--deepspeed_config_path default-zero3`.
 - 2024.1.29: Support internlm2-math series: internlm2-math-7b, internlm2-math-7b-chat, internlm2-math-20b, internlm2-math-20b-chat.
 - 🔥2024.1.26: Support [yi-vl-6b-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/yi_vl_6b_chat), yi-vl-34b-chat.
 - 2024.1.24: Support codefuse-codegeex2-6b-chat, codefuse-qwen-14b-chat.

README_CN.md

Lines changed: 1 addition & 0 deletions

@@ -60,6 +60,7 @@ SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning) is a scalable
 Users can check the [official SWIFT documentation](docs/source/GetStarted/快速使用.md) for details.
 
 ## 🎉 News
+- 2024.1.30: Support ZeRO-3; just specify `--deepspeed_config_path default-zero3`.
 - 2024.1.29: Support the internlm2-math series: internlm2-math-7b, internlm2-math-7b-chat, internlm2-math-20b, internlm2-math-20b-chat.
 - 2024.1.26: Support [yi-vl-6b-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/yi_vl_6b_chat), yi-vl-34b-chat.
 - 2024.1.24: Support codefuse-codegeex2-6b-chat, codefuse-qwen-14b-chat.

docs/source/LLM/命令行参数.md

Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@
 - `--lora_dtype`: Default is `'fp32'`; specifies the dtype of the LoRA modules. If set to `AUTO`, the dtype of the original module is used. Available values: 'fp16', 'bf16', 'fp32', 'AUTO'.
 - `--neftune_noise_alpha`: Noise coefficient added by `NEFTune`, which can improve performance in instruction fine-tuning. Default is `None`. Typical values are 5, 10, 15. See the [related paper](https://arxiv.org/abs/2310.05914).
 - `--gradient_checkpointing`: Whether to enable gradient checkpointing. Default is `True`. This saves GPU memory at the cost of slightly slower training, and is most effective when max_length and batch_size are large.
-- `--deepspeed_config_path`: Path to the DeepSpeed configuration file. Default is `None`, i.e. DeepSpeed is not enabled. DeepSpeed can save GPU memory. We provide a default [ZeRO-2 configuration file](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/ds_config/zero2.json); just specify 'default-zero2' to use it.
+- `--deepspeed_config_path`: Path to the DeepSpeed configuration file. Default is `None`, i.e. DeepSpeed is not enabled. DeepSpeed can save GPU memory. We provide default [ZeRO-2](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/ds_config/zero2.json) and [ZeRO-3](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/ds_config/zero3.json) configuration files; specify 'default-zero2' to use the default ZeRO-2 config, or 'default-zero3' to use the default ZeRO-3 config.
 - `--batch_size`: Batch size during training. Default is `1`. Increasing the batch size raises GPU utilization but does not necessarily speed up training, because shorter sentences in a batch are padded to the length of the longest sentence in that batch, introducing wasted computation.
 - `--eval_batch_size`: Batch size during evaluation. Default is `None`: it is set to 1 when `predict_with_generate` is True, and to `batch_size` when it is False.
 - `--num_train_epochs`: Number of training epochs. Default is `1`. If `max_steps >= 0`, it overrides `num_train_epochs`. Usually set to 3 ~ 5.
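
The `--deepspeed_config_path` flag documented above maps onto the `deepspeed_config_path` field of `SftArguments` (see the swift/llm/utils/argument.py change at the end of this commit), so the bundled ZeRO-3 config can also be selected programmatically. A minimal sketch, assuming the `SftArguments`/`sft_main` entry points exported by `swift.llm`; the model, dataset, and output directory are illustrative placeholders, not part of this commit:

```python
# Hypothetical usage sketch: fine-tune with the bundled ZeRO-3 DeepSpeed config.
# Only deepspeed_config_path='default-zero3' reflects the behavior added here;
# model_type/dataset/output_dir are placeholders.
from swift.llm import SftArguments, sft_main

sft_args = SftArguments(
    model_type='qwen-7b-chat',              # placeholder model
    dataset='blossom-math-zh',              # placeholder dataset
    output_dir='output',
    deepspeed_config_path='default-zero3',  # resolved to ds_config/zero3.json
)
result = sft_main(sft_args)
```

Launched with multiple processes (e.g. via torchrun), this lets DeepSpeed ZeRO-3 partition optimizer states, gradients, and parameters across the GPUs.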

examples/pytorch/llm/ds_config/zero2.json

Lines changed: 20 additions & 1 deletion

@@ -12,10 +12,29 @@
         "enabled": "auto"
     },
 
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+
     "zero_optimization": {
         "stage": 2,
         "offload_optimizer": {
-            "device": "cpu",
+            "device": "none",
             "pin_memory": true
         },
         "allgather_partitions": true,
examples/pytorch/llm/ds_config/zero3.json

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "bf16": {
        "enabled": "auto"
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "offload_param": {
            "device": "none",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
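
The many "auto" placeholders in these config files are intentional: when the config is handed to the HF Trainer's DeepSpeed integration, they are filled in from the training arguments (learning rate, scheduler warmup, batch sizes, gradient accumulation), so the JSON does not have to be kept in sync with the CLI flags. A small sketch of how one might derive a customized copy, for example re-enabling CPU offload on memory-constrained GPUs; the output filename is arbitrary and the relative input path assumes the script runs from the repository root:

```python
# Sketch: build a custom ZeRO-3 config from the bundled one, re-enabling CPU
# offload of optimizer state and parameters. "auto" values are left as-is;
# the HF Trainer's DeepSpeed integration resolves them at runtime.
import json

with open('examples/pytorch/llm/ds_config/zero3.json') as f:
    ds_config = json.load(f)

ds_config['zero_optimization']['offload_optimizer']['device'] = 'cpu'
ds_config['zero_optimization']['offload_param']['device'] = 'cpu'

with open('my_zero3_offload.json', 'w') as f:
    json.dump(ds_config, f, indent=4)

# Then pass the file instead of the 'default-zero3' alias:
#   --deepspeed_config_path my_zero3_offload.json
```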

swift/llm/ds_config/zero2.json

Lines changed: 20 additions & 1 deletion

Same change as in examples/pytorch/llm/ds_config/zero2.json above: the "optimizer" (AdamW) and "scheduler" (WarmupLR) blocks with "auto" parameters are added, and "offload_optimizer.device" is changed from "cpu" to "none".

swift/llm/ds_config/zero3.json

Lines changed: 61 additions & 0 deletions

New file, identical in content to examples/pytorch/llm/ds_config/zero3.json above.

swift/llm/utils/argument.py

Lines changed: 5 additions & 1 deletion

@@ -175,9 +175,13 @@ class SftArguments:
 
     def __post_init__(self) -> None:
         handle_compatibility(self)
+        ds_config_folder = os.path.join(__file__, '..', '..', 'ds_config')
         if self.deepspeed_config_path == 'default-zero2':
             self.deepspeed_config_path = os.path.abspath(
-                os.path.join(__file__, '..', '..', 'ds_config', 'zero2.json'))
+                os.path.join(ds_config_folder, 'zero2.json'))
+        elif self.deepspeed_config_path == 'default-zero3':
+            self.deepspeed_config_path = os.path.abspath(
+                os.path.join(ds_config_folder, 'zero3.json'))
         handle_path(self)
         set_model_type(self)
         if isinstance(self.dataset, str):
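
For clarity, here is the alias-resolution logic from the diff above pulled out as a standalone function. This is an editorial sketch rather than code from the commit, and it assumes the module sits two directories above the bundled `ds_config` folder, as swift/llm/utils/argument.py does:

```python
import os

# The only aliases introduced by this commit.
_DS_ALIASES = {'default-zero2': 'zero2.json', 'default-zero3': 'zero3.json'}


def resolve_deepspeed_config(deepspeed_config_path: str) -> str:
    """Map a 'default-*' alias to a bundled ds_config file; return other values unchanged."""
    if deepspeed_config_path in _DS_ALIASES:
        # Same construction as __post_init__: start from this module's path,
        # go up two levels, then into ds_config; abspath normalizes the '..'s.
        ds_config_folder = os.path.join(__file__, '..', '..', 'ds_config')
        return os.path.abspath(
            os.path.join(ds_config_folder, _DS_ALIASES[deepspeed_config_path]))
    return deepspeed_config_path


print(resolve_deepspeed_config('default-zero3'))          # .../ds_config/zero3.json
print(resolve_deepspeed_config('my_zero3_offload.json'))  # returned unchanged
```

In the sketch, any value other than the two aliases is returned unchanged, mirroring how the real `__post_init__` only rewrites 'default-zero2' and 'default-zero3' and leaves user-supplied config paths to the subsequent `handle_path(self)` call.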
