|
| 1 | +# Elastic |
| 2 | + |
| 3 | + |
| 4 | + |
| 5 | +## 安装依赖 |
| 6 | + |
| 7 | +集群部署K8S,并在集群中部署[DLRover](https://github.com/intelligent-machine-learning/dlrover),然后在训练镜像中安装以下依赖: |
| 8 | +`pip install dlrover && pip install tornado && pip install kubernetes && pip install ms-swift` |
| 9 | + |
| 10 | +经过反复测试验证的训练镜像中的其它依赖以及版本: |
| 11 | +- deepspeed 0.16.5(需参考 https://github.com/deepspeedai/DeepSpeed/pull/7585/files 修复 universal checkpoint 相关问题) |
| 12 | +- pytorch 2.6.0 |
| 13 | + |
| 14 | + |
| 15 | +## 如何启动 |
| 16 | + |
| 17 | +通过在`--callbacks`中添加`deepspeed_elastic`(可选`graceful_exit`)启用弹性训练,并配置DeepSpeed弹性参数。 |
| 18 | +命令组成 = dlrover-run + dlrover 命令参数 + swift 启动命令 + swift 参数。dlrover-run 除自定义的参数外,其他参数与 torchrun 一致。 |
| 19 | +dlrover-run 参数如下: |
| 20 | +``` |
| 21 | +usage: dlrover-run [-h] [--nnodes NNODES] [--nproc-per-node NPROC_PER_NODE] |
| 22 | + [--rdzv-backend RDZV_BACKEND] [--rdzv-endpoint RDZV_ENDPOINT] [--rdzv-id RDZV_ID] |
| 23 | + [--rdzv-conf RDZV_CONF] [--standalone] [--max-restarts MAX_RESTARTS] |
| 24 | + [--monitor-interval MONITOR_INTERVAL] [--start-method {spawn,fork,forkserver}] |
| 25 | + [--role ROLE] [-m] [--no-python] [--run-path] [--log-dir LOG_DIR] [-r REDIRECTS] |
| 26 | + [-t TEE] [--local-ranks-filter LOCAL_RANKS_FILTER] [--node-rank NODE_RANK] |
| 27 | + [--master-addr MASTER_ADDR] [--master-port MASTER_PORT] [--local-addr LOCAL_ADDR] |
| 28 | + [--logs-specs LOGS_SPECS] [--precheck {0,1,2}] [--node_unit NODE_UNIT] |
| 29 | + [--auto_config] [--auto_tunning] [--exclude-straggler] [--save_at_breakpoint] |
| 30 | + [--accelerator {nvidia.com/gpu,ascend-npu}] [--training_port TRAINING_PORT] |
| 31 | + [--switchbox-check] [--box-pairs PAIR [PAIR ...]] [--min-bandwidth MIN_BANDWIDTH] |
| 32 | + [--min-channels MIN_CHANNELS] [--numa-affinity] [--network-check] |
| 33 | + [--comm-perf-test] [--ucp_device_type UCP_DEVICE_TYPE] |
| 34 | + training_script |
| 35 | +
|
| 36 | +``` |
| 37 | +在弹性训练中我们需要关注的参数为: |
| 38 | + |
| 39 | +- `--nnodes NNODES`:Number of nodes, or the range of nodes in form |
| 40 | +  `<minimum_nodes>:<maximum_nodes>`. |
| 41 | +- `--nproc-per-node NPROC_PER_NODE`:Number of processes per node. |
| 42 | + |
| 43 | +示例: |
| 44 | + |
| 45 | +```bash |
| 46 | +model="your model path" |
| 47 | +dataset="your dataset" |
| 48 | +output="your output dir" |
| 49 | +export CUDA_VISIBLE_DEVICES=0  # 根据实际使用的GPU情况设置 |
| 50 | +deepspeed_config_or_type=zero1  # deepspeed类型或者配置文件的路径,如 zero1 或者 /xxx/ms-swift/swift/llm/ds_config/zero1.json |
| 51 | + |
| 52 | +dlrover-run --nnodes 1:$NODE_NUM --nproc_per_node=1 \ |
| 53 | +/opt/conda/lib/python3.10/site-packages/swift/cli/sft.py --model $model \ |
| 54 | +--model_type qwen3 \ |
| 55 | +--train_type lora \ |
| 56 | +--torch_dtype bfloat16 \ |
| 57 | +--dataset $dataset \ |
| 58 | +--num_train_epochs 4 \ |
| 59 | +--per_device_train_batch_size 1 \ |
| 60 | +--per_device_eval_batch_size 1 \ |
| 61 | +--learning_rate 5e-7 \ |
| 62 | +--gradient_accumulation_steps 8 \ |
| 63 | +--eval_steps 500 \ |
| 64 | +--save_steps 10 \ |
| 65 | +--save_total_limit 20 \ |
| 66 | +--logging_steps 1 \ |
| 67 | +--output_dir $output \ |
| 68 | +--warmup_ratio 0.01 \ |
| 69 | +--dataloader_num_workers 4 \ |
| 70 | +--temperature 1.0 \ |
| 71 | +--system You\ are\ a\ helpful\ assistant. \ |
| 72 | +--lora_rank 8 \ |
| 73 | +--lora_alpha 32 \ |
| 74 | +--target_modules all-linear \ |
| 75 | +--dataset_num_proc 1 \ |
| 76 | +--use_flash_ckpt true \ |
| 77 | +--callbacks deepspeed_elastic graceful_exit \ |
| 78 | +--deepspeed $deepspeed_config_or_type |
| 79 | +``` |
| 80 | + |
| 81 | +## 配置文件示例 |
| 82 | +默认情况下的zero1为以下示例配置: |
| 83 | + |
| 84 | +```json |
| 85 | +{ |
| 86 | + "fp16": { |
| 87 | + "enabled": "auto", |
| 88 | + "loss_scale": 0, |
| 89 | + "loss_scale_window": 1000, |
| 90 | + "initial_scale_power": 16, |
| 91 | + "hysteresis": 2, |
| 92 | + "min_loss_scale": 1 |
| 93 | + }, |
| 94 | + |
| 95 | + "bf16": { |
| 96 | + "enabled": "auto" |
| 97 | + }, |
| 98 | + |
| 99 | + "zero_optimization": { |
| 100 | + "stage": 1, |
| 101 | + "offload_optimizer": { |
| 102 | + "device": "none", |
| 103 | + "pin_memory": true |
| 104 | + }, |
| 105 | + "allgather_partitions": true, |
| 106 | + "allgather_bucket_size": 2e8, |
| 107 | + "overlap_comm": false, |
| 108 | + "reduce_scatter": true, |
| 109 | + "reduce_bucket_size": 2e8, |
| 110 | + "contiguous_gradients": true |
| 111 | + }, |
| 112 | + |
| 113 | + "gradient_accumulation_steps": "auto", |
| 114 | + "gradient_clipping": "auto", |
| 115 | + "steps_per_print": 2000, |
| 116 | + "train_batch_size": "auto", |
| 117 | + "train_micro_batch_size_per_gpu": "auto", |
| 118 | + "wall_clock_breakdown": false, |
| 119 | + "elasticity": { |
| 120 | + "ignore_non_elastic_batch_info": true, |
| 121 | + "enabled": true, |
| 122 | + "max_train_batch_size": 8, |
| 123 | + "micro_batch_sizes": [ |
| 124 | + 4, |
| 125 | + 2 |
| 126 | + ], |
| 127 | + "min_gpus": 1, |
| 128 | + "max_gpus": 4, |
| 129 | + "min_time": 20, |
| 130 | + "version": 0.1 |
| 131 | + } |
| 132 | +} |
| 133 | +``` |
| 134 | + |
| 135 | +如果用户需要自定义,可以将启动命令中的`deepspeed_config_or_type`(即`--deepspeed`参数的值)指定为自定义配置文件(如zero1.json)的存放路径,其中弹性相关的配置为: |
| 136 | +```json |
| 137 | +... |
| 138 | + |
| 139 | + "elasticity": { |
| 140 | + "ignore_non_elastic_batch_info": true, |
| 141 | + "enabled": true, |
| 142 | + "max_train_batch_size": 8, |
| 143 | + "micro_batch_sizes": [ |
| 144 | + 4, |
| 145 | + 2 |
| 146 | + ], |
| 147 | + "min_gpus": 1, |
| 148 | + "max_gpus": 4, |
| 149 | + "min_time": 20, |
| 150 | + "version": 0.1 |
| 151 | + } |
| 152 | +``` |
| 153 | + |
| 154 | +- ignore_non_elastic_batch_info:为true时,忽略elasticity配置之外的batch_size相关配置,训练过程中会根据实际的训练进程个数实时修改batch_size等相关的参数。 |
| 155 | +  计算原则为: |
| 156 | +  `global-training-batch-size = micro-batch-size * gradient-accumulation-steps * world-size` |
| 157 | +- max_train_batch_size:训练时可接受的最大全局batch size(即global-training-batch-size的上限) |
| 158 | +- micro_batch_sizes:elasticity下允许的每卡micro-batch size列表,相当于train_micro_batch_size_per_gpu的候选值 |
| 159 | +- min_gpus:最小gpu数目 |
| 160 | +- max_gpus:最大gpu数目 |
| 161 | + |
| 162 | +更详细的内容见:[DeepSpeed](https://www.deepspeed.ai/docs/config-json/#elastic-training-config-v01-and-v02) |
| 163 | + |
| 164 | +## 启动训练 |
| 165 | + |
| 166 | +```yaml |
| 167 | +--- |
| 168 | +apiVersion: elastic.iml.github.io/v1alpha1 |
| 169 | +kind: ElasticJob |
| 170 | +metadata: |
| 171 | + name: deepspeed-elastic-swift |
| 172 | + namespace: dlrover |
| 173 | +spec: |
| 174 | + distributionStrategy: AllreduceStrategy |
| 175 | + optimizeMode: single-job |
| 176 | + replicaSpecs: |
| 177 | + worker: |
| 178 | + replicas: 1 #【这里需要与启动命令中的--nnodes NNODES的最大值一致】 |
| 179 | + template: |
| 180 | + spec: |
| 181 | + restartPolicy: Never |
| 182 | + containers: |
| 183 | + - name: main |
| 184 | + image: #【训练镜像,需要安装deepspeed,dlrover 和swift 】 |
| 185 | + imagePullPolicy: IfNotPresent |
| 186 | + command: |
| 187 | + - /bin/bash |
| 188 | + - -c |
| 189 | + - sh start.sh # 启动脚本 |
| 190 | + resources: |
| 191 | + limits: |
| 192 | + cpu: '8' |
| 193 | + memory: 16Gi |
| 194 | + nvidia.com/gpu: '1' |
| 195 | + volumeMounts: |
| 196 | + - mountPath: /model |
| 197 | + name: volume-model |
| 198 | + - mountPath: /dev/shm |
| 199 | + name: volume-shm |
| 200 | + restartPolicy: Never |
| 201 | + volumes: |
| 202 | + - hostPath: |
| 203 | + path: /model |
| 204 | + type: Directory |
| 205 | + name: volume-model |
| 206 | + - emptyDir: |
| 207 | + medium: Memory |
| 208 | + sizeLimit: 200Gi |
| 209 | + name: volume-shm |
| 210 | + |
| 211 | +``` |
0 commit comments