|
80 | 80 | # Common args |
81 | 81 | CKPT_ARGS=( |
82 | 82 | --hf-checkpoint /root/models/${MODEL_NAME} |
83 | | - # vl model has rotary base 5000000 |
84 | | - --rotary-base 5000000 |
85 | 83 | ) |
86 | 84 |
|
87 | 85 | ROLLOUT_ARGS=( |
@@ -154,41 +152,43 @@ MISC_ARGS=( |
154 | 152 | # Backend-specific args |
155 | 153 | if [ "$TRAIN_BACKEND" = "fsdp" ]; then |
156 | 154 | BACKEND_ARGS=( |
157 | | - --train-backend fsdp |
158 | | - --gradient-checkpointing |
159 | | - --sglang-attention-backend fa3 |
160 | | - --attn-implementation flash_attention_3 |
161 | | - --update-weight-buffer-size 536870912 |
| 155 | + --train-backend fsdp |
| 156 | + --gradient-checkpointing |
| 157 | + --sglang-attention-backend fa3 |
| 158 | + --attn-implementation flash_attention_3 |
| 159 | + --update-weight-buffer-size 536870912 |
162 | 160 | ) |
163 | 161 | MODEL_ARGS=() |
164 | 162 | else |
165 | 163 | # megatron backend (default) |
166 | 164 | BACKEND_ARGS=( |
167 | | - --train-backend megatron |
168 | | - --load /root/models/${MODEL_NAME} |
169 | | - --tensor-model-parallel-size 4 |
170 | | - --sequence-parallel |
171 | | - --pipeline-model-parallel-size 1 |
172 | | - --context-parallel-size 1 |
173 | | - --expert-model-parallel-size 1 |
174 | | - --expert-tensor-parallel-size 1 |
175 | | - --recompute-granularity full |
176 | | - --recompute-method uniform |
177 | | - --recompute-num-layers 1 |
178 | | - --use-dynamic-batch-size |
179 | | - --max-tokens-per-gpu 4096 |
180 | | - --attention-dropout 0.0 |
181 | | - --hidden-dropout 0.0 |
182 | | - --accumulate-allreduce-grads-in-fp32 |
183 | | - --attention-softmax-in-fp32 |
184 | | - --attention-backend flash |
185 | | - --megatron-to-hf-mode bridge |
| 165 | + --train-backend megatron |
| 166 | + --load /root/models/${MODEL_NAME} |
| 167 | + --tensor-model-parallel-size 4 |
| 168 | + --sequence-parallel |
| 169 | + --pipeline-model-parallel-size 1 |
| 170 | + --context-parallel-size 1 |
| 171 | + --expert-model-parallel-size 1 |
| 172 | + --expert-tensor-parallel-size 1 |
| 173 | + --recompute-granularity full |
| 174 | + --recompute-method uniform |
| 175 | + --recompute-num-layers 1 |
| 176 | + --use-dynamic-batch-size |
| 177 | + --max-tokens-per-gpu 4096 |
| 178 | + --attention-dropout 0.0 |
| 179 | + --hidden-dropout 0.0 |
| 180 | + --accumulate-allreduce-grads-in-fp32 |
| 181 | + --attention-softmax-in-fp32 |
| 182 | + --attention-backend flash |
| 183 | + --megatron-to-hf-mode bridge |
186 | 184 | ) |
187 | 185 |
|
188 | 186 | # Load MODEL_ARGS for the megatron backend from the matching per-model script under scripts/models |
189 | 187 | SLIME_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." &>/dev/null && pwd)" |
190 | 188 | MODEL_ARGS_FILE=$(echo "$MODEL_NAME" | sed 's/-Instruct//g; s/-Thinking//g; s/Qwen3-VL-/qwen3-/g; s/-2B/-1.7B/g') |
191 | | - source "${SLIME_DIR}/scripts/models/${MODEL_ARGS_FILE}.sh" |
| 189 | + # VL models require rotary-base 5000000; set MODEL_ARGS_ROTARY_BASE for the sourced model-args script below |
| 190 | + MODEL_ARGS_ROTARY_BASE=5000000 source "${SLIME_DIR}/scripts/models/${MODEL_ARGS_FILE}.sh" |
| 191 | + |
192 | 192 | fi |
193 | 193 |
|
194 | 194 | # Start Ray if not using external Ray |
|
0 commit comments