
Commit b23fcd1

[WIP][FSDP] Support FSDP for Qwen3Next (#1116)
1 parent bc0e70a commit b23fcd1

3 files changed: 481 additions, 0 deletions

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
# Training Qwen3-Next-80B-A3B on 8xH100

## Environment Setup

Environment setup, model download, data preparation, and checkpoint conversion are the same as for the Qwen3-4B model. See [Example: Qwen3-4B](./qwen3-4B.md) and replace the Qwen3-4B parts with Qwen3-Next-80B-A3B-Instruct.

The complete procedure for converting the Hugging Face checkpoint to torch_dist format starts by downloading the weights (the conversion command itself appears in the Megatron section below):

```bash
export BASE_FOLDER=./models/
# Download the model weights (Qwen3-Next-80B-A3B-Thinking)
hf download Qwen/Qwen3-Next-80B-A3B-Thinking --local-dir ${BASE_FOLDER}/Qwen3-Next-80B-A3B-Thinking
```

```shell
cd slime/
pip install -e .

# (for acceleration) build flash-linear-attention from source
cd ..   # and find a proper folder
git clone https://github.com/fla-org/flash-linear-attention
cd flash-linear-attention
git checkout 9714c595
pip install -e .

# prebuilt causal-conv1d wheel (CUDA 12 / torch 2.8 / cp312)
wget https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.4/causal_conv1d-1.5.4+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
pip install ./causal_conv1d-1.5.4+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
```
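
Optionally, verify that both acceleration packages import correctly. This is a minimal sanity check; the import names `fla` and `causal_conv1d` are inferred from the repositories above:

```bash
python -c "import fla, causal_conv1d; print('flash-linear-attention and causal-conv1d OK')"
```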

## [Optional] Fix a bug in triton compilation on Blackwell (sm100)

See the discussion in https://github.com/triton-lang/triton/issues/8695
and https://github.com/fla-org/flash-linear-attention/issues/638.

We need to apply a patch to work around the bug.
Go to the flash-linear-attention folder you just installed and apply the following patch:

```diff
diff --git a/fla/ops/gated_delta_rule/wy_fast.py b/fla/ops/gated_delta_rule/wy_fast.py
index c5119dcf..838f5e4e 100644
--- a/fla/ops/gated_delta_rule/wy_fast.py
+++ b/fla/ops/gated_delta_rule/wy_fast.py
@@ -198,7 +198,14 @@ def prepare_wy_repr_bwd_kernel(
         b_A += tl.dot(b_kb, tl.trans(b_k))
         b_dkb = tl.dot(b_dA, b_k)
         b_db += tl.sum(b_dkb * b_k, 1)
-        b_dk += tl.dot(tl.trans(b_dA), b_kb)
+        b_dk += tl.inline_asm_elementwise(
+            asm="mov.f32 $0, $1;",
+            constraints="=r,r",
+            args=[tl.dot(tl.trans(b_dA), b_kb)],
+            dtype=tl.float32,
+            is_pure=True,
+            pack=1,
+        )
         b_dk += b_dkb * b_b[:, None]
         tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1))
         tl.store(p_db, b_db.to(p_db.dtype.element_ty), boundary_check=(0,))

```

Save it as `patch.diff` (remember to keep the trailing empty line in the file!) and run `git apply patch.diff`.
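
For example, assuming `patch.diff` was saved inside the flash-linear-attention checkout, `git apply --check` does a dry run that fails without modifying any files if the patch does not apply cleanly (e.g. if whitespace was lost while copying):

```bash
cd flash-linear-attention            # the checkout from the installation step above
git apply --check patch.diff && git apply patch.diff
```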

## Run Training (Megatron)

**Blackwell is not supported yet.**

Convert the model weights:

```bash
source scripts/models/qwen3-next-80B-A3B.sh
PYTHONPATH=/root/Megatron-LM/ torchrun --nproc-per-node 8 \
    tools/convert_hf_to_torch_dist.py \
    ${MODEL_ARGS[@]} \
    --hf-checkpoint /root/Qwen3-Next-80B-A3B-Thinking/ \
    --save /root/Qwen3-Next-80B-A3B-Thinking_torch_dist/
```

Single node, 8 GPUs:

```bash
cd /root/slime
export BASE_FOLDER=/root
export MASTER_ADDR=127.0.0.1
bash scripts/run-qwen3-next-80B-A3B-8gpus.sh
```

If GPU memory runs out, consider disabling `--accumulate-allreduce-grads-in-fp32` and enabling `--grad-reduce-in-bf16`.
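
A sketch of the corresponding change in `MISC_ARGS` of the launch script included in this commit (only the two gradient-reduction flags differ from the committed version):

```bash
MISC_ARGS=(
    --attention-dropout 0.0
    --hidden-dropout 0.0
    # --accumulate-allreduce-grads-in-fp32   # disabled here to save GPU memory
    --grad-reduce-in-bf16                    # reduce gradients in bf16 instead of fp32
    --attention-softmax-in-fp32
    --attention-backend flash
    --moe-token-dispatcher-type alltoall
)
```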

Multi-node (4x8):

```bash
cd /root/slime
export BASE_FOLDER=/root
export MASTER_ADDR=your_master_addr
bash scripts/run-qwen3-next-80B-A3B.sh
```
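
The launch script starts Ray workers over SSH by reading node IPs from `/root/mpi_rack_hostfile`, using only the first whitespace-separated field of each line and skipping the entry that matches `$MLP_WORKER_0_HOST`. A minimal sketch of such a hostfile, with placeholder IPs:

```bash
cat > /root/mpi_rack_hostfile <<'EOF'
10.0.0.1
10.0.0.2
10.0.0.3
10.0.0.4
EOF
```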

## Run Training (FSDP)

```bash
export BASE_FOLDER=./models/
export MASTER_ADDR=127.0.0.1

bash scripts/run-qwen3-next-80B-A3B-fsdp.sh
```
Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
#!/bin/bash

# for rerunning the task: kill any processes left over from a previous run
pkill -9 sglang
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
pkill -9 ray
pkill -9 python

set -ex

# raise an error if BASE_FOLDER is not set
if [ -z "${BASE_FOLDER}" ]; then
    echo "BASE_FOLDER is not set. Please set it to the base directory of your checkpoints."
    exit 1
fi

if [ -z "${MASTER_ADDR}" ]; then
    echo "MASTER_ADDR is not set. Please set it to the master node address."
    exit 1
fi

# prevent python/ray from buffering stdout/stderr
export PYTHONUNBUFFERED=1

# detect NVLink; the result feeds NCCL_NVLS_ENABLE in the Ray runtime env below
NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
if [ "$NVLINK_COUNT" -gt 0 ]; then
    HAS_NVLINK=1
else
    HAS_NVLINK=0
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
source "${SCRIPT_DIR}/models/qwen3-next-80B-A3B.sh"

CKPT_ARGS=(
    --hf-checkpoint ${BASE_FOLDER}/Qwen3-Next-80B-A3B-Thinking
    --ref-load ${BASE_FOLDER}/Qwen3-Next-80B-A3B-Thinking_torch_dist
    --load ${BASE_FOLDER}/Qwen3-Next-80B-A3B-Thinking_slime/
    --save ${BASE_FOLDER}/Qwen3-Next-80B-A3B-Thinking_slime/
    --save-interval 20
)

ROLLOUT_ARGS=(
    --prompt-data ${BASE_FOLDER}/dapo-math-17k/dapo-math-17k.jsonl
    --input-key prompt
    --label-key label
    --apply-chat-template
    --rollout-shuffle
    --rm-type deepscaler
    --num-rollout 300
    --rollout-batch-size 16
    --n-samples-per-prompt 4
    --rollout-max-response-len 8192
    --rollout-temperature 0.8

    --global-batch-size 64
    --balance-data
)

EVAL_ARGS=(
    --eval-interval 20
    --eval-prompt-data aime ${BASE_FOLDER}/aime-2024/aime-2024.jsonl
    --n-samples-per-eval-prompt 2
    --eval-max-response-len 16384
    --eval-top-p 0.7
)

PERF_ARGS=(
    --tensor-model-parallel-size 1
    --sequence-parallel
    --pipeline-model-parallel-size 6
    --context-parallel-size 1
    --expert-model-parallel-size 1
    --expert-tensor-parallel-size 1

    --recompute-granularity full
    --recompute-method uniform
    --recompute-num-layers 1

    # --micro-batch-size 1
    --use-dynamic-batch-size
    --max-tokens-per-gpu 2048
)

GRPO_ARGS=(
    --advantage-estimator gspo
    # --use-kl-loss
    --kl-loss-coef 0.00
    --kl-loss-type low_var_kl
    --kl-coef 0.00
    --entropy-coef 0.00
    --eps-clip 4e-4
)

OPTIMIZER_ARGS=(
    --optimizer adam
    --lr 1e-6
    --lr-decay-style constant
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.98

    --optimizer-cpu-offload
    --overlap-cpu-optimizer-d2h-h2d
    --use-precision-aware-optimizer
)

WANDB_ARGS=(
    # --use-wandb
    # --wandb-project slime-dev
    # --wandb-group qwen3-next-80B-A3B-test
    # --wandb-key ${WANDB_KEY}
)

SGLANG_ARGS=(
    --rollout-num-gpus-per-engine 2
    --rollout-num-gpus 2
    --sglang-mem-fraction-static 0.8
    --sglang-ep-size 1

    --sglang-cuda-graph-bs 1 2 4 8 $(seq 16 8 128)

    # mtp
    # --sglang-speculative-algorithm EAGLE
    # --sglang-speculative-num-steps 2
    # --sglang-speculative-eagle-topk 1
    # --sglang-speculative-num-draft-tokens 3
    # --sglang-enable-draft-weights-cpu-backup
    #
    # --sglang-max-running-requests 512
)

MISC_ARGS=(
    # default dropout in megatron is 0.1
    --attention-dropout 0.0
    --hidden-dropout 0.0
    # should be good for model performance
    --accumulate-allreduce-grads-in-fp32
    # --grad-reduce-in-bf16
    --attention-softmax-in-fp32
    # need to comment this when using model with MLA
    --attention-backend flash

    --moe-token-dispatcher-type alltoall
    # --moe-enable-deepep
    # --debug-rollout-only
)

# launch the master node of ray in container
export no_proxy="127.0.0.1,${MASTER_ADDR}"
ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265

# start Ray workers on the remaining nodes listed in the hostfile (the master node itself is skipped)
for WORKER_IP in $(awk '{print $1}' /root/mpi_rack_hostfile); do
    if [[ "$WORKER_IP" == "$MLP_WORKER_0_HOST" ]]; then
        continue
    fi
    echo "Starting Ray worker on ${WORKER_IP}"
    ssh root@"${WORKER_IP}" \
        "pkill -9 sglang ; ray stop --force ; pkill -9 python ; ray start --address=${MASTER_ADDR}:6379 --num-gpus 8 --node-ip-address ${WORKER_IP} --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265" &
done
wait

# Build the runtime environment JSON with proper variable substitution
RUNTIME_ENV_JSON="{
  \"env_vars\": {
    \"PYTHONPATH\": \"/root/Megatron-LM/\",
    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
    \"no_proxy\": \"${no_proxy}\",
    \"MASTER_ADDR\": \"${MASTER_ADDR}\"
  }
}"

ray job submit --address="http://127.0.0.1:8265" \
    --runtime-env-json="${RUNTIME_ENV_JSON}" \
    -- python3 train.py \
    --actor-num-nodes 1 \
    --actor-num-gpus-per-node 6 \
    ${MODEL_ARGS[@]} \
    ${CKPT_ARGS[@]} \
    ${ROLLOUT_ARGS[@]} \
    ${OPTIMIZER_ARGS[@]} \
    ${GRPO_ARGS[@]} \
    ${WANDB_ARGS[@]} \
    ${PERF_ARGS[@]} \
    ${EVAL_ARGS[@]} \
    ${SGLANG_ARGS[@]} \
    ${MISC_ARGS[@]}
