|
| 1 | +import os |
| 2 | + |
| 3 | +import slime.utils.misc as U |
| 4 | +from slime.utils.external_utils.command_utils import execute_train, get_default_wandb_args |
| 5 | + |
# Hugging Face model names (under the "Qwen/" org) this script accepts.
_SUPPORTED_MODELS = (
    "Qwen2.5-VL-3B-Instruct",
    "Qwen3-VL-2B-Instruct",
    "Qwen3-VL-4B-Instruct",
    "Qwen3-VL-8B-Instruct",
)

# Model to download and train; override with SLIME_SCRIPT_MODEL_NAME.
MODEL_NAME = os.environ.get("SLIME_SCRIPT_MODEL_NAME", "Qwen3-VL-2B-Instruct")
# Validate with an explicit raise: a bare `assert` is removed under `python -O`,
# which would let an unsupported name through to the download step.
if MODEL_NAME not in _SUPPORTED_MODELS:
    raise ValueError(
        f"Unsupported SLIME_SCRIPT_MODEL_NAME={MODEL_NAME!r}; " f"expected one of: {', '.join(_SUPPORTED_MODELS)}"
    )

# GPU count passed to `ray start --num-gpus` and `--actor-num-gpus-per-node`.
NUM_GPUS = int(os.environ.get("SLIME_SCRIPT_NUM_GPUS", "1"))
# Non-zero: Ray is managed outside this script, so it is neither stopped nor started here.
EXTERNAL_RAY = int(os.environ.get("SLIME_SCRIPT_EXTERNAL_RAY", "0"))
# Address passed to `ray start --node-ip-address`.
MASTER_ADDR = os.environ.get("MASTER_ADDR", "127.0.0.1")
| 12 | + |
| 13 | + |
def prepare():
    """Create the local model/dataset directories and download the assets.

    Downloads ``Qwen/{MODEL_NAME}`` into ``/root/models/{MODEL_NAME}`` and the
    ``chenhegu/geo3k_imgurl`` dataset into ``/root/datasets/geo3k_imgurl`` by
    shelling out to the ``hf`` CLI through ``U.exec_command``.
    """
    dataset_repo = "chenhegu/geo3k_imgurl"
    # Local directory is named after the repo part following the owner prefix.
    dataset_dir_name = dataset_repo.rpartition("/")[2]

    model_dir = f"/root/models/{MODEL_NAME}"
    dataset_dir = f"/root/datasets/{dataset_dir_name}"

    U.exec_command("mkdir -p /root/models /root/datasets")
    U.exec_command(f"hf download Qwen/{MODEL_NAME} --local-dir {model_dir}")
    U.exec_command(f"hf download --repo-type dataset {dataset_repo} --local-dir {dataset_dir}")
| 20 | + |
| 21 | + |
def execute():
    """Build the training command line, reset local processes, and launch training.

    Steps, in order:
      1. Assemble the argument groups (checkpoint, rollout, eval, GRPO,
         optimizer, SGLang, FSDP, CI, misc, wandb, true-on-policy) into one
         ``train_args`` string.
      2. Kill leftover sglang / slime / redis processes; Ray is only stopped
         and killed when ``EXTERNAL_RAY`` is zero.
      3. Start a local Ray head node unless ``EXTERNAL_RAY`` is set.
      4. Call ``execute_train`` with the arguments and the determinism-related
         environment variables.
    """
    ckpt_args = f"--hf-checkpoint /root/models/{MODEL_NAME} "

    rollout_args = (
        "--prompt-data /root/datasets/geo3k_imgurl/train.parquet "
        "--input-key problem "
        "--label-key answer "
        '--multimodal-keys \'{"image": "images"}\' '
        "--apply-chat-template "
        "--rollout-shuffle "
        "--rm-type math "
        "--num-rollout 3000 "
        "--rollout-batch-size 64 "
        "--n-samples-per-prompt 8 "
        "--rollout-max-response-len 4096 "
        "--rollout-temperature 0.8 "
        "--global-batch-size 512 "
    )

    eval_args = (
        # "--eval-interval 20 "
        "--eval-prompt-data geo3k /root/datasets/geo3k_imgurl/test.parquet "
        "--n-samples-per-eval-prompt 1 "
        "--eval-max-response-len 4096 "
        "--eval-top-k 1 "
    )

    grpo_args = (
        "--advantage-estimator grpo "
        # "--use-kl-loss "
        "--kl-loss-coef 0.00 "
        "--kl-loss-type low_var_kl "
        "--kl-coef 0.00 "
        "--entropy-coef 0.00 "
        "--eps-clip 0.2 "
        "--eps-clip-high 0.28 "
    )

    optimizer_args = (
        "--optimizer adam "
        "--lr 1e-6 "
        "--lr-decay-style constant "
        "--weight-decay 0.1 "
        "--adam-beta1 0.9 "
        "--adam-beta2 0.98 "
    )

    # CUDA-graph batch sizes: 1, 2, 4, 8, then every multiple of 8 from 16 to 256.
    cuda_graph_batch_sizes = [1, 2, 4, 8, *range(16, 257, 8)]
    sglang_args = (
        "--rollout-num-gpus-per-engine 1 "
        "--sglang-mem-fraction-static 0.6 "
        f"--sglang-cuda-graph-bs {' '.join(str(bs) for bs in cuda_graph_batch_sizes)} "
    )

    fsdp_args = (
        # Set to true for FULL_STATE_DICT mode, false for SHARDED_STATE_DICT mode (default)
        # "--fsdp-full-params "  # Uncomment this line to enable full params mode
        # Set the bucket size for weight update
        "--update-weight-buffer-size 536870912 "  # 512MB
        "--train-backend fsdp "
        "--gradient-checkpointing "
        "--sglang-attention-backend fa3 "
        "--attn-implementation flash_attention_3 "
    )

    ci_args = (
        "--ci-test "
        "--ci-disable-kl-checker "
        "--ci-metric-checker-key eval/geo3k "
        "--ci-metric-checker-threshold 0.5 "  # loose threshold at 60 step
    )

    misc_args = (
        "--actor-num-nodes 1 "
        f"--actor-num-gpus-per-node {NUM_GPUS} "
        "--colocate "
    )

    # misc_args += (
    #     "--use-dynamic-batch-size "
    #     # TODO pick a good value
    #     "--max-tokens-per-gpu 2048 "
    # )

    true_on_policy_args = (
        "--sglang-enable-deterministic-inference "
        "--sglang-rl-on-policy-target fsdp "
        "--deterministic-mode "
        "--true-on-policy-mode "
    )
    true_on_policy_envs = {
        # TODO note: "Ring" in original RL PR, "allreduce:tree" in SGLang
        # "NCCL_ALGO": "Ring",
        "NCCL_ALGO": "allreduce:tree",
        "NVTE_ALLOW_NONDETERMINISTIC_ALGO": "0",
        "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
        "SGLANG_VLM_CACHE_SIZE_MB": "0",
    }

    train_args = (
        f"{ckpt_args} "
        f"{rollout_args} "
        f"{optimizer_args} "
        f"{grpo_args} "
        f"{sglang_args} "
        f"{fsdp_args} "
        f"{ci_args} "
        f"{eval_args} "
        f"{misc_args} "
        f"{get_default_wandb_args(__file__)} "
        f"{true_on_policy_args} "
    )

    # Kill existing processes. Ray is left alone when it is managed externally.
    ray_stop = "" if EXTERNAL_RAY else "ray stop --force; "
    ray_kill = "" if EXTERNAL_RAY else "pkill -9 ray; "
    U.exec_command(
        "pkill -9 sglang; "
        "sleep 3; "
        f"{ray_stop}"
        f"{ray_kill}"
        "pkill -9 slime; "
        "sleep 3; "
        f"{ray_kill}"
        "pkill -9 slime; "
        "pkill -9 redis; "
        # Trailing `true` keeps the compound command's exit status at 0 even
        # when a pkill finds nothing to kill.
        "true; "
    )

    if not EXTERNAL_RAY:
        # Start Ray.
        # The interpreter reads PYTHONUNBUFFERED (any non-empty value); the
        # previous spelling PYTHONBUFFERED is not a recognized variable, so
        # the export was a no-op.
        U.exec_command(
            "export PYTHONUNBUFFERED=1 && "
            f"ray start --head --node-ip-address {MASTER_ADDR} --num-gpus {NUM_GPUS} "
            "--disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265"
        )

    # Submit Ray job
    execute_train(
        train_args=train_args,
        num_gpus_per_node=NUM_GPUS,
        megatron_model_type=None,
        extra_env_vars={
            **true_on_policy_envs,
        },
    )
| 161 | + |
| 162 | + |
| 163 | +if __name__ == "__main__": |
| 164 | + prepare() |
| 165 | + execute() |
0 commit comments