[FSDP, VLM] feat: true on policy for VLM (#1056)

nanjiangwill · web-flow · commit a9cfd75728e3 · 2025-12-10T12:45:54.000+08:00
diff --git a/docker/patch/latest/sglang.patch b/docker/patch/latest/sglang.patch
diff --git a/examples/geo3k_vlm/run_geo3k_vlm.py b/examples/geo3k_vlm/run_geo3k_vlm.py
@@ -1,8 +1,7 @@
-import json
 import os
-import subprocess
 
 import slime.utils.misc as U
+from slime.utils.external_utils.command_utils import execute_train, get_default_wandb_args
 
 MODEL_NAME = os.environ.get("SLIME_SCRIPT_MODEL_NAME", "Qwen3-VL-2B-Instruct")
 assert MODEL_NAME in {"Qwen2.5-VL-3B-Instruct", "Qwen3-VL-2B-Instruct", "Qwen3-VL-4B-Instruct", "Qwen3-VL-8B-Instruct"}
@@ -12,19 +11,6 @@
 MASTER_ADDR = os.environ.get("MASTER_ADDR", "127.0.0.1")
 
 
-def detect_nvlink():
-    """Detect if NVLink is available on the system."""
-    try:
-        result = subprocess.run(["nvidia-smi"], capture_output=True, text=True, check=True)
-        nvlink_count = result.stdout.count("NVLink")
-        has_nvlink = 1 if nvlink_count > 0 else 0
-        print(f"HAS_NVLINK: {has_nvlink} (detected {nvlink_count} NVLink references)")
-        return has_nvlink
-    except Exception as e:
-        print(f"Failed to detect NVLink: {e}")
-        return 0
-
-
 def prepare():
     U.exec_command("mkdir -p /root/models /root/datasets")
     U.exec_command(f"hf download Qwen/{MODEL_NAME} --local-dir /root/models/{MODEL_NAME}")
@@ -34,8 +20,6 @@ def prepare():
 
 
 def execute():
-    # Detect NVLink for optimized NCCL settings
-    has_nvlink = detect_nvlink()
 
     ckpt_args = f"--hf-checkpoint /root/models/{MODEL_NAME} "
 
@@ -57,7 +41,7 @@ def execute():
 
     eval_args = (
         # "--eval-interval 20 "
-        "--eval-prompt-data geo3k-test /root/datasets/geo3k_imgurl/test.parquet "
+        "--eval-prompt-data geo3k /root/datasets/geo3k_imgurl/test.parquet "
         "--n-samples-per-eval-prompt 1 "
         "--eval-max-response-len 4096 "
         "--eval-top-k 1 "
@@ -100,14 +84,6 @@ def execute():
         "--attn-implementation flash_attention_3 "
     )
 
-    wandb_args = (
-        "--use-wandb "
-        "--wandb-project geo3k-vlm "
-        "--wandb-group geo3k-vlm "
-        "--wandb-key ${WANDB_API_KEY} "
-        "--disable-wandb-random-suffix "
-    )
-
     misc_args = "--actor-num-nodes 1 " f"--actor-num-gpus-per-node {NUM_GPUS} " "--colocate "
 
     # misc_args += (
@@ -139,7 +115,7 @@ def execute():
         f"{fsdp_args} "
         f"{eval_args} "
         f"{misc_args} "
-        f"{wandb_args} "
+        f"{get_default_wandb_args(__file__)} "
         # f"{true_on_policy_args} "
     )
 
@@ -164,27 +140,12 @@ def execute():
             f"ray start --head --node-ip-address {MASTER_ADDR} --num-gpus {NUM_GPUS} "
             f"--disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265"
         )
-
-    # Prepare runtime environment
-    runtime_env_json = json.dumps(
-        {
-            "env_vars": {
-                "CUDA_DEVICE_MAX_CONNECTIONS": "1",
-                "NCCL_NVLS_ENABLE": str(has_nvlink),
-                # **true_on_policy_envs,
-                # "SGLANG_DUMPER_ENABLE": "0",
-                # "SGLANG_TEMP_UTILS_ENABLE_DEBUG_PRINT": "0",
-            }
-        }
-    )
-
     # Submit Ray job
-    U.exec_command(
-        f"export no_proxy=127.0.0.1 && export PYTHONBUFFERED=16 && "
-        f'ray job submit --address="http://127.0.0.1:8265" '
-        f"--runtime-env-json='{runtime_env_json}' "
-        f"-- python3 /root/slime/train.py "
-        f"{train_args}"
+    execute_train(
+        train_args=train_args,
+        num_gpus_per_node=NUM_GPUS,
+        megatron_model_type=None,
+        extra_env_vars={},
     )
 
 
diff --git a/examples/true_on_policy_vlm/README.md b/examples/true_on_policy_vlm/README.md
@@ -0,0 +1,9 @@
+# True On-Policy between Training and Inference for VLM
+
+This example demonstrates true on-policy training with the Qwen3-VL dense model on FSDP. The core concepts and expected observations are the same as in [true_on_policy](../true_on_policy/README.md).
+
+## Usage
+
+```bash
+python examples/true_on_policy_vlm/run_simple.py
+```
diff --git a/examples/true_on_policy_vlm/run_simple.py b/examples/true_on_policy_vlm/run_simple.py
@@ -0,0 +1,165 @@
+import os
+
+import slime.utils.misc as U
+from slime.utils.external_utils.command_utils import execute_train, get_default_wandb_args
+
+MODEL_NAME = os.environ.get("SLIME_SCRIPT_MODEL_NAME", "Qwen3-VL-2B-Instruct")
+assert MODEL_NAME in {"Qwen2.5-VL-3B-Instruct", "Qwen3-VL-2B-Instruct", "Qwen3-VL-4B-Instruct", "Qwen3-VL-8B-Instruct"}
+
+NUM_GPUS = int(os.environ.get("SLIME_SCRIPT_NUM_GPUS", "1"))
+EXTERNAL_RAY = int(os.environ.get("SLIME_SCRIPT_EXTERNAL_RAY", "0"))
+MASTER_ADDR = os.environ.get("MASTER_ADDR", "127.0.0.1")
+
+
+def prepare():
+    U.exec_command("mkdir -p /root/models /root/datasets")
+    U.exec_command(f"hf download Qwen/{MODEL_NAME} --local-dir /root/models/{MODEL_NAME}")
+    dataset_name = "chenhegu/geo3k_imgurl"
+    _, partial_name = dataset_name.split("/")
+    U.exec_command(f"hf download --repo-type dataset {dataset_name} --local-dir /root/datasets/{partial_name}")
+
+
+def execute():
+    ckpt_args = f"--hf-checkpoint /root/models/{MODEL_NAME} "
+
+    rollout_args = (
+        "--prompt-data /root/datasets/geo3k_imgurl/train.parquet "
+        "--input-key problem "
+        "--label-key answer "
+        '--multimodal-keys \'{"image": "images"}\' '
+        "--apply-chat-template "
+        "--rollout-shuffle "
+        "--rm-type math "
+        "--num-rollout 3000 "
+        "--rollout-batch-size 64 "
+        "--n-samples-per-prompt 8 "
+        "--rollout-max-response-len 4096 "
+        "--rollout-temperature 0.8 "
+        "--global-batch-size 512 "
+    )
+
+    eval_args = (
+        # "--eval-interval 20 "
+        "--eval-prompt-data geo3k /root/datasets/geo3k_imgurl/test.parquet "
+        "--n-samples-per-eval-prompt 1 "
+        "--eval-max-response-len 4096 "
+        "--eval-top-k 1 "
+    )
+
+    grpo_args = (
+        "--advantage-estimator grpo "
+        # "--use-kl-loss "
+        "--kl-loss-coef 0.00 "
+        "--kl-loss-type low_var_kl "
+        "--kl-coef 0.00 "
+        "--entropy-coef 0.00 "
+        "--eps-clip 0.2 "
+        "--eps-clip-high 0.28 "
+    )
+
+    optimizer_args = (
+        "--optimizer adam "
+        "--lr 1e-6 "
+        "--lr-decay-style constant "
+        "--weight-decay 0.1 "
+        "--adam-beta1 0.9 "
+        "--adam-beta2 0.98 "
+    )
+
+    sglang_args = (
+        "--rollout-num-gpus-per-engine 1 "
+        "--sglang-mem-fraction-static 0.6 "
+        f"--sglang-cuda-graph-bs {' '.join(map(str, [1, 2, 4, 8] + list(range(16, 257, 8))))} "
+    )
+
+    fsdp_args = (
+        # Pass the flag below for FULL_STATE_DICT mode; leave it out for SHARDED_STATE_DICT mode (default)
+        # "--fsdp-full-params "  # Uncomment this line to enable full params mode
+        # Set the bucket size for weight update
+        "--update-weight-buffer-size 536870912 "  # 512MB
+        "--train-backend fsdp "
+        "--gradient-checkpointing "
+        "--sglang-attention-backend fa3 "
+        "--attn-implementation flash_attention_3 "
+    )
+
+    ci_args = (
+        "--ci-test "
+        "--ci-disable-kl-checker "
+        "--ci-metric-checker-key eval/geo3k "
+        "--ci-metric-checker-threshold 0.5 "  # loose threshold at 60 step
+    )
+
+    misc_args = "--actor-num-nodes 1 " f"--actor-num-gpus-per-node {NUM_GPUS} " "--colocate "
+
+    # misc_args += (
+    #     "--use-dynamic-batch-size "
+    #     # TODO pick a good value
+    #     "--max-tokens-per-gpu 2048 "
+    # )
+
+    true_on_policy_args = (
+        "--sglang-enable-deterministic-inference "
+        "--sglang-rl-on-policy-target fsdp "
+        "--deterministic-mode "
+        "--true-on-policy-mode "
+    )
+    true_on_policy_envs = {
+        # TODO note: "Ring" in original RL PR, "allreduce:tree" in SGLang
+        # "NCCL_ALGO": "Ring",
+        "NCCL_ALGO": "allreduce:tree",
+        "NVTE_ALLOW_NONDETERMINISTIC_ALGO": "0",
+        "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
+        "SGLANG_VLM_CACHE_SIZE_MB": "0",
+    }
+
+    train_args = (
+        f"{ckpt_args} "
+        f"{rollout_args} "
+        f"{optimizer_args} "
+        f"{grpo_args} "
+        f"{sglang_args} "
+        f"{fsdp_args} "
+        f"{ci_args} "
+        f"{eval_args} "
+        f"{misc_args} "
+        f"{get_default_wandb_args(__file__)} "
+        f"{true_on_policy_args} "
+    )
+
+    # Kill existing processes
+    U.exec_command(
+        "pkill -9 sglang; "
+        "sleep 3; "
+        f"{'' if EXTERNAL_RAY else 'ray stop --force; '}"
+        f"{'' if EXTERNAL_RAY else 'pkill -9 ray; '}"
+        "pkill -9 slime; "
+        "sleep 3; "
+        f"{'' if EXTERNAL_RAY else 'pkill -9 ray; '}"
+        "pkill -9 slime; "
+        "pkill -9 redis; "
+        "true; "
+    )
+
+    if not EXTERNAL_RAY:
+        # Start Ray
+        U.exec_command(
+            f"export PYTHONBUFFERED=16 && "
+            f"ray start --head --node-ip-address {MASTER_ADDR} --num-gpus {NUM_GPUS} "
+            f"--disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265"
+        )
+
+    # Submit Ray job
+    execute_train(
+        train_args=train_args,
+        num_gpus_per_node=NUM_GPUS,
+        megatron_model_type=None,
+        extra_env_vars={
+            **true_on_policy_envs,
+        },
+    )
+
+
+if __name__ == "__main__":
+    prepare()
+    execute()
diff --git a/slime/backends/fsdp_utils/actor.py b/slime/backends/fsdp_utils/actor.py
@@ -147,7 +147,8 @@ def init(self, args: Namespace, role: str, with_ref: bool = False) -> int:  # ty
         return int(getattr(self.args, "start_rollout_id", 0))
 
     def get_model_cls(self):
-        if self.args.multimodal_keys:
+        # Vision models have `vision_config` in the config
+        if hasattr(self.hf_config, "vision_config"):
             from transformers import AutoModelForVision2Seq
 
             return AutoModelForVision2Seq
diff --git a/slime/utils/data.py b/slime/utils/data.py
@@ -73,7 +73,7 @@ def _build_messages(data: dict, prompt_key: str, multimodal_keys: dict = None):
         for type_name, data_key in multimodal_keys.items():
             mt = MultimodalTypes.get(type_name)
             if mt:
-                multimodals[mt.placeholder] = (mt, data.get(data_key).tolist())
+                multimodals[mt.placeholder] = (mt, list(data.get(data_key)))
 
         pattern = "(" + "|".join(re.escape(p) for p in multimodals.keys()) + ")"
 
diff --git a/slime/utils/processing_utils.py b/slime/utils/processing_utils.py
@@ -72,5 +72,5 @@ def encode_image_for_rollout_engine(image) -> str:
     buffer = io.BytesIO()
     if image.mode != "RGB":
         image = image.convert("RGB")
-    image.save(buffer, format="JPEG")
+    image.save(buffer, format="PNG")
     return base64.b64encode(buffer.getvalue()).decode("utf-8")