[VLM] fix: fix non true-on-policy vlm regression (#1093)

nanjiangwill · web-flow · commit a84d93b09338 · 2025-12-12T08:27:51.000+08:00
diff --git a/docker/patch/latest/sglang.patch b/docker/patch/latest/sglang.patch
@@ -215,7 +215,7 @@ index 932f52aeb..79c6b664f 100644
  
          hidden_states = self._communicate_simple_fn(
 diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py
-index 3293a8a59..02999afd0 100644
+index 3293a8a59..a075b71ce 100644
 --- a/python/sglang/srt/layers/layernorm.py
 +++ b/python/sglang/srt/layers/layernorm.py
 @@ -84,15 +84,12 @@ class RMSNorm(CustomOp):
@@ -236,7 +236,7 @@ index 3293a8a59..02999afd0 100644
          self.variance_epsilon = eps
          self.hidden_size = hidden_size
          self.variance_size_override = (
-@@ -105,15 +102,16 @@ class RMSNorm(CustomOp):
+@@ -105,21 +102,26 @@ class RMSNorm(CustomOp):
          self,
          x: torch.Tensor,
          residual: Optional[torch.Tensor] = None,
@@ -255,7 +255,17 @@ index 3293a8a59..02999afd0 100644
              return rms_norm_batch_invariant(
                  x,
                  self.weight.data,
-@@ -179,17 +177,35 @@ class RMSNorm(CustomOp):
+                 self.variance_epsilon,
+             )
+         if residual is not None:
++            # TODO: Ideally we want (a+b)+c, but right now we can only compute a+(b+c).
++            # Since (a+b)+c != a+(b+c), we probably need to add another parameter to fused_add_rmsnorm.
++            if post_residual_addition is not None:
++                residual = residual + post_residual_addition
+             fused_add_rmsnorm(x, residual, self.weight.data, self.variance_epsilon)
+             return x, residual
+         out = rmsnorm(x, self.weight.data, self.variance_epsilon)
+@@ -179,17 +181,35 @@ class RMSNorm(CustomOp):
          self,
          x: torch.Tensor,
          residual: Optional[torch.Tensor] = None,
diff --git a/examples/geo3k_vlm/run_geo3k_vlm.py b/examples/geo3k_vlm/run_geo3k_vlm.py
@@ -8,7 +8,6 @@
 
 NUM_GPUS = int(os.environ.get("SLIME_SCRIPT_NUM_GPUS", "1"))
 EXTERNAL_RAY = int(os.environ.get("SLIME_SCRIPT_EXTERNAL_RAY", "0"))
-MASTER_ADDR = os.environ.get("MASTER_ADDR", "127.0.0.1")
 
 
 def prepare():
@@ -40,7 +39,7 @@ def execute():
     )
 
     eval_args = (
-        # "--eval-interval 20 "
+        "--eval-interval 20 "
         "--eval-prompt-data geo3k /root/datasets/geo3k_imgurl/test.parquet "
         "--n-samples-per-eval-prompt 1 "
         "--eval-max-response-len 4096 "
@@ -119,27 +118,6 @@ def execute():
         # f"{true_on_policy_args} "
     )
 
-    # Kill existing processes
-    U.exec_command(
-        "pkill -9 sglang; "
-        "sleep 3; "
-        f"{'' if EXTERNAL_RAY else 'ray stop --force; '}"
-        f"{'' if EXTERNAL_RAY else 'pkill -9 ray; '}"
-        "pkill -9 slime; "
-        "sleep 3; "
-        f"{'' if EXTERNAL_RAY else 'pkill -9 ray; '}"
-        "pkill -9 slime; "
-        "pkill -9 redis; "
-        "true; "
-    )
-
-    if not EXTERNAL_RAY:
-        # Start Ray
-        U.exec_command(
-            f"export PYTHONBUFFERED=16 && "
-            f"ray start --head --node-ip-address {MASTER_ADDR} --num-gpus {NUM_GPUS} "
-            f"--disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265"
-        )
     # Submit Ray job
     execute_train(
         train_args=train_args,
diff --git a/examples/true_on_policy_vlm/README.md b/examples/true_on_policy_vlm/README.md
@@ -5,6 +5,7 @@ This example demonstrates true on-policy training with Qwen3-VL dense model on F
 <p align="center">
   <img src="diff.png" alt="Training Inference Log Prob Diff" width="800">
 </p>
+
 ## Usage
 
 ```bash
diff --git a/examples/true_on_policy_vlm/run_simple.py b/examples/true_on_policy_vlm/run_simple.py
@@ -8,7 +8,6 @@
 
 NUM_GPUS = int(os.environ.get("SLIME_SCRIPT_NUM_GPUS", "1"))
 EXTERNAL_RAY = int(os.environ.get("SLIME_SCRIPT_EXTERNAL_RAY", "0"))
-MASTER_ADDR = os.environ.get("MASTER_ADDR", "127.0.0.1")
 
 
 def prepare():
@@ -39,7 +38,7 @@ def execute():
     )
 
     eval_args = (
-        # "--eval-interval 20 "
+        "--eval-interval 20 "
         "--eval-prompt-data geo3k /root/datasets/geo3k_imgurl/test.parquet "
         "--n-samples-per-eval-prompt 1 "
         "--eval-max-response-len 4096 "
@@ -127,28 +126,6 @@ def execute():
         f"{true_on_policy_args} "
     )
 
-    # Kill existing processes
-    U.exec_command(
-        "pkill -9 sglang; "
-        "sleep 3; "
-        f"{'' if EXTERNAL_RAY else 'ray stop --force; '}"
-        f"{'' if EXTERNAL_RAY else 'pkill -9 ray; '}"
-        "pkill -9 slime; "
-        "sleep 3; "
-        f"{'' if EXTERNAL_RAY else 'pkill -9 ray; '}"
-        "pkill -9 slime; "
-        "pkill -9 redis; "
-        "true; "
-    )
-
-    if not EXTERNAL_RAY:
-        # Start Ray
-        U.exec_command(
-            f"export PYTHONBUFFERED=16 && "
-            f"ray start --head --node-ip-address {MASTER_ADDR} --num-gpus {NUM_GPUS} "
-            f"--disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265"
-        )
-
     # Submit Ray job
     execute_train(
         train_args=train_args,
diff --git a/slime/backends/fsdp_utils/actor.py b/slime/backends/fsdp_utils/actor.py
@@ -80,7 +80,8 @@ def init(self, args: Namespace, role: str, with_ref: bool = False) -> int:  # ty
             if i == dist.get_rank():
                 self.hf_config = AutoConfig.from_pretrained(self.args.hf_checkpoint, trust_remote_code=True)
                 self.tokenizer = load_tokenizer(self.args.hf_checkpoint, trust_remote_code=True)
-                if self.args.multimodal_keys:
+                # Vision models have `vision_config` in the config
+                if hasattr(self.hf_config, "vision_config"):
                     self.processor = load_processor(self.args.hf_checkpoint, trust_remote_code=True)
             dist.barrier(group=get_gloo_group())