[diffusion] fix: precompute rollout step index and correct cps log_prob (#18806)

MikukuOvO · MikukuOvO · commit 4abb27b24c13 · 2026-03-02T04:42:20.000Z
diff --git a/python/sglang/multimodal_gen/runtime/pipelines/patches/flow_matching_with_logprob.py b/python/sglang/multimodal_gen/runtime/pipelines/patches/flow_matching_with_logprob.py
@@ -8,31 +8,11 @@
 from diffusers.utils.torch_utils import randn_tensor
 
 
-def _as_timestep_tensor(
-    timestep: Union[float, torch.Tensor], batch_size: int, device: torch.device
-) -> torch.Tensor:
-    """Normalize timestep input to a 1D tensor on the target device."""
-    if torch.is_tensor(timestep):
-        ts = timestep.to(device=device)
-    else:
-        ts = torch.tensor([timestep], device=device)
-
-    if ts.ndim == 0:
-        ts = ts.view(1)
-    else:
-        ts = ts.view(-1)
-
-    # Broadcast scalar timestep to match batch size.
-    if ts.numel() == 1 and batch_size > 1:
-        ts = ts.repeat(batch_size)
-    return ts
-
-
 def sde_step_with_logprob(
     self: Any,
     model_output: torch.FloatTensor,
-    timestep: Union[float, torch.FloatTensor],
     sample: torch.FloatTensor,
+    step_index: int,
     noise_level: float = 0.7,
     prev_sample: Optional[torch.FloatTensor] = None,
     generator: Optional[Union[torch.Generator, list[torch.Generator]]] = None,
@@ -49,10 +29,9 @@ def sde_step_with_logprob(
     if prev_sample is not None:
         prev_sample = prev_sample.float()
 
-    batch_size = sample.shape[0]
-    timestep_tensor = _as_timestep_tensor(timestep, batch_size, sample.device)
-    step_indices = torch.tensor(
-        [self.index_for_timestep(t.to(self.timesteps.device)) for t in timestep_tensor],
+    step_indices = torch.full(
+        (sample.shape[0],),
+        int(step_index),
         device=self.sigmas.device,
         dtype=torch.long,
     )
@@ -112,8 +91,13 @@ def sde_step_with_logprob(
             )
             prev_sample = prev_sample_mean + std_dev_t * variance_noise
 
-        # Keep the same simplified cps objective used in the original patch.
-        log_prob = -((prev_sample.detach() - prev_sample_mean) ** 2)
+        # CPS transition is Gaussian (std = std_dev_t): use the full Gaussian log-density.
+        std = std_dev_t.clamp_min(1e-12)
+        log_prob = (
+            -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * (std**2))
+            - torch.log(std)
+            - torch.log(torch.sqrt(torch.as_tensor(2 * math.pi, device=std.device)))
+        )
     else:
         raise ValueError(f"Unsupported sde_type: {sde_type}")
 
diff --git a/python/sglang/multimodal_gen/runtime/pipelines_core/stages/denoising.py b/python/sglang/multimodal_gen/runtime/pipelines_core/stages/denoising.py
@@ -1021,6 +1021,13 @@ def forward(
         is_warmup = batch.is_warmup
         self.scheduler.set_begin_index(0)
         timesteps_cpu = timesteps.cpu()
+        rollout_step_indices: list[int] = []
+        if rollout_enabled:
+            scheduler_timesteps = self.scheduler.timesteps
+            rollout_step_indices = [
+                self.scheduler.index_for_timestep(t.to(scheduler_timesteps.device))
+                for t in timesteps_cpu
+            ]
         num_timesteps = timesteps_cpu.shape[0]
         with torch.autocast(
             device_type=current_platform.device_type,
@@ -1104,8 +1111,8 @@ def forward(
                             latents, step_log_prob = sde_step_with_logprob(
                                 self.scheduler,
                                 model_output=noise_pred,
-                                timestep=t_device,
                                 sample=latents,
+                                step_index=rollout_step_indices[i],
                                 generator=batch.generator,
                                 sde_type=rollout_sde_type,
                                 noise_level=rollout_noise_level,