 import inspect
 from typing import Callable, Dict, List, Optional, Tuple, Union

+import math
 import torch
 from transformers import GlmModel
 """


-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def calculate_shift(
+    image_seq_len, base_seq_len: int = 256, max_seq_len: int = 4096, base_shift: float = 0.5, max_shift: float = 1.15
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+def time_shift(mu: float, shift_sigma: float, sigmas: torch.Tensor):
+    return math.exp(mu) / (math.exp(mu) + (1 / sigmas - 1) ** shift_sigma)
+
+
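`calculate_shift` linearly interpolates a shift value `mu` from the image token count, and `time_shift` warps a sigma schedule through a logistic-style map so the trajectory lingers longer at high noise levels. A minimal sketch of the warping behavior, assuming the two helpers above are in scope (the values are illustrative only):

```python
import torch

# a plain linear sigma grid in (0, 1]; the pipeline derives its grid from the timesteps
sigmas = torch.linspace(1.0, 0.05, 10)

mu = 1.15  # the value calculate_shift returns at max_seq_len = 4096
shifted = time_shift(mu, 1.0, sigmas)

# sigma = 1.0 maps to 1.0; small sigmas are pulled upward, e.g. 0.05 -> ~0.14
print(shifted)
```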
 def retrieve_timesteps(
     scheduler,
     num_inference_steps: Optional[int] = None,
@@ -203,7 +216,7 @@ def _get_glm_embeds(
         )
         text_input_ids = torch.cat([pad_ids, text_input_ids], dim=1)

-        prompt_embeds = self.text_encoder.model.embed_tokens(text_input_ids)[0]
+        prompt_embeds = self.text_encoder.model.embed_tokens(text_input_ids.to(self.text_encoder.model.device))[0]
         prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
         seq_len, _ = prompt_embeds.shape
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
@@ -573,6 +586,16 @@ def __call__(
 
         # 4. Prepare timesteps
         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+        image_seq_len = ((height // self.vae_scale_factor) * (width // self.vae_scale_factor)) // (
+            self.transformer.config.patch_size**2
+        )
+        mu = calculate_shift(image_seq_len)
+        sigmas = timesteps / self.scheduler.config.num_train_timesteps
+        sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])  # append a terminal sigma of zero
+
+        self.sigmas = time_shift(mu, 1.0, sigmas)  # resolution-shifted sigma schedule used by the denoising loop
+
         self._num_timesteps = len(timesteps)
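For concreteness, a worked trace of this preparation step; the config values below are assumptions for illustration (an 8x VAE downsample and patch size 2), not values confirmed by this diff:

```python
# assumed, hypothetical config values for illustration
vae_scale_factor, patch_size = 8, 2
height = width = 1024

image_seq_len = ((height // vae_scale_factor) * (width // vae_scale_factor)) // patch_size**2
# -> (128 * 128) // 4 = 4096 image tokens

mu = calculate_shift(image_seq_len)
# -> 1.15, since 4096 is exactly the default max_seq_len

# timesteps / num_train_timesteps then yields sigmas in (0, 1], a terminal 0.0 is
# appended, and time_shift(mu, 1.0, sigmas) produces the schedule stored in self.sigmas
```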
 
         # 5. Prepare latents.
@@ -611,17 +634,81 @@ def __call__(
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             # for DPM-solver++
             old_pred_original_sample = None
+            # sigmas were computed above in step 4, alongside the timesteps
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
 
+                # sigma for the current step and for the next step (clamped to avoid indexing past the end)
+                sigma = sigmas[i]
+                sigma_next = sigmas[i + 1] if i + 1 < len(sigmas) else sigma
+
-                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                # scale the latents by the current sigma before forming the model input
+                latent_model_input = latents * sigma
+                latent_model_input = (
+                    torch.cat([latent_model_input] * 2) if self.do_classifier_free_guidance else latent_model_input
+                )
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latent_model_input.shape[0])
 
                 # predict noise model_output
                 noise_pred = self.transformer(
                     hidden_states=latent_model_input,
                     encoder_hidden_states=prompt_embeds,
@@ -633,25 +720,18 @@ def __call__(
                 )[0]
                 noise_pred = noise_pred.float()
 
                 # perform guidance
                 if self.do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
 
-                # compute the previous noisy sample x_t -> x_t-1
-                if not isinstance(self.scheduler, CogView4DDIMScheduler):
-                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
-                else:
-                    latents, old_pred_original_sample = self.scheduler.step(
-                        model_output=noise_pred,
-                        timestep=t,
-                        sample=latents,
-                        **extra_step_kwargs,
-                        return_dict=False,
-                    )
+                # Euler step over the shifted sigma schedule: x <- x + (sigma_next - sigma) * v
+                latents = latents + (sigma_next - sigma) * noise_pred
+
+                # cast back so the next iteration matches the model dtype
                 latents = latents.to(prompt_embeds.dtype)
 
                 # call the callback, if provided
                 if callback_on_step_end is not None:
                     callback_kwargs = {}
                     for k in callback_on_step_end_tensor_inputs:
@@ -667,7 +747,6 @@ def __call__(
 
                 if XLA_AVAILABLE:
                     xm.mark_step()
-
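The replacement update inside the loop is a first-order Euler step of the flow ODE: the sample moves along the predicted velocity by the increment between consecutive sigmas. A self-contained toy sketch of the update rule (the random tensor stands in for the transformer's `noise_pred`, and the shapes are illustrative, not the pipeline's real ones):

```python
import torch

torch.manual_seed(0)
latents = torch.randn(1, 4, 16, 16)  # toy latent
sigmas = torch.tensor([1.0, 0.75, 0.5, 0.25, 0.0])

for i in range(len(sigmas) - 1):
    noise_pred = torch.randn_like(latents)  # stand-in for the transformer output
    # Euler step: x <- x + (sigma_next - sigma) * v
    latents = latents + (sigmas[i + 1] - sigmas[i]) * noise_pred

# after the loop, latents has been integrated from sigma = 1 (noise) toward sigma = 0 (data)
```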
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
                 0