
Commit cc849e2

Notes, sana schedule, scale_noise->add_noise

1 parent f12841c commit cc849e2

6 files changed: +68 −35 lines changed

src/diffusers/pipelines/flux/pipeline_flux_img2img.py

Lines changed: 3 additions & 1 deletion
@@ -562,7 +562,9 @@ def prepare_latents(
         image_latents = torch.cat([image_latents], dim=0)
 
         noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        latents = self.scheduler.scale_noise(image_latents, timestep, noise)
+        # NOTE: `scale_noise` changed to `add_noise`
+        # the signature is `noise`, `timestep` instead of `timestep`, `noise`
+        latents = self.scheduler.add_noise(image_latents, noise, timestep)
         latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
         return latents, latent_image_ids
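As the NOTE in the hunk says, the rename also swaps the trailing argument order. A minimal sketch of the change at a call site; `_DummyScheduler` is a stand-in for illustration only, not the diffusers API:

import torch

class _DummyScheduler:
    # Stand-in for illustration; a real scheduler comes from diffusers.
    def add_noise(self, sample, noise, timestep):
        sigma = timestep / 1000.0  # toy sigma lookup, not a real schedule
        return (1.0 - sigma) * sample + sigma * noise

scheduler = _DummyScheduler()
sample = torch.randn(1, 4, 8, 8)
noise = torch.randn_like(sample)
timestep = torch.tensor(500.0)

# Old call: scheduler.scale_noise(sample, timestep, noise)
# New call: `noise` now comes before `timestep`.
latents = scheduler.add_noise(sample, noise, timestep)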

src/diffusers/schedulers/schedules/beta_schedule.py

Lines changed: 11 additions & 12 deletions
@@ -1,14 +1,14 @@
+import math
 from typing import List, Optional, Union
 
-import math
 import numpy as np
 import torch
 
-from ...configuration_utils import ConfigMixin, register_to_config
 from ..sigmas.beta_sigmas import BetaSigmas
 from ..sigmas.exponential_sigmas import ExponentialSigmas
 from ..sigmas.karras_sigmas import KarrasSigmas
 
+
 def betas_for_alpha_bar(
     num_diffusion_timesteps,
     max_beta=0.999,
@@ -52,6 +52,7 @@ def alpha_bar_fn(t):
         betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
     return torch.tensor(betas, dtype=torch.float32)
 
+
 def rescale_zero_terminal_snr(betas):
     """
     Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
@@ -89,7 +90,6 @@ def rescale_zero_terminal_snr(betas):
 
 
 class BetaSchedule:
-
     scale_model_input = True
 
     def __init__(
@@ -132,7 +132,7 @@ def __init__(
         # Close to 0 without being 0 so first sigma is not inf
         # FP16 smallest positive subnormal works well here
         self.alphas_cumprod[-1] = 2**-24
-
+
         self.num_train_timesteps = num_train_timesteps
         self.beta_start = beta_start
         self.beta_end = beta_end
@@ -181,6 +181,7 @@ def __call__(
     ):
         if sigmas is not None:
             log_sigmas = np.log(np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5))
+            # NOTE: current usage is **with** `sigma_last` - different than FlowMatch.
             sigmas = np.array(sigmas).astype(np.float32)
             timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas[:-1]])
 
@@ -190,9 +191,9 @@ def __call__(
         else:
             # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
             if self.timestep_spacing == "linspace":
-                timesteps = np.linspace(
-                    0, self.num_train_timesteps - 1, num_inference_steps, dtype=np.float32
-                )[::-1].copy()
+                timesteps = np.linspace(0, self.num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[
+                    ::-1
+                ].copy()
             elif self.timestep_spacing == "leading":
                 step_ratio = self.num_train_timesteps // num_inference_steps
                 # creates integer timesteps by multiplying by ratio
@@ -205,9 +206,7 @@ def __call__(
                 step_ratio = self.num_train_timesteps / num_inference_steps
                 # creates integer timesteps by multiplying by ratio
                 # casting to int to avoid issues when num_inference_step is power of 3
-                timesteps = (
-                    (np.arange(self.num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32)
-                )
+                timesteps = (np.arange(self.num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32)
                 timesteps -= 1
             else:
                 raise ValueError(
@@ -240,13 +239,13 @@ def __call__(
                )
 
         sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
-
+
         sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
 
         # TODO: Support the full EDM scalings for all prediction types and timestep types
         if self.timestep_type == "continuous" and self.prediction_type == "v_prediction":
            timesteps = torch.Tensor([0.25 * sigma.log() for sigma in sigmas[:-1]]).to(device=device)
         else:
            timesteps = torch.from_numpy(timesteps.astype(np.float32)).to(device=device)
-
+
         return sigmas, timesteps
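For reference, the three `timestep_spacing` branches reformatted above correspond to Table 2 of https://arxiv.org/abs/2305.08891 and produce different grids. A standalone numpy sketch with `num_train_timesteps=1000` and `num_inference_steps=10`; the `leading` computation extends past the hunk shown, so that line follows the long-standing diffusers formula with `steps_offset` omitted:

import numpy as np

num_train_timesteps, num_inference_steps = 1000, 10

# "linspace": evenly spaced over [0, T-1], then reversed.
linspace = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[::-1].copy()

# "leading": integer multiples of T // N, reversed (steps_offset omitted here).
step_ratio = num_train_timesteps // num_inference_steps
leading = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32)

# "trailing": counts down from T in steps of T / N, then shifts by -1.
step_ratio = num_train_timesteps / num_inference_steps
trailing = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32)
trailing -= 1

print(linspace)  # [999. 888. ... 0.]
print(leading)   # [900. 800. ... 0.]
print(trailing)  # [999. 899. ... 99.]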

src/diffusers/schedulers/schedules/flow_schedule.py

Lines changed: 42 additions & 19 deletions
@@ -1,21 +1,32 @@
+import math
 from typing import List, Optional, Union
 
-import math
 import numpy as np
 import torch
 
-from ...configuration_utils import ConfigMixin, register_to_config
 from ..sigmas.beta_sigmas import BetaSigmas
 from ..sigmas.exponential_sigmas import ExponentialSigmas
 from ..sigmas.karras_sigmas import KarrasSigmas
 
-class FlowMatchSD3:
-
-    def _sigma_to_t(self, sigma):
-        return sigma * self.num_train_timesteps
 
-    def __call__(self, num_inference_steps: int, num_train_timesteps: int, shift: float, use_dynamic_shifting: bool = False, **kwargs):
-        self.num_train_timesteps = num_train_timesteps
+class FlowMatchSD3:
+    def __call__(
+        self,
+        num_inference_steps: int,
+        num_train_timesteps: int,
+        shift: float,
+        use_dynamic_shifting: bool = False,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        This is different to others that directly calculate `sigmas`.
+        It needs `sigma_min` and `sigma_max` after shift
+        https://github.com/huggingface/diffusers/blob/0ed09a17bbab784a78fb163b557b4827467b0468/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py#L89-L95
+        Then we calculate `sigmas` from that `sigma_min` and `sigma_max`.
+        https://github.com/huggingface/diffusers/blob/0ed09a17bbab784a78fb163b557b4827467b0468/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py#L238-L240
+        Shifting happens again after (outside of this).
+        https://github.com/huggingface/diffusers/blob/0ed09a17bbab784a78fb163b557b4827467b0468/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py#L248-L251
+        """
         timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy()
         timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)
 
@@ -25,18 +36,20 @@ def __call__(self, num_inference_steps: int, num_train_timesteps: int, shift: fl
         sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
         sigma_min = sigmas[-1].item()
         sigma_max = sigmas[0].item()
-        timesteps = np.linspace(
-            self._sigma_to_t(sigma_max), self._sigma_to_t(sigma_min), num_inference_steps
-        )
+        timesteps = np.linspace(sigma_max * num_train_timesteps, sigma_min * num_train_timesteps, num_inference_steps)
         sigmas = timesteps / num_train_timesteps
         return sigmas
 
+
 class FlowMatchFlux:
-    def __call__(self, num_inference_steps: int, **kwargs):
+    def __call__(self, num_inference_steps: int, **kwargs) -> np.ndarray:
         return np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
 
+
 class FlowMatchLinearQuadratic:
-    def __call__(self, num_inference_steps: int, threshold_noise: float = 0.25, linear_steps: Optional[int] = None, **kwargs):
+    def __call__(
+        self, num_inference_steps: int, threshold_noise: float = 0.25, linear_steps: Optional[int] = None, **kwargs
+    ) -> np.ndarray:
         if linear_steps is None:
             linear_steps = num_inference_steps // 2
         linear_sigma_schedule = [i * threshold_noise / linear_steps for i in range(linear_steps)]
@@ -49,22 +62,33 @@ def __call__(self, num_inference_steps: int, threshold_noise: float = 0.25, line
             quadratic_coef * (i**2) + linear_coef * i + const for i in range(linear_steps, num_inference_steps)
         ]
         sigma_schedule = linear_sigma_schedule + quadratic_sigma_schedule
-        sigma_schedule = [1.0 - x for x in sigma_schedule]
+        sigma_schedule = np.array([1.0 - x for x in sigma_schedule]).astype(np.float32)
         return sigma_schedule
 
+
 class FlowMatchHunyuanVideo:
-    def __call__(self, num_inference_steps: int, **kwargs):
+    def __call__(self, num_inference_steps: int, **kwargs) -> np.ndarray:
         return np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1].copy()
 
+
+class FlowMatchSANA:
+    def __call__(self, num_inference_steps: int, num_train_timesteps: int, shift: float, **kwargs) -> np.ndarray:
+        alphas = np.linspace(1, 1 / num_train_timesteps, num_inference_steps + 1)
+        sigmas = 1.0 - alphas
+        sigmas = np.flip(shift * sigmas / (1 + (shift - 1) * sigmas))[:-1].copy()
+        return sigmas
+
+
 BASE_SCHEDULE_MAP = {
     "FlowMatchHunyuanVideo": FlowMatchHunyuanVideo,
     "FlowMatchLinearQuadratic": FlowMatchLinearQuadratic,
     "FlowMatchFlux": FlowMatchFlux,
     "FlowMatchSD3": FlowMatchSD3,
+    "FlowMatchSANA": FlowMatchSANA,
 }
 
-class FlowMatchSchedule:
 
+class FlowMatchSchedule:
     scale_model_input = False
 
     base_schedules = BASE_SCHEDULE_MAP
@@ -145,7 +169,7 @@ def __call__(
     ):
         shift = shift or self.shift
         if self.use_dynamic_shifting and mu is None:
-            raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`")
+            raise ValueError("You have to pass a value for `mu` when `use_dynamic_shifting` is set to be `True`")
 
         if sigmas is None:
             sigmas = self.base_schedule(
@@ -155,9 +179,8 @@ def __call__(
                 use_dynamic_shifting=self.use_dynamic_shifting,
             )
         else:
+            # NOTE: current usage is **without** `sigma_last` - different than BetaSchedule
             sigmas = np.array(sigmas).astype(np.float32)
-            num_inference_steps = len(sigmas)
-            self.num_inference_steps = num_inference_steps
 
         if self.use_dynamic_shifting:
             sigmas = self.time_shift(mu, 1.0, sigmas)
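The new `FlowMatchSANA` schedule is small enough to sanity-check in isolation. A minimal sketch mirroring the class body above, with assumed defaults `num_train_timesteps=1000` and `shift=3.0`:

import numpy as np

def sana_sigmas(num_inference_steps, num_train_timesteps=1000, shift=3.0):
    # Mirrors FlowMatchSANA.__call__ from the diff above.
    alphas = np.linspace(1, 1 / num_train_timesteps, num_inference_steps + 1)
    sigmas = 1.0 - alphas
    # Shift, reverse to descending order, and drop the terminal 0.0 entry.
    return np.flip(shift * sigmas / (1 + (shift - 1) * sigmas))[:-1].copy()

print(sana_sigmas(4))  # approx. [0.9997, 0.8996, 0.7496, 0.4997]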

src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py

Lines changed: 4 additions & 1 deletion
@@ -384,7 +384,10 @@ def add_noise(
         while len(sigma.shape) < len(original_samples.shape):
             sigma = sigma.unsqueeze(-1)
 
-        noisy_samples = original_samples + noise * sigma
+        if self._schedule.__class__.__name__ == "FlowMatchSchedule":
+            noisy_samples = (1.0 - sigma) * original_samples + noise * sigma
+        else:
+            noisy_samples = original_samples + noise * sigma
         return noisy_samples
 
     def __len__(self):

src/diffusers/schedulers/scheduling_euler_discrete.py

Lines changed: 4 additions & 1 deletion
@@ -413,7 +413,10 @@ def add_noise(
         while len(sigma.shape) < len(original_samples.shape):
             sigma = sigma.unsqueeze(-1)
 
-        noisy_samples = original_samples + noise * sigma
+        if self._schedule.__class__.__name__ == "FlowMatchSchedule":
+            noisy_samples = (1.0 - sigma) * original_samples + noise * sigma
+        else:
+            noisy_samples = original_samples + noise * sigma
         return noisy_samples
 
     def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:

src/diffusers/schedulers/scheduling_heun_discrete.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,10 @@ def add_noise(
401401
while len(sigma.shape) < len(original_samples.shape):
402402
sigma = sigma.unsqueeze(-1)
403403

404-
noisy_samples = original_samples + noise * sigma
404+
if self._schedule.__class__.__name__ == "FlowMatchSchedule":
405+
noisy_samples = (1.0 - sigma) * original_samples + noise * sigma
406+
else:
407+
noisy_samples = original_samples + noise * sigma
405408
return noisy_samples
406409

407410
def __len__(self):
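The same three-line branch lands in the Euler ancestral, Euler, and Heun schedulers: with a `FlowMatchSchedule` attached, `add_noise` interpolates between sample and noise instead of adding scaled noise on top. A standalone sketch of the two conventions, where `flow_match` stands in for the `self._schedule.__class__.__name__` check:

import torch

def add_noise(original_samples, noise, sigma, flow_match=False):
    if flow_match:
        # Flow-match convention: linear interpolation between data and noise.
        return (1.0 - sigma) * original_samples + noise * sigma
    # Pre-existing convention: scaled noise added on top of the sample.
    return original_samples + noise * sigma

x = torch.full((2, 3), 2.0)
eps = torch.ones(2, 3)
sigma = torch.tensor(1.0)
print(add_noise(x, eps, sigma, flow_match=True))   # all 1.0: pure noise at sigma = 1
print(add_noise(x, eps, sigma, flow_match=False))  # all 3.0: x + eps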
