maybe

hlky · hlky · commit 094e2162463b · 2024-10-16T00:52:06.000+01:00
diff --git a/src/diffusers/pipelines/flux/pipeline_flux_rfinversion_edit.py b/src/diffusers/pipelines/flux/pipeline_flux_rfinversion_edit.py
@@ -681,7 +681,27 @@ def __call__(
             generator,
             latents,
         )
+
+        # 5. Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
         image_seq_len = latents.shape[1]
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.base_image_seq_len,
+            self.scheduler.config.max_image_seq_len,
+            self.scheduler.config.base_shift,
+            self.scheduler.config.max_shift,
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            mu=mu,
+        )
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
 
         # handle guidance
         if self.transformer.config.guidance_embeds:
@@ -690,35 +710,18 @@ def __call__(
         else:
             guidance = None
 
-        import math
-        def time_shift(mu: float, sigma: float, t: torch.Tensor):
-            return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
-
-
-        def get_lin_function(
-            x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
-        ) -> Callable[[float], float]:
-            m = (y2 - y1) / (x2 - x1)
-            b = y1 - m * x1
-            return lambda x: m * x + b
-    
-        mu = get_lin_function()(image_seq_len)
-        timesteps = torch.linspace(0, 1, num_inference_steps+1)
-        timesteps = time_shift(mu, 1.0, timesteps).to(latents.device, latents.dtype)
         # 6. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i in range(num_inference_steps):
+            for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
-                t = torch.tensor([timesteps[i]], device=latents.device, dtype=latents.dtype)
-                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)
                 timestep = 1-timestep
 
                 control_guidance = controller_guidance if i < stop_step else 0.0
                 unconditional_vector_field = -self.transformer(
                     hidden_states=latents,
-                    timestep=timestep,
+                    timestep=timestep / 1000,
                     guidance=guidance,
                     pooled_projections=pooled_prompt_embeds,
                     encoder_hidden_states=prompt_embeds,
@@ -731,10 +734,7 @@ def get_lin_function(
                 conditional_vector_field = (reference_image - latents) / timestep
                 controlled_vector_field = unconditional_vector_field + control_guidance * (conditional_vector_field - unconditional_vector_field)
 
-                sigma = timesteps[i]
-                sigma_next = timesteps[i+1]
-                latents = latents + (sigma_next - sigma) * controlled_vector_field
-
+                latents = self.scheduler.step(controlled_vector_field, t, latents, return_dict=False)[0]
 
         if output_type == "latent":
             image = latents
diff --git a/src/diffusers/pipelines/flux/pipeline_flux_rfinversion_noise.py b/src/diffusers/pipelines/flux/pipeline_flux_rfinversion_noise.py
@@ -550,8 +550,7 @@ def prepare_latents(
 
         noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         import numpy as np
-        sigma = timestep[0]
-        latents = sigma * noise + (1.0 - sigma) * image_latents
+        latents = self.scheduler.scale_noise(image_latents, timestep, noise)
         latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
         np.save("reference_image_latent.npy", latents.detach().cpu().float().numpy())
         return latents, latent_image_ids
@@ -761,23 +760,26 @@ def __call__(
             max_sequence_length=max_sequence_length,
             lora_scale=lora_scale,
         )
-        import math
-        def time_shift(mu: float, sigma: float, t: torch.Tensor):
-            return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
 
-
-        image_seq_len = (int(height) // self.vae_scale_factor) * (int(width) // self.vae_scale_factor)
-        def get_lin_function(
-            x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
-        ) -> Callable[[float], float]:
-            m = (y2 - y1) / (x2 - x1)
-            b = y1 - m * x1
-            return lambda x: m * x + b
-    
-        mu = get_lin_function()(image_seq_len)
-        timesteps = torch.linspace(0, 1, num_inference_steps+1)
-        timesteps = time_shift(mu, 1.0, timesteps).to("cuda", torch.bfloat16)
         # 4.Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        image_seq_len = (int(height) // self.vae_scale_factor) * (int(width) // self.vae_scale_factor)
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.base_image_seq_len,
+            self.scheduler.config.max_image_seq_len,
+            self.scheduler.config.base_shift,
+            self.scheduler.config.max_shift,
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            mu=mu,
+        )
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
 
         if num_inference_steps < 1:
             raise ValueError(
@@ -788,9 +790,10 @@ def get_lin_function(
 
         # 5. Prepare latent variables
         num_channels_latents = self.transformer.config.in_channels // 4
+
         latents, latent_image_ids = self.prepare_latents(
             init_image,
-            timesteps,
+            latent_timestep,
             batch_size * num_images_per_prompt,
             num_channels_latents,
             height,
@@ -815,15 +818,13 @@ def get_lin_function(
         y1 = randn_tensor(latents.shape, generator=generator, device=device, dtype=latents.dtype)
         # 6. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i in range(num_inference_steps - stop_step):
+            for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
-                t = torch.tensor([timesteps[i]], device=latents.device, dtype=latents.dtype)
-                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)
                 noise_pred = self.transformer(
                     hidden_states=latents,
-                    timestep=timestep,
+                    timestep=timestep / 1000,
                     guidance=guidance,
                     pooled_projections=pooled_prompt_embeds,
                     encoder_hidden_states=prompt_embeds,
@@ -837,10 +838,7 @@ def get_lin_function(
                 conditional_vector_field = (y1-latents)/(1-timestep)
                 controlled_vector_field = unconditional_vector_field + controller_guidance * (conditional_vector_field - unconditional_vector_field)
 
-                # Get the corresponding sigma values
-                sigma = timesteps[i]
-                sigma_next = timesteps[i+1]
-                latents = latents + (sigma_next - sigma) * controlled_vector_field
+                latents = self.scheduler.step(controlled_vector_field, t, latents, return_dict=False)[0]
 
                 if XLA_AVAILABLE:
                     xm.mark_step()