@@ -242,6 +242,7 @@ def encode_prompt(
242242 self ,
243243 prompt : Union [str , List [str ]],
244244 negative_prompt : Optional [Union [str , List [str ]]] = None ,
245+ do_classifier_free_guidance : bool = False ,
245246 num_videos_per_prompt : int = 1 ,
246247 prompt_embeds : Optional [torch .Tensor ] = None ,
247248 negative_prompt_embeds : Optional [torch .Tensor ] = None ,
@@ -259,6 +260,8 @@ def encode_prompt(
259260 The prompt or prompts not to guide the image generation. If not defined, one has to pass
260261 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
261262 less than `1`).
263+ do_classifier_free_guidance (`bool`, *optional*, defaults to `False`):
264+ Whether to use classifier free guidance or not.
262265 num_videos_per_prompt (`int`, *optional*, defaults to 1):
263266 Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
264267 prompt_embeds (`torch.Tensor`, *optional*):
@@ -290,7 +293,7 @@ def encode_prompt(
290293 dtype = dtype ,
291294 )
292295
293- if negative_prompt_embeds is None :
296+ if do_classifier_free_guidance and negative_prompt_embeds is None :
294297 negative_prompt = negative_prompt or ""
295298 negative_prompt = batch_size * [negative_prompt ] if isinstance (negative_prompt , str ) else negative_prompt
296299
@@ -439,6 +442,10 @@ def prepare_latents(
439442 def guidance_scale (self ):
440443 return self ._guidance_scale
441444
445+ @property
446+ def do_classifier_free_guidance (self ):
447+ return self ._guidance_scale > 1
448+
442449 @property
443450 def num_timesteps (self ):
444451 return self ._num_timesteps
@@ -468,15 +475,13 @@ def __call__(
468475 latents : Optional [torch .Tensor ] = None ,
469476 prompt_embeds : Optional [torch .Tensor ] = None ,
470477 negative_prompt_embeds : Optional [torch .Tensor ] = None ,
471- prompt_attention_mask : Optional [torch .Tensor ] = None ,
472478 output_type : Optional [str ] = "np" ,
473479 return_dict : bool = True ,
474480 callback_on_step_end : Optional [
475481 Union [Callable [[int , int , Dict ], None ], PipelineCallback , MultiPipelineCallbacks ]
476482 ] = None ,
477483 callback_on_step_end_tensor_inputs : List [str ] = ["latents" ],
478484 max_sequence_length : int = 512 ,
479- autocast_dtype : torch .dtype = torch .bfloat16 ,
480485 ):
481486 r"""
482487 The call function to the pipeline for generation.
@@ -571,20 +576,22 @@ def __call__(
571576 prompt_embeds , negative_prompt_embeds = self .encode_prompt (
572577 prompt = prompt ,
573578 negative_prompt = negative_prompt ,
579+ do_classifier_free_guidance = self .do_classifier_free_guidance ,
574580 num_videos_per_prompt = num_videos_per_prompt ,
575581 prompt_embeds = prompt_embeds ,
576582 negative_prompt_embeds = negative_prompt_embeds ,
577583 max_sequence_length = max_sequence_length ,
578584 device = device ,
579- dtype = autocast_dtype ,
580585 )
581- # encode image embedding
586+
587+ # Encode image embedding
582588 image_embeds = self .encode_image (image )
583589 image_embeds = image_embeds .repeat (batch_size , 1 , 1 )
584590
585- prompt_embeds = prompt_embeds .to (autocast_dtype )
586- negative_prompt_embeds = negative_prompt_embeds .to (autocast_dtype )
587- image_embeds = image_embeds .to (autocast_dtype )
591+ transformer_dtype = self .transformer .dtype
592+ prompt_embeds = prompt_embeds .to (transformer_dtype )
593+ if negative_prompt_embeds is not None :
594+ negative_prompt_embeds = negative_prompt_embeds .to (transformer_dtype )
595+ image_embeds = image_embeds .to (transformer_dtype )
588595
589596 # 4. Prepare timesteps
590597 self .scheduler .flow_shift = flow_shift
@@ -596,6 +603,7 @@ def __call__(
596603 height , width = image .shape [- 2 :]
597604 else :
598605 width , height = image .size
606+
599607 # 5. Prepare latent variables
600608 num_channels_latents = self .vae .config .z_dim
601609 num_latent_frames = (num_frames - 1 ) // self .vae_scale_factor_temporal + 1
@@ -618,37 +626,32 @@ def __call__(
618626 num_warmup_steps = len (timesteps ) - num_inference_steps * self .scheduler .order
619627 self ._num_timesteps = len (timesteps )
620628
621- with (
622- self .progress_bar (total = num_inference_steps ) as progress_bar ,
623- amp .autocast ('cuda' , dtype = autocast_dtype , cache_enabled = False )
624- ):
629+ with self .progress_bar (total = num_inference_steps ) as progress_bar :
625630 for i , t in enumerate (timesteps ):
626631 if self .interrupt :
627632 continue
628633
629634 self ._current_timestep = t
630- latent_model_input = latents
631- # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
632- timestep = t .expand (latents .shape [0 ])
635+ latent_model_input = torch .cat ([latents , condition ], dim = 1 ).to (transformer_dtype )
636+ timestep = t .expand (latents .shape [0 ]).to (transformer_dtype )
633637
634- noise_pred = self .transformer (
635- hidden_states = torch . concat ([ latent_model_input , condition ], dim = 1 ) ,
638+ noise_cond = self .transformer (
639+ hidden_states = latent_model_input ,
636640 timestep = timestep ,
637641 encoder_hidden_states = prompt_embeds ,
638642 encoder_hidden_states_image = image_embeds ,
639643 return_dict = False ,
640644 )[0 ]
641645
642- noise_pred_negative = self .transformer (
643- hidden_states = torch .concat ([latent_model_input , condition ], dim = 1 ),
644- timestep = timestep ,
645- encoder_hidden_states = negative_prompt_embeds ,
646- encoder_hidden_states_image = image_embeds ,
647- return_dict = False ,
648- )[0 ]
649-
650- noise_pred = noise_pred_negative + guidance_scale * (
651- noise_pred - noise_pred_negative )
646+ if self .do_classifier_free_guidance :
647+ noise_uncond = self .transformer (
648+ hidden_states = latent_model_input ,
649+ timestep = timestep ,
650+ encoder_hidden_states = negative_prompt_embeds ,
651+ encoder_hidden_states_image = image_embeds ,
652+ return_dict = False ,
653+ )[0 ]
654+ noise_pred = noise_uncond + guidance_scale * (noise_cond - noise_uncond )
655+ else :
656+ noise_pred = noise_cond
652655
653656 # compute the previous noisy sample x_t -> x_t-1
654657 latents = self .scheduler .step (noise_pred , t , latents , return_dict = False )[0 ]
0 commit comments