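Revert changes in the Flux text2img pipeline (drop `img_cond`, derive latent channels from `in_channels`, pass un-halved height/width to `_prepare_latent_image_ids`)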

yiyixuxu · yiyixuxu · commit 66a6d3f63091 · 2024-11-21T11:59:30.000+01:00
diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py
@@ -513,7 +513,7 @@ def prepare_latents(
         shape = (batch_size, num_channels_latents, height, width)
 
         if latents is not None:
-            latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
+            latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
             return latents.to(device=device, dtype=dtype), latent_image_ids
 
         if isinstance(generator, list) and len(generator) != batch_size:
@@ -567,7 +567,6 @@ def __call__(
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 512,
-        img_cond: Optional[torch.FloatTensor] = None,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -687,7 +686,7 @@ def __call__(
         )
 
         # 4. Prepare latent variables
-        num_channels_latents = self.transformer.config.out_channels // 4
+        num_channels_latents = self.transformer.config.in_channels // 4
         latents, latent_image_ids = self.prepare_latents(
             batch_size * num_images_per_prompt,
             num_channels_latents,
@@ -699,8 +698,6 @@ def __call__(
             latents,
         )
 
-        img_cond = img_cond.to(latents.device) if img_cond is not None else None
-
         # 5. Prepare timesteps
         sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
         image_seq_len = latents.shape[1]
@@ -739,7 +736,7 @@ def __call__(
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)
 
                 noise_pred = self.transformer(
-                    hidden_states=torch.cat((latents, img_cond), dim=2) if img_cond is not None else latents,
+                    hidden_states=latents,
                     timestep=timestep / 1000,
                     guidance=guidance,
                     pooled_projections=pooled_prompt_embeds,