Skip to content

Commit 185ed9a

Browse files
committed
add context for src pipelines
1 parent 17c0e79 commit 185ed9a

File tree

80 files changed

+983
-868
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

80 files changed

+983
-868
lines changed

src/diffusers/pipelines/allegro/pipeline_allegro.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -929,14 +929,15 @@ def __call__(
929929
timestep = t.expand(latent_model_input.shape[0])
930930

931931
# predict noise model_output
932-
noise_pred = self.transformer(
933-
hidden_states=latent_model_input,
934-
encoder_hidden_states=prompt_embeds,
935-
encoder_attention_mask=prompt_attention_mask,
936-
timestep=timestep,
937-
image_rotary_emb=image_rotary_emb,
938-
return_dict=False,
939-
)[0]
932+
with self.transformer.cache_context("cond"):
933+
noise_pred = self.transformer(
934+
hidden_states=latent_model_input,
935+
encoder_hidden_states=prompt_embeds,
936+
encoder_attention_mask=prompt_attention_mask,
937+
timestep=timestep,
938+
image_rotary_emb=image_rotary_emb,
939+
return_dict=False,
940+
)[0]
940941

941942
# perform guidance
942943
if do_classifier_free_guidance:

src/diffusers/pipelines/amused/pipeline_amused.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -281,13 +281,14 @@ def __call__(
281281
else:
282282
model_input = latents
283283

284-
model_output = self.transformer(
285-
model_input,
286-
micro_conds=micro_conds,
287-
pooled_text_emb=prompt_embeds,
288-
encoder_hidden_states=encoder_hidden_states,
289-
cross_attention_kwargs=cross_attention_kwargs,
290-
)
284+
with self.transformer.cache_context("cond"):
285+
model_output = self.transformer(
286+
model_input,
287+
micro_conds=micro_conds,
288+
pooled_text_emb=prompt_embeds,
289+
encoder_hidden_states=encoder_hidden_states,
290+
cross_attention_kwargs=cross_attention_kwargs,
291+
)
291292

292293
if guidance_scale > 1.0:
293294
uncond_logits, cond_logits = model_output.chunk(2)

src/diffusers/pipelines/amused/pipeline_amused_img2img.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -309,13 +309,14 @@ def __call__(
309309
else:
310310
model_input = latents
311311

312-
model_output = self.transformer(
313-
model_input,
314-
micro_conds=micro_conds,
315-
pooled_text_emb=prompt_embeds,
316-
encoder_hidden_states=encoder_hidden_states,
317-
cross_attention_kwargs=cross_attention_kwargs,
318-
)
312+
with self.transformer.cache_context("cond"):
313+
model_output = self.transformer(
314+
model_input,
315+
micro_conds=micro_conds,
316+
pooled_text_emb=prompt_embeds,
317+
encoder_hidden_states=encoder_hidden_states,
318+
cross_attention_kwargs=cross_attention_kwargs,
319+
)
319320

320321
if guidance_scale > 1.0:
321322
uncond_logits, cond_logits = model_output.chunk(2)

src/diffusers/pipelines/amused/pipeline_amused_inpaint.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -339,13 +339,14 @@ def __call__(
339339
else:
340340
model_input = latents
341341

342-
model_output = self.transformer(
343-
model_input,
344-
micro_conds=micro_conds,
345-
pooled_text_emb=prompt_embeds,
346-
encoder_hidden_states=encoder_hidden_states,
347-
cross_attention_kwargs=cross_attention_kwargs,
348-
)
342+
with self.transformer.cache_context("cond"):
343+
model_output = self.transformer(
344+
model_input,
345+
micro_conds=micro_conds,
346+
pooled_text_emb=prompt_embeds,
347+
encoder_hidden_states=encoder_hidden_states,
348+
cross_attention_kwargs=cross_attention_kwargs,
349+
)
349350

350351
if guidance_scale > 1.0:
351352
uncond_logits, cond_logits = model_output.chunk(2)

src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -615,13 +615,14 @@ def __call__(
615615
timestep = timestep.to(latents.device, dtype=latents.dtype)
616616

617617
# predict noise model_output
618-
noise_pred = self.transformer(
619-
latent_model_input,
620-
encoder_hidden_states=prompt_embeds,
621-
timestep=timestep,
622-
return_dict=False,
623-
attention_kwargs=self.attention_kwargs,
624-
)[0]
618+
with self.transformer.cache_context("cond"):
619+
noise_pred = self.transformer(
620+
latent_model_input,
621+
encoder_hidden_states=prompt_embeds,
622+
timestep=timestep,
623+
return_dict=False,
624+
attention_kwargs=self.attention_kwargs,
625+
)[0]
625626

626627
# perform guidance
627628
if do_classifier_free_guidance:

src/diffusers/pipelines/bria/pipeline_bria.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -662,15 +662,16 @@ def __call__(
662662
timestep = t.expand(latent_model_input.shape[0])
663663

664664
# This is predicts "v" from flow-matching or eps from diffusion
665-
noise_pred = self.transformer(
666-
hidden_states=latent_model_input,
667-
timestep=timestep,
668-
encoder_hidden_states=prompt_embeds,
669-
attention_kwargs=self.attention_kwargs,
670-
return_dict=False,
671-
txt_ids=text_ids,
672-
img_ids=latent_image_ids,
673-
)[0]
665+
with self.transformer.cache_context("cond"):
666+
noise_pred = self.transformer(
667+
hidden_states=latent_model_input,
668+
timestep=timestep,
669+
encoder_hidden_states=prompt_embeds,
670+
attention_kwargs=self.attention_kwargs,
671+
return_dict=False,
672+
txt_ids=text_ids,
673+
img_ids=latent_image_ids,
674+
)[0]
674675

675676
# perform guidance
676677
if self.do_classifier_free_guidance:

src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -705,16 +705,17 @@ def __call__(
705705
)
706706

707707
# This is predicts "v" from flow-matching or eps from diffusion
708-
noise_pred = self.transformer(
709-
hidden_states=latent_model_input,
710-
timestep=timestep,
711-
encoder_hidden_states=prompt_embeds,
712-
text_encoder_layers=prompt_layers,
713-
joint_attention_kwargs=self.joint_attention_kwargs,
714-
return_dict=False,
715-
txt_ids=text_ids,
716-
img_ids=latent_image_ids,
717-
)[0]
708+
with self.transformer.cache_context("cond"):
709+
noise_pred = self.transformer(
710+
hidden_states=latent_model_input,
711+
timestep=timestep,
712+
encoder_hidden_states=prompt_embeds,
713+
text_encoder_layers=prompt_layers,
714+
joint_attention_kwargs=self.joint_attention_kwargs,
715+
return_dict=False,
716+
txt_ids=text_ids,
717+
img_ids=latent_image_ids,
718+
)[0]
718719

719720
# perform guidance
720721
if guidance_scale > 1:

src/diffusers/pipelines/chroma/pipeline_chroma.py

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -906,30 +906,33 @@ def __call__(
906906
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
907907
timestep = t.expand(latents.shape[0]).to(latents.dtype)
908908

909-
noise_pred = self.transformer(
910-
hidden_states=latents,
911-
timestep=timestep / 1000,
912-
encoder_hidden_states=prompt_embeds,
913-
txt_ids=text_ids,
914-
img_ids=latent_image_ids,
915-
attention_mask=attention_mask,
916-
joint_attention_kwargs=self.joint_attention_kwargs,
917-
return_dict=False,
918-
)[0]
919-
920-
if self.do_classifier_free_guidance:
921-
if negative_image_embeds is not None:
922-
self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
923-
neg_noise_pred = self.transformer(
909+
with self.transformer.cache_context("cond"):
910+
noise_pred = self.transformer(
924911
hidden_states=latents,
925912
timestep=timestep / 1000,
926-
encoder_hidden_states=negative_prompt_embeds,
927-
txt_ids=negative_text_ids,
913+
encoder_hidden_states=prompt_embeds,
914+
txt_ids=text_ids,
928915
img_ids=latent_image_ids,
929-
attention_mask=negative_attention_mask,
916+
attention_mask=attention_mask,
930917
joint_attention_kwargs=self.joint_attention_kwargs,
931918
return_dict=False,
932919
)[0]
920+
921+
if self.do_classifier_free_guidance:
922+
if negative_image_embeds is not None:
923+
self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
924+
925+
with self.transformer.cache_context("uncond"):
926+
neg_noise_pred = self.transformer(
927+
hidden_states=latents,
928+
timestep=timestep / 1000,
929+
encoder_hidden_states=negative_prompt_embeds,
930+
txt_ids=negative_text_ids,
931+
img_ids=latent_image_ids,
932+
attention_mask=negative_attention_mask,
933+
joint_attention_kwargs=self.joint_attention_kwargs,
934+
return_dict=False,
935+
)[0]
933936
noise_pred = neg_noise_pred + guidance_scale * (noise_pred - neg_noise_pred)
934937

935938
# compute the previous noisy sample x_t -> x_t-1

src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -989,31 +989,33 @@ def __call__(
989989
if image_embeds is not None:
990990
self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds
991991

992-
noise_pred = self.transformer(
993-
hidden_states=latents,
994-
timestep=timestep / 1000,
995-
encoder_hidden_states=prompt_embeds,
996-
txt_ids=text_ids,
997-
img_ids=latent_image_ids,
998-
attention_mask=attention_mask,
999-
joint_attention_kwargs=self.joint_attention_kwargs,
1000-
return_dict=False,
1001-
)[0]
1002-
1003-
if self.do_classifier_free_guidance:
1004-
if negative_image_embeds is not None:
1005-
self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
1006-
1007-
noise_pred_uncond = self.transformer(
992+
with self.transformer.cache_context("cond"):
993+
noise_pred = self.transformer(
1008994
hidden_states=latents,
1009995
timestep=timestep / 1000,
1010-
encoder_hidden_states=negative_prompt_embeds,
1011-
txt_ids=negative_text_ids,
996+
encoder_hidden_states=prompt_embeds,
997+
txt_ids=text_ids,
1012998
img_ids=latent_image_ids,
1013-
attention_mask=negative_attention_mask,
999+
attention_mask=attention_mask,
10141000
joint_attention_kwargs=self.joint_attention_kwargs,
10151001
return_dict=False,
10161002
)[0]
1003+
1004+
if self.do_classifier_free_guidance:
1005+
if negative_image_embeds is not None:
1006+
self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
1007+
1008+
with self.transformer.cache_context("uncond"):
1009+
noise_pred_uncond = self.transformer(
1010+
hidden_states=latents,
1011+
timestep=timestep / 1000,
1012+
encoder_hidden_states=negative_prompt_embeds,
1013+
txt_ids=negative_text_ids,
1014+
img_ids=latent_image_ids,
1015+
attention_mask=negative_attention_mask,
1016+
joint_attention_kwargs=self.joint_attention_kwargs,
1017+
return_dict=False,
1018+
)[0]
10171019
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
10181020

10191021
# compute the previous noisy sample x_t -> x_t-1

src/diffusers/pipelines/chronoedit/pipeline_chronoedit.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -680,24 +680,26 @@ def __call__(
680680
latent_model_input = torch.cat([latents, condition], dim=1).to(transformer_dtype)
681681
timestep = t.expand(latents.shape[0])
682682

683-
noise_pred = self.transformer(
684-
hidden_states=latent_model_input,
685-
timestep=timestep,
686-
encoder_hidden_states=prompt_embeds,
687-
encoder_hidden_states_image=image_embeds,
688-
attention_kwargs=attention_kwargs,
689-
return_dict=False,
690-
)[0]
691-
692-
if self.do_classifier_free_guidance:
693-
noise_uncond = self.transformer(
683+
with self.transformer.cache_context("cond"):
684+
noise_pred = self.transformer(
694685
hidden_states=latent_model_input,
695686
timestep=timestep,
696-
encoder_hidden_states=negative_prompt_embeds,
687+
encoder_hidden_states=prompt_embeds,
697688
encoder_hidden_states_image=image_embeds,
698689
attention_kwargs=attention_kwargs,
699690
return_dict=False,
700691
)[0]
692+
693+
if self.do_classifier_free_guidance:
694+
with self.transformer.cache_context("uncond"):
695+
noise_uncond = self.transformer(
696+
hidden_states=latent_model_input,
697+
timestep=timestep,
698+
encoder_hidden_states=negative_prompt_embeds,
699+
encoder_hidden_states_image=image_embeds,
700+
attention_kwargs=attention_kwargs,
701+
return_dict=False,
702+
)[0]
701703
noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
702704

703705
# compute the previous noisy sample x_t -> x_t-1

0 commit comments

Comments
 (0)