Skip to content

Commit f928f3f

Browse files
cleanup + detail on neg_encoder_hidden_states
1 parent 73ed1bc commit f928f3f

File tree

2 files changed

+16
-17
lines changed

2 files changed

+16
-17
lines changed

src/diffusers/models/transformers/transformer_cosmos.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -241,12 +241,11 @@ def __init__(self):
241241

242242
def compute_attn_i2v(
243243
self,
244-
attn: Attention, # TODO: CosmosAttention
244+
attn: Attention,
245245
hidden_states: torch.Tensor,
246246
img_context=None,
247247
attention_mask=None,
248248
):
249-
print("compute_attn_i2v", flush=True)
250249
q_img = attn.q_img(hidden_states)
251250
k_img = attn.k_img(img_context)
252251
v_img = attn.v_img(img_context)
@@ -294,10 +293,7 @@ def __call__(
294293
image_rotary_emb=image_rotary_emb,
295294
)
296295

297-
# TODO: fixme
298-
# NOTE: img_context should be zeros
299296
if img_context is not None:
300-
print("compute_attn_i2v", flush=True)
301297
img_out = self.compute_attn_i2v(
302298
attn=attn,
303299
hidden_states=hidden_states,
@@ -422,7 +418,7 @@ def forward(
422418
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
423419
if self.before_proj is not None:
424420
hidden_states = self.before_proj(hidden_states) + latents
425-
print(f"before_proj, block_idx={block_idx}")
421+
# print(f"before_proj, block_idx={block_idx}")
426422

427423
if extra_pos_emb is not None:
428424
hidden_states = hidden_states + extra_pos_emb
@@ -444,17 +440,18 @@ def forward(
444440
ff_output = self.ff(norm_hidden_states)
445441
hidden_states = hidden_states + gate * ff_output
446442

443+
if controlnet_residual is not None:
444+
assert self.after_proj is None
445+
# NOTE: this is assumed to be scaled by the controlnet
446+
# print("controlnet_residual", flush=True)
447+
hidden_states += controlnet_residual
448+
447449
if self.after_proj is not None:
448450
assert controlnet_residual is None
449451
hs_proj = self.after_proj(hidden_states)
450-
print(f"after_proj, block_idx={block_idx}")
452+
# print(f"after_proj, block_idx={block_idx}")
451453
return hidden_states, hs_proj
452454

453-
if controlnet_residual is not None:
454-
# NOTE: this is assumed to be scaled by the controlnet
455-
print("controlnet_residual", flush=True)
456-
hidden_states += controlnet_residual
457-
458455
return hidden_states
459456

460457

src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_transfer.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -611,7 +611,6 @@ def __call__(
611611
width: Optional[int] = None,
612612
num_frames: int = 93,
613613
num_inference_steps: int = 36,
614-
# guidance_scale: float = 7.0, # TODO: check default
615614
guidance_scale: float = 3.0,
616615
num_videos_per_prompt: Optional[int] = 1,
617616
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
@@ -658,7 +657,7 @@ def __call__(
658657
num_inference_steps (`int`, defaults to `36`):
659658
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
660659
expense of slower inference.
661-
guidance_scale (`float`, defaults to `7.0`):
660+
guidance_scale (`float`, defaults to `3.0`):
662661
Guidance scale as defined in [Classifier-Free Diffusion
663662
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
664663
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
@@ -773,13 +772,16 @@ def __call__(
773772
device=device,
774773
max_sequence_length=max_sequence_length,
775774
)
776-
# TODO(migmartin): add img ref to prompt_embeds via siglip if provided
777-
encoder_hidden_states = (prompt_embeds, None)
778-
neg_encoder_hidden_states = (negative_prompt_embeds, None)
779775

780776
vae_dtype = self.vae.dtype
781777
transformer_dtype = self.transformer.dtype
782778

779+
# TODO(migmartin): add img ref to prompt_embeds via siglip if image ref is provided
780+
img_context_ref = torch.zeros(1, 256, 1152).to(device=prompt_embeds.device, dtype=transformer_dtype)
781+
encoder_hidden_states = (prompt_embeds, img_context_ref)
782+
# NOTE: projects/cosmos/transfer2/configs/vid2vid_transfer/defaults/conditioner.py L240
783+
neg_encoder_hidden_states = (negative_prompt_embeds, None)
784+
783785
num_frames_in = None
784786
if image is not None:
785787
if batch_size != 1:

0 commit comments

Comments
 (0)