Commit 446e6ea: formatting
1 parent 1e674b4

5 files changed: 61 additions, 33 deletions

scripts/convert_cosmos_to_diffusers.py

Lines changed: 25 additions & 10 deletions
@@ -159,9 +159,11 @@
 def remove_keys_(key: str, state_dict: Dict[str, Any]):
     state_dict.pop(key)
 
+
 def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
     state_dict[new_key] = state_dict.pop(old_key)
 
+
 def rename_transformer_blocks_(key: str, state_dict: Dict[str, Any]):
     block_index = int(key.split(".")[1].removeprefix("block"))
     new_key = key
@@ -459,9 +461,7 @@ def rename_transformer_blocks_(key: str, state_dict: Dict[str, Any]):
 }
 
 
-CONTROLNET_SPECIAL_KEYS_REMAP = {
-    **TRANSFORMER_SPECIAL_KEYS_REMAP_COSMOS_2_0
-}
+CONTROLNET_SPECIAL_KEYS_REMAP = {**TRANSFORMER_SPECIAL_KEYS_REMAP_COSMOS_2_0}
 
 VAE_KEYS_RENAME_DICT = {
     "down.0": "down_blocks.0",
@@ -553,7 +553,9 @@ def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
 
 
 def convert_transformer(
-    transformer_type: str, state_dict: Optional[Dict[str, Any]] = None, weights_only: bool = True,
+    transformer_type: str,
+    state_dict: Optional[Dict[str, Any]] = None,
+    weights_only: bool = True,
 ):
     PREFIX_KEY = "net."
 
@@ -613,7 +615,12 @@ def convert_transformer(
     return transformer
 
 
-def convert_controlnet(transformer_type: str, control_state_dict: Dict[str, Any], base_state_dict: Dict[str, Any], weights_only: bool = True):
+def convert_controlnet(
+    transformer_type: str,
+    control_state_dict: Dict[str, Any],
+    base_state_dict: Dict[str, Any],
+    weights_only: bool = True,
+):
     """
     Convert controlnet weights.
 
@@ -657,7 +664,7 @@ def convert_controlnet(transformer_type: str, control_state_dict: Dict[str, Any]
     for key in list(base_state_dict.keys()):
         for transformer_prefix, controlnet_prefix in shared_module_mappings.items():
             if key.startswith(transformer_prefix):
-                controlnet_key = controlnet_prefix + key[len(transformer_prefix):]
+                controlnet_key = controlnet_prefix + key[len(transformer_prefix) :]
                 control_state_dict[controlnet_key] = base_state_dict[key].clone()
                 print(f"Copied shared weight: {key} -> {controlnet_key}", flush=True)
                 break
@@ -864,7 +871,9 @@ def get_args():
     raw_state_dict = None
     if args.transformer_ckpt_path is not None:
         weights_only = "Cosmos-1.0" in args.transformer_type
-        raw_state_dict = get_state_dict(torch.load(args.transformer_ckpt_path, map_location="cpu", weights_only=weights_only))
+        raw_state_dict = get_state_dict(
+            torch.load(args.transformer_ckpt_path, map_location="cpu", weights_only=weights_only)
+        )
 
     if raw_state_dict is not None:
         if "Transfer" in args.transformer_type:
@@ -879,14 +888,18 @@ def get_args():
             assert len(base_state_dict.keys() & control_state_dict.keys()) == 0
 
             # Convert transformer first to get the processed base state dict
-            transformer = convert_transformer(args.transformer_type, state_dict=base_state_dict, weights_only=weights_only)
+            transformer = convert_transformer(
+                args.transformer_type, state_dict=base_state_dict, weights_only=weights_only
+            )
             transformer = transformer.to(dtype=dtype)
 
             # Get converted transformer state dict to copy shared weights to controlnet
             converted_base_state_dict = transformer.state_dict()
 
             # Convert controlnet with both control-specific and shared weights from transformer
-            controlnet = convert_controlnet(args.transformer_type, control_state_dict, converted_base_state_dict, weights_only=weights_only)
+            controlnet = convert_controlnet(
+                args.transformer_type, control_state_dict, converted_base_state_dict, weights_only=weights_only
+            )
             controlnet = controlnet.to(dtype=dtype)
 
             if not args.save_pipeline:
@@ -895,7 +908,9 @@ def get_args():
                 pathlib.Path(args.output_path) / "controlnet", safe_serialization=True, max_shard_size="5GB"
             )
         else:
-            transformer = convert_transformer(args.transformer_type, state_dict=raw_state_dict, weights_only=weights_only)
+            transformer = convert_transformer(
+                args.transformer_type, state_dict=raw_state_dict, weights_only=weights_only
+            )
             transformer = transformer.to(dtype=dtype)
             if not args.save_pipeline:
                 transformer.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
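
Note: the hunk at old line 657 above is the shared-weight copy in convert_controlnet. Below is a minimal standalone sketch of that prefix-remap pattern; the mapping entry and tensor shape are illustrative placeholders, not the real module names from the script.

    import torch

    # Hypothetical mapping entry for illustration; the real shared_module_mappings
    # in scripts/convert_cosmos_to_diffusers.py uses the actual module names.
    shared_module_mappings = {"patch_embed.": "patch_embed_base."}

    base_state_dict = {"patch_embed.proj.weight": torch.randn(4, 4)}
    control_state_dict = {}

    for key in list(base_state_dict.keys()):
        for transformer_prefix, controlnet_prefix in shared_module_mappings.items():
            if key.startswith(transformer_prefix):
                # Re-prefix the key and clone so the controlnet owns an independent copy
                controlnet_key = controlnet_prefix + key[len(transformer_prefix) :]
                control_state_dict[controlnet_key] = base_state_dict[key].clone()
                break

    assert "patch_embed_base.proj.weight" in control_state_dict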

src/diffusers/models/controlnets/controlnet_cosmos.py

Lines changed: 5 additions & 3 deletions
@@ -41,8 +41,8 @@ class CosmosControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
     ControlNet for Cosmos Transfer2.5.
 
     This model duplicates the shared embedding modules from the transformer (patch_embed, time_embed,
-    learnable_pos_embed, img_context_proj) to enable proper CPU offloading. The forward() method
-    computes everything internally from raw inputs.
+    learnable_pos_embed, img_context_proj) to enable proper CPU offloading. The forward() method computes everything
+    internally from raw inputs.
     """
 
     _supports_gradient_checkpointing = True
@@ -184,7 +184,9 @@ def forward(
             control_hidden_states = torch.cat(
                 [
                     control_hidden_states,
-                    torch.zeros((B, pad_C, T, H, W), dtype=control_hidden_states.dtype, device=control_hidden_states.device),
+                    torch.zeros(
+                        (B, pad_C, T, H, W), dtype=control_hidden_states.dtype, device=control_hidden_states.device
+                    ),
                 ],
                 dim=1,
            )
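
Note: a minimal sketch of the channel zero-padding reformatted in the second hunk above, assuming a [B, C, T, H, W] control tensor; the target channel count here is a toy value, not the model's actual configuration.

    import torch

    control_hidden_states = torch.randn(1, 3, 2, 8, 8)  # [B, C, T, H, W], toy sizes
    target_channels = 5  # hypothetical; the model derives this from its config
    B, C, T, H, W = control_hidden_states.shape
    pad_C = target_channels - C
    if pad_C > 0:
        # Append zero-filled channels so the tensor matches the expected channel width
        control_hidden_states = torch.cat(
            [
                control_hidden_states,
                torch.zeros((B, pad_C, T, H, W), dtype=control_hidden_states.dtype, device=control_hidden_states.device),
            ],
            dim=1,
        )
    assert control_hidden_states.shape[1] == target_channels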

src/diffusers/models/transformers/transformer_cosmos.py

Lines changed: 16 additions & 11 deletions
@@ -225,13 +225,11 @@ def __call__(
 
         return hidden_states
 
+
 class CosmosAttnProcessor2_5(CosmosAttnProcessor2_0):
     def __init__(self):
         if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
-            raise ImportError(
-                "CosmosAttnProcessor2_5 requires PyTorch 2.0. "
-                "Please upgrade PyTorch to 2.0 or newer."
-            )
+            raise ImportError("CosmosAttnProcessor2_5 requires PyTorch 2.0. Please upgrade PyTorch to 2.0 or newer.")
 
     def compute_attn_i2v(
         self,
@@ -302,6 +300,7 @@ def __call__(
         hidden_states = attn.to_out[1](hidden_states)
         return hidden_states
 
+
 class CosmosAttention(Attention):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -400,7 +399,9 @@ def __init__(
     def forward(
         self,
         hidden_states: torch.Tensor,
-        encoder_hidden_states: Union[Optional[torch.Tensor], Optional[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]]],
+        encoder_hidden_states: Union[
+            Optional[torch.Tensor], Optional[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]]
+        ],
         embedded_timestep: torch.Tensor,
         temb: Optional[torch.Tensor] = None,
         image_rotary_emb: Optional[torch.Tensor] = None,
@@ -581,11 +582,11 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         img_context_dim_in (`int`, *optional*):
             The dimension of the input image context feature vector, i.e. it is the D in [B, N, D].
         img_context_num_tokens (`int`):
-            The number of tokens in the image context feature vector, i.e. it is
-            the N in [B, N, D]. If `img_context_dim_in` is not provided, then this parameter is ignored.
-        img_context_dim_out (`int`):
-            The output dimension of the image context projection layer. If
+            The number of tokens in the image context feature vector, i.e. it is the N in [B, N, D]. If
             `img_context_dim_in` is not provided, then this parameter is ignored.
+        img_context_dim_out (`int`):
+            The output dimension of the image context projection layer. If `img_context_dim_in` is not provided, then
+            this parameter is ignored.
     """
 
     _supports_gradient_checkpointing = True
@@ -739,14 +740,18 @@ def forward(
             raise ValueError(f"Expected timestep to have shape [B, 1, T, 1, 1] or [T], but got {timestep.shape}")
 
         # 5. Process encoder hidden states
-        text_context, img_context = encoder_hidden_states if isinstance(encoder_hidden_states, tuple) else (encoder_hidden_states, None)
+        text_context, img_context = (
+            encoder_hidden_states if isinstance(encoder_hidden_states, tuple) else (encoder_hidden_states, None)
+        )
         if self.config.use_crossattn_projection:
             text_context = self.crossattn_proj(text_context)
 
         if img_context is not None and self.config.img_context_dim_in:
             img_context = self.img_context_proj(img_context)
 
-        processed_encoder_hidden_states = (text_context, img_context) if isinstance(encoder_hidden_states, tuple) else text_context
+        processed_encoder_hidden_states = (
+            (text_context, img_context) if isinstance(encoder_hidden_states, tuple) else text_context
+        )
 
         # 6. Build controlnet block index map
         controlnet_block_index_map = {}
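
Note: the forward() hunks above normalize an encoder_hidden_states argument that may be either a plain text-context tensor or a (text, image) tuple. Below is a standalone sketch of that pattern with toy shapes; split_contexts is a hypothetical helper introduced only for illustration.

    from typing import Optional, Tuple, Union

    import torch

    def split_contexts(
        encoder_hidden_states: Union[torch.Tensor, Tuple[torch.Tensor, Optional[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Accept either a bare text-context tensor or a (text, image) tuple
        text_context, img_context = (
            encoder_hidden_states if isinstance(encoder_hidden_states, tuple) else (encoder_hidden_states, None)
        )
        return text_context, img_context

    text_only = torch.randn(1, 77, 16)
    text, img = split_contexts(text_only)          # img is None
    text, img = split_contexts((text_only, None))  # explicit tuple form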

src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_transfer.py

Lines changed: 15 additions & 8 deletions
@@ -53,13 +53,15 @@ def __init__(self, *args, **kwargs):
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
+
 def _maybe_pad_video(video: torch.Tensor, num_frames: int):
     n_pad_frames = num_frames - video.shape[2]
     if n_pad_frames > 0:
         last_frame = video[:, :, -1:, :, :]
         video = torch.cat((video, last_frame.repeat(1, 1, n_pad_frames, 1, 1)), dim=2)
     return video
 
+
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
 def retrieve_latents(
     encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
@@ -73,6 +75,7 @@ def retrieve_latents(
     else:
         raise AttributeError("Could not access latents of provided encoder_output")
 
+
 def transfer2_5_forward(
     transformer: CosmosTransformer3DModel,
     controlnet: CosmosControlNetModel,
@@ -87,9 +90,9 @@ def transfer2_5_forward(
     """
     Forward pass for Transfer2.5 pipeline.
 
-    This function calls both transformer and controlnet's forward() methods directly,
-    enabling proper CPU offloading. The controlnet computes its own embeddings internally
-    using duplicated modules (patch_embed_base, time_embed, etc.).
+    This function calls both transformer and controlnet's forward() methods directly, enabling proper CPU offloading.
+    The controlnet computes its own embeddings internally using duplicated modules (patch_embed_base, time_embed,
+    etc.).
 
     Args:
         transformer: The CosmosTransformer3DModel
@@ -130,6 +133,7 @@ def transfer2_5_forward(
     )[0]
     return noise_pred
 
+
 DEFAULT_NEGATIVE_PROMPT = "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality."
 
 EXAMPLE_DOC_STRING = """
@@ -501,7 +505,9 @@ def _encode_controls(
             control_video = _maybe_pad_video(control_video, num_frames)
 
         control_video = control_video.to(device=device, dtype=self.vae.dtype)
-        control_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator=generator) for vid in control_video]
+        control_latents = [
+            retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator=generator) for vid in control_video
+        ]
         control_latents = torch.cat(control_latents, dim=0).to(dtype)
 
         latents_mean = self.latents_mean.to(device=device, dtype=dtype)
@@ -611,7 +617,8 @@ def __call__(
            height (`int`, defaults to `704`):
                The height in pixels of the generated image.
            width (`int`, *optional*):
-               The width in pixels of the generated image. If not provided, this will be determined based on the aspect ratio of the input and the provided height.
+               The width in pixels of the generated image. If not provided, this will be determined based on the
+               aspect ratio of the input and the provided height.
            num_frames (`int`, defaults to `93`):
                Number of output frames. Use `93` for world (video) generation; set to `1` to return a single frame.
            num_inference_steps (`int`, defaults to `35`):
@@ -684,7 +691,7 @@ def __call__(
            frame = controls[0]
 
            if frame is None:
-                width = int((height + 16) * (1280/720))
+                width = int((height + 16) * (1280 / 720))
            elif isinstance(frame, PIL.Image.Image):
                width = int((height + 16) * (frame.width / frame.height))
            else:
@@ -839,7 +846,7 @@ def __call__(
                    in_timestep=in_timestep,
                    encoder_hidden_states=encoder_hidden_states,
                    cond_mask=cond_mask,
-                    padding_mask=padding_mask
+                    padding_mask=padding_mask,
                )
                noise_pred = gt_velocity + noise_pred * (1 - cond_mask)
 
@@ -853,7 +860,7 @@ def __call__(
                        in_timestep=in_timestep,
                        encoder_hidden_states=neg_encoder_hidden_states,  # NOTE: negative prompt
                        cond_mask=cond_mask,
-                        padding_mask=padding_mask
+                        padding_mask=padding_mask,
                    )
                    # NOTE: replace velocity (noise_pred_neg) with gt_velocity for conditioning inputs only
                    noise_pred_neg = gt_velocity + noise_pred_neg * (1 - cond_mask)
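
Note: a usage sketch for _maybe_pad_video above. A clip shorter than num_frames is extended by repeating its last frame along the time axis (dim=2 of a [B, C, T, H, W] tensor); the function body is copied verbatim from the diff, the toy shapes are illustrative.

    import torch

    def _maybe_pad_video(video: torch.Tensor, num_frames: int):
        n_pad_frames = num_frames - video.shape[2]
        if n_pad_frames > 0:
            last_frame = video[:, :, -1:, :, :]
            video = torch.cat((video, last_frame.repeat(1, 1, n_pad_frames, 1, 1)), dim=2)
        return video

    video = torch.randn(1, 3, 5, 16, 16)  # only 5 frames
    padded = _maybe_pad_video(video, num_frames=9)
    assert padded.shape[2] == 9
    # The trailing frames are copies of the original last frame (index 4)
    assert torch.equal(padded[:, :, 4], padded[:, :, 8])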

tests/pipelines/cosmos/test_cosmos2_5_transfer.py

Lines changed: 0 additions & 1 deletion
@@ -384,4 +384,3 @@ def test_save_load_optional_components(self, expected_max_difference=1e-4):
     )
     def test_encode_prompt_works_in_isolation(self):
         pass
-