@@ -199,7 +199,7 @@ def __init__(
199199 transformer : CosmosTransformer3DModel ,
200200 vae : AutoencoderKLWan ,
201201 scheduler : UniPCMultistepScheduler ,
202- controlnet : Optional [ CosmosControlNetModel ] = None ,
202+ controlnet : CosmosControlNetModel ,
203203 safety_checker : CosmosSafetyChecker = None ,
204204 ):
205205 super ().__init__ ()
@@ -474,23 +474,25 @@ def prepare_latents(
474474 cond_indicator ,
475475 )
476476
477- def _encode_controlnet_image (
477+ def _encode_controls (
478478 self ,
479- control_image : Optional [torch .Tensor ],
479+ controls : Optional [torch .Tensor ],
480480 height : int ,
481481 width : int ,
482482 num_frames : int ,
483483 dtype : torch .dtype ,
484484 device : torch .device ,
485485 ) -> Optional [torch .Tensor ]:
486- if control_image is None :
486+ if controls is None :
487487 return None
488488
489- control_video = self .video_processor .preprocess_video (control_image , height , width )
490- if control_video .shape [2 ] < num_frames :
491- n_pad_frames = num_frames - control_video .shape [2 ]
492- last_frame = control_video [:, :, - 1 :, :, :]
493- control_video = torch .cat ((control_video , last_frame .repeat (1 , 1 , n_pad_frames , 1 , 1 )), dim = 2 )
489+ # TODO(review): preprocess_video assumes video-shaped input; a single control image may need its own preprocessing path — confirm against callers before release.
490+ control_video = self .video_processor .preprocess_video (controls , height , width )
491+ # TODO(review): the disabled padding below repeated the last frame so control videos shorter than num_frames still matched — verify short-video inputs still work before deleting it for good.
492+ # if control_video.shape[2] < num_frames:
493+ # n_pad_frames = num_frames - control_video.shape[2]
494+ # last_frame = control_video[:, :, -1:, :, :]
495+ # control_video = torch.cat((control_video, last_frame.repeat(1, 1, n_pad_frames, 1, 1)), dim=2)
494496
495497 control_video = control_video .to (device = device , dtype = self .vae .dtype )
496498 control_latents = [retrieve_latents (self .vae .encode (vid .unsqueeze (0 ))) for vid in control_video ]
@@ -568,8 +570,8 @@ def __call__(
568570 num_videos_per_prompt : Optional [int ] = 1 ,
569571 generator : Optional [Union [torch .Generator , List [torch .Generator ]]] = None ,
570572 latents : Optional [torch .Tensor ] = None ,
571- controlnet_conditioning_scale : Union [ float , List [float ]] = 1.0 ,
572- controlnet_conditioning_image : Optional [ PipelineImageInput ] = None ,
573+ controls : Optional [ PipelineImageInput | List [PipelineImageInput ]] = None ,
574+ controls_conditioning_scale : Union [ float , List [ float ]] = 1.0 ,
573575 prompt_embeds : Optional [torch .Tensor ] = None ,
574576 negative_prompt_embeds : Optional [torch .Tensor ] = None ,
575577 output_type : Optional [str ] = "pil" ,
@@ -623,10 +625,10 @@ def __call__(
623625 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
624626 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
625627 tensor is generated by sampling using the supplied random `generator`.
626- controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to `1.0`):
627- The scale factor(s) for the ControlNet outputs. A single float is broadcast to all control blocks.
628- controlnet_conditioning_image (`PipelineImageInput`, *optional*):
628+ controls (`PipelineImageInput`, `List[PipelineImageInput]`, *optional*):
629629 Control image or video input used by the ControlNet. If `None`, ControlNet is skipped.
630+ controls_conditioning_scale (`float` or `List[float]`, *optional*, defaults to `1.0`):
631+ The scale factor(s) for the ControlNet outputs. A single float is broadcast to all control blocks.
630632 prompt_embeds (`torch.Tensor`, *optional*):
631633 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
632634 provided, text embeddings will be generated from `prompt` input argument.
@@ -765,19 +767,20 @@ def __call__(
765767 cond_timestep = torch .ones_like (cond_indicator ) * conditional_frame_timestep
766768 cond_mask = cond_mask .to (transformer_dtype )
767769
768- controlnet_latents = None
769- if self . controlnet is not None and controlnet_conditioning_image is not None :
770- controlnet_latents = self ._encode_controlnet_image (
771- control_image = controlnet_conditioning_image ,
770+ controls_latents = None
771+ if controls is not None :
772+ controls_latents = self ._encode_controls (
773+ controls ,
772774 height = height ,
773775 width = width ,
774776 num_frames = num_frames ,
775777 dtype = torch .float32 ,
776778 device = device ,
777779 )
778- if controlnet_latents .shape [0 ] != latents .shape [0 ]:
779- repeat_count = latents .shape [0 ] // controlnet_latents .shape [0 ]
780- controlnet_latents = controlnet_latents .repeat_interleave (repeat_count , dim = 0 )
780+ # TODO(review): the disabled broadcast below repeated control latents along the batch dim to match latents (num_videos_per_prompt > 1) — verify that multi-video generation still works before deleting.
781+ # if controls_latents.shape[0] != latents.shape[0]:
782+ # repeat_count = latents.shape[0] // controls_latents.shape[0]
783+ # controls_latents = controls_latents.repeat_interleave(repeat_count, dim=0)
781784
782785 padding_mask = latents .new_zeros (1 , 1 , height , width , dtype = transformer_dtype )
783786
@@ -805,24 +808,24 @@ def __call__(
805808 in_latents = cond_mask * cond_latent + (1 - cond_mask ) * latents
806809 in_latents = in_latents .to (transformer_dtype )
807810 in_timestep = cond_indicator * cond_timestep + (1 - cond_indicator ) * sigma_t
808- control_block_samples = None
809- if self . controlnet is not None and controlnet_latents is not None :
810- control_block_samples = self .controlnet (
811+ control_blocks = None
812+ if controls is not None :
813+ control_blocks = self .controlnet (
811814 hidden_states = in_latents ,
812- controlnet_cond = controlnet_latents .to (dtype = transformer_dtype ),
815+ controlnet_cond = controls_latents .to (dtype = transformer_dtype ),
813816 timestep = in_timestep ,
814817 encoder_hidden_states = prompt_embeds ,
815- conditioning_scale = controlnet_conditioning_scale ,
818+ conditioning_scale = controls_conditioning_scale ,
816819 return_dict = True ,
817- ). block_controlnet_hidden_states
818- control_block_samples = tuple ( residual . to ( dtype = transformer_dtype ) for residual in control_block_samples )
820+ )
821+
819822 noise_pred = self .transformer (
820823 hidden_states = in_latents ,
821824 condition_mask = cond_mask ,
822825 timestep = in_timestep ,
823826 encoder_hidden_states = prompt_embeds ,
824827 padding_mask = padding_mask ,
825- block_controlnet_hidden_states = control_block_samples ,
828+ block_controlnet_hidden_states = control_blocks ,
826829 return_dict = False ,
827830 )[0 ]
828831 # NOTE: replace velocity (noise_pred) with gt_velocity for conditioning inputs only
@@ -835,7 +838,7 @@ def __call__(
835838 timestep = in_timestep ,
836839 encoder_hidden_states = negative_prompt_embeds ,
837840 padding_mask = padding_mask ,
838- block_controlnet_hidden_states = control_block_samples ,
841+ block_controlnet_hidden_states = control_blocks ,
839842 return_dict = False ,
840843 )[0 ]
841844 # NOTE: replace velocity (noise_pred_neg) with gt_velocity for conditioning inputs only
@@ -868,7 +871,8 @@ def __call__(
868871 latents_std = self .latents_std .to (latents .device , latents .dtype )
869872 latents = latents * latents_std + latents_mean
870873 video = self .vae .decode (latents .to (self .vae .dtype ), return_dict = False )[0 ]
871- video = self ._match_num_frames (video , num_frames )
874+ # TODO(review): _match_num_frames reconciled the decoded frame count with the requested num_frames; confirm the VAE decode always yields exactly num_frames before removing this call.
875+ # video = self._match_num_frames(video, num_frames)
872876
873877 assert self .safety_checker is not None
874878 self .safety_checker .to (device )
@@ -892,6 +896,7 @@ def __call__(
892896
893897 return CosmosPipelineOutput (frames = video )
894898
899+ # TODO(review): document why the decoded frame count can differ from num_frames (temporal compression rounding in the VAE?) or delete this helper once the mismatch is confirmed impossible.
895900 def _match_num_frames (self , video : torch .Tensor , target_num_frames : int ) -> torch .Tensor :
896901 if target_num_frames <= 0 or video .shape [2 ] == target_num_frames :
897902 return video
0 commit comments