@@ -10,49 +10,52 @@
 from ..modeling_utils import ModelMixin
 from ..transformers.transformer_cosmos import (
     CosmosPatchEmbed,
+    CosmosTransformerBlock,
 )
 from .controlnet import zero_module


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


-class CosmosControlNetBlock(nn.Module):
-    def __init__(self, hidden_size: int):
-        super().__init__()
-        self.proj = zero_module(nn.Linear(hidden_size, hidden_size, bias=True))
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        return self.proj(hidden_states)
-
-
 # TODO(migmartin): implement me
 # see i4/projects/cosmos/transfer2/networks/minimal_v4_lvg_dit_control_vace.py
 class CosmosControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
     r"""
-    Minimal ControlNet for Cosmos Transfer2.5.
-
-    This module projects encoded control latents into per-block residuals aligned with the
-    `CosmosTransformer3DModel` hidden size. All projections are zero-initialized so the ControlNet
-    starts neutral by default.
+    ControlNet for Cosmos Transfer2.5.
     """

     @register_to_config
     def __init__(
         self,
+        n_controlnet_blocks: int = 4,
         in_channels: int = 16,
+        model_channels: int = 2048,
         num_attention_heads: int = 32,
         attention_head_dim: int = 128,
-        num_layers: int = 4,
+        mlp_ratio: float = 4.0,
+        text_embed_dim: int = 1024,
+        adaln_lora_dim: int = 256,
         patch_size: Tuple[int, int, int] = (1, 2, 2),
-        control_block_indices: Tuple[int, ...] = (6, 13, 20, 27),
     ):
         super().__init__()
-        hidden_size = num_attention_heads * attention_head_dim
-
-        self.patch_embed = CosmosPatchEmbed(in_channels, hidden_size, patch_size, bias=False)
+        self.patch_embed = CosmosPatchEmbed(in_channels, model_channels, patch_size, bias=False)
         self.control_blocks = nn.ModuleList(
-            CosmosControlNetBlock(hidden_size) for _ in range(num_layers)
+            [
+                CosmosTransformerBlock(
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                    cross_attention_dim=text_embed_dim,
+                    mlp_ratio=mlp_ratio,
+                    adaln_lora_dim=adaln_lora_dim,
+                    qk_norm="rms_norm",
+                    out_bias=False,
+                    img_context=True,
+                    before_proj=(block_idx == 0),
+                    after_proj=True,
+                )
+                for block_idx in range(n_controlnet_blocks)
+            ]
         )

     def _expand_conditioning_scale(self, conditioning_scale: Union[float, List[float]]) -> List[float]:
@@ -61,7 +64,7 @@ def _expand_conditioning_scale(self, conditioning_scale: Union[float, List[float
         else:
             scales = [conditioning_scale] * len(self.control_blocks)

-        if len(scales) != len(self.control_blocks):
+        if len(scales) < len(self.control_blocks):
             logger.warning(
                 "Received %d control scales, but control network defines %d blocks. "
                 "Scales will be trimmed or repeated to match.",
@@ -75,16 +78,25 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         controlnet_cond: torch.Tensor,
-        timestep: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
         conditioning_scale: Union[float, List[float]] = 1.0,
-        return_dict: bool = True,
     ) -> List[torch.Tensor]:
-        del hidden_states, timestep, encoder_hidden_states  # not used in this minimal control path
-
         control_hidden_states = self.patch_embed(controlnet_cond)
         control_hidden_states = control_hidden_states.flatten(1, 3)

         scales = self._expand_conditioning_scale(conditioning_scale)
-        control_residuals = tuple(block(control_hidden_states) * scale for block, scale in zip(self.control_blocks, scales))
-        return control_residuals
+        x = hidden_states
+
+        # NOTE: args to block
+        # hidden_states: torch.Tensor,
+        # encoder_hidden_states: torch.Tensor,
+        # embedded_timestep: torch.Tensor,
+        # temb: Optional[torch.Tensor] = None,
+        # image_rotary_emb: Optional[torch.Tensor] = None,
+        # extra_pos_emb: Optional[torch.Tensor] = None,
+        # attention_mask: Optional[torch.Tensor] = None,
+        # controlnet_residual: Optional[torch.Tensor] = None,
+        result = []
+        for block, scale in zip(self.control_blocks, scales):
+            x = block(x)
+            result.append(x * scale)
+        return result
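
The loop added at the end of forward() currently calls block(x), while the NOTE above lists a much richer block signature, so the control path is still incomplete (hence the TODO). Below is a hedged sketch of how that loop might be wired once implemented. Everything in it is an assumption drawn from the NOTE, not part of this patch: the conditioning tensors (encoder_hidden_states, embedded_timestep, temb, the rotary/positional embeddings, attention_mask) are presumed to be computed by the caller and threaded through to each control block.

# Hypothetical sketch only -- not part of this patch. Shows how the control loop
# could pass the arguments listed in the NOTE; the extra inputs are assumed to be
# provided by the wrapping transformer.
def _control_loop_sketch(self, x, encoder_hidden_states, embedded_timestep, temb,
                         image_rotary_emb, extra_pos_emb, attention_mask, scales):
    residuals = []
    for block, scale in zip(self.control_blocks, scales):
        # Each control block consumes the running hidden states plus the shared
        # conditioning tensors, mirroring the main transformer blocks.
        x = block(
            hidden_states=x,
            encoder_hidden_states=encoder_hidden_states,
            embedded_timestep=embedded_timestep,
            temb=temb,
            image_rotary_emb=image_rotary_emb,
            extra_pos_emb=extra_pos_emb,
            attention_mask=attention_mask,
        )
        # Weight each per-block output so conditioning_scale can scale the
        # residual contributed by every control block independently.
        residuals.append(x * scale)
    return residuals

Since the blocks are built with after_proj=True, the fork presumably emits a projected residual per block; whether block() returns that residual directly or alongside the running hidden states is left to the referenced minimal_v4_lvg_dit_control_vace.py implementation.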
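On the consumer side, the controlnet_residual argument in the NOTE suggests the usual ControlNet pattern: the main Cosmos transformer adds one returned residual per paired block. A rough, hypothetical sketch of that injection follows; the loop, the pairing of residuals to blocks, and all variable names are illustrative assumptions, not code from this repository.

# Hypothetical consumer-side sketch: inject the ControlNet outputs into the main
# transformer via the `controlnet_residual` argument listed in the NOTE.
control_residuals = controlnet(hidden_states, controlnet_cond, conditioning_scale=1.0)
for i, block in enumerate(transformer.transformer_blocks):
    hidden_states = block(
        hidden_states,
        encoder_hidden_states,
        embedded_timestep,
        temb=temb,
        image_rotary_emb=image_rotary_emb,
        extra_pos_emb=extra_pos_emb,
        attention_mask=attention_mask,
        # Here only the first len(control_residuals) blocks receive a residual;
        # how blocks are actually paired is up to the Transfer2.5 port.
        controlnet_residual=control_residuals[i] if i < len(control_residuals) else None,
    )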