@@ -100,11 +100,15 @@ def forward(
         embedded_timestep = self.linear_2(embedded_timestep)

         if temb is not None:
-            embedded_timestep = embedded_timestep + temb[:, : 2 * self.embedding_dim]
+            embedded_timestep = embedded_timestep + temb[..., : 2 * self.embedding_dim]

-        shift, scale = embedded_timestep.chunk(2, dim=1)
+        shift, scale = embedded_timestep.chunk(2, dim=-1)
         hidden_states = self.norm(hidden_states)
-        hidden_states = hidden_states * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+        if embedded_timestep.ndim == 2:
+            shift, scale = (x.unsqueeze(1) for x in (shift, scale))
+
+        hidden_states = hidden_states * (1 + scale) + shift
         return hidden_states

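The norm layer above now accepts either a per-sample `embedded_timestep` of shape [B, 2*C] or a per-token one of shape [B, THW, 2*C]; chunking on `dim=-1` and unsqueezing only in the 2-D case keeps both broadcasting correctly against [B, THW, C] hidden states. A minimal sketch with made-up shapes and a hypothetical stand-alone `adaln` helper (not the actual diffusers module):

# Sketch only: illustrates the dim=-1 chunk plus conditional unsqueeze; shapes are assumptions.
import torch

def adaln(hidden_states, embedded_timestep):
    shift, scale = embedded_timestep.chunk(2, dim=-1)
    if embedded_timestep.ndim == 2:  # [B, 2C]: broadcast over the token axis
        shift, scale = (x.unsqueeze(1) for x in (shift, scale))
    return hidden_states * (1 + scale) + shift

hidden_states = torch.randn(2, 6, 4)   # [B, THW, C]
per_batch = torch.randn(2, 8)          # one embedding per sample
per_token = torch.randn(2, 6, 8)       # one embedding per token (e.g. per-frame timesteps)
assert adaln(hidden_states, per_batch).shape == (2, 6, 4)
assert adaln(hidden_states, per_token).shape == (2, 6, 4)

The same pattern is applied to the gated variant below, which is why the gate returned there no longer needs an unsqueeze at the call site.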
@@ -135,9 +139,13 @@ def forward(
         if temb is not None:
             embedded_timestep = embedded_timestep + temb

-        shift, scale, gate = embedded_timestep.chunk(3, dim=1)
+        shift, scale, gate = embedded_timestep.chunk(3, dim=-1)
         hidden_states = self.norm(hidden_states)
-        hidden_states = hidden_states * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+        if embedded_timestep.ndim == 2:
+            shift, scale, gate = (x.unsqueeze(1) for x in (shift, scale, gate))
+
+        hidden_states = hidden_states * (1 + scale) + shift
         return hidden_states, gate

@@ -255,19 +263,19 @@ def forward(
         # 1. Self Attention
         norm_hidden_states, gate = self.norm1(hidden_states, embedded_timestep, temb)
         attn_output = self.attn1(norm_hidden_states, image_rotary_emb=image_rotary_emb)
-        hidden_states = hidden_states + gate.unsqueeze(1) * attn_output
+        hidden_states = hidden_states + gate * attn_output

         # 2. Cross Attention
         norm_hidden_states, gate = self.norm2(hidden_states, embedded_timestep, temb)
         attn_output = self.attn2(
             norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
         )
-        hidden_states = hidden_states + gate.unsqueeze(1) * attn_output
+        hidden_states = hidden_states + gate * attn_output

         # 3. Feed Forward
         norm_hidden_states, gate = self.norm3(hidden_states, embedded_timestep, temb)
         ff_output = self.ff(norm_hidden_states)
-        hidden_states = hidden_states + gate.unsqueeze(1) * ff_output
+        hidden_states = hidden_states + gate * ff_output

         return hidden_states

@@ -513,7 +521,23 @@ def forward(
         hidden_states = hidden_states.flatten(1, 3)  # [B, T, H, W, C] -> [B, THW, C]

         # 4. Timestep embeddings
-        temb, embedded_timestep = self.time_embed(hidden_states, timestep)
+        if timestep.ndim == 1:
+            temb, embedded_timestep = self.time_embed(hidden_states, timestep)
+        elif timestep.ndim == 5:
+            assert timestep.shape == (batch_size, 1, num_frames, 1, 1), (
+                f"Expected timestep to have shape [B, 1, T, 1, 1], but got {timestep.shape}"
+            )
+            timestep = timestep.flatten()
+            temb, embedded_timestep = self.time_embed(hidden_states, timestep)
+            # We can do this because num_frames == post_patch_num_frames, as p_t is 1
+            temb, embedded_timestep = (
+                x.view(batch_size, post_patch_num_frames, 1, 1, -1)
+                .expand(-1, -1, post_patch_height, post_patch_width, -1)
+                .flatten(1, 3)
+                for x in (temb, embedded_timestep)
+            )  # [BT, C] -> [B, T, 1, 1, C] -> [B, T, H, W, C] -> [B, THW, C]
+        else:
+            assert False

         # 5. Transformer blocks
         for block in self.transformer_blocks:
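In the 5-D branch, the [B, 1, T, 1, 1] timestep is flattened to [B*T], embedded once per frame, and the resulting [B*T, C] vectors are broadcast to every spatial token of that frame; this relies on num_frames == post_patch_num_frames, which holds because the temporal patch size is 1. A shape-only sketch of the view/expand/flatten step, with assumed sizes (the real module derives post_patch_height/width from the patch size):

# Sketch only: the broadcast of per-frame embeddings to per-token embeddings; sizes are assumptions.
import torch

batch_size, num_frames, post_patch_height, post_patch_width, channels = 2, 3, 4, 5, 16
post_patch_num_frames = num_frames  # temporal patch size p_t is 1

temb = torch.randn(batch_size * num_frames, channels)            # [BT, C], one embedding per frame
temb = (
    temb.view(batch_size, post_patch_num_frames, 1, 1, -1)       # [B, T, 1, 1, C]
    .expand(-1, -1, post_patch_height, post_patch_width, -1)     # [B, T, H, W, C]
    .flatten(1, 3)                                               # [B, THW, C]
)
assert temb.shape == (
    batch_size,
    post_patch_num_frames * post_patch_height * post_patch_width,
    channels,
)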