huggingface
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos.py
Lines changed: 4 additions & 2 deletions
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py
Lines changed: 38 additions & 9 deletions
diff --git a/tests/models/transformers/test_models_transformer_cosmos.py b/tests/models/transformers/test_models_transformer_cosmos.py
Lines changed: 65 additions & 0 deletions
diff --git a/tests/pipelines/cosmos/__init__.py b/tests/pipelines/cosmos/__init__.py
@@ -47,7 +47,6 @@
 
         >>> model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Text2World"
         >>> pipe = CosmosPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-        >>> pipe.vae.enable_tiling()
         >>> pipe.to("cuda")
 
         >>> prompt = "A sleek, humanoid robot stands in a vast warehouse filled with neatly stacked cardboard boxes on industrial shelves. The robot's metallic body gleams under the bright, even lighting, highlighting its futuristic design and intricate joints. A glowing blue light emanates from its chest, adding a touch of advanced technology. The background is dominated by rows of boxes, suggesting a highly organized storage system. The floor is lined with wooden pallets, enhancing the industrial setting. The camera remains static, capturing the robot's poised stance amidst the orderly environment, with a shallow depth of field that keeps the focus on the robot while subtly blurring the background for a cinematic effect."
@@ -540,6 +539,8 @@ def __call__(
                     padding_mask=padding_mask,
                     return_dict=False,
                 )[0]
+
+                sample = latents
                 if self.do_classifier_free_guidance:
                     noise_pred_uncond = self.transformer(
                         hidden_states=latent_model_input,
@@ -550,9 +551,10 @@ def __call__(
                         return_dict=False,
                     )[0]
                     noise_pred = torch.cat([noise_pred_uncond, noise_pred])
+                    sample = torch.cat([sample, sample])
 
                 # pred_original_sample (x0)
-                noise_pred = self.scheduler.step(noise_pred, t, latents, return_dict=False)[1]
+                noise_pred = self.scheduler.step(noise_pred, t, sample, return_dict=False)[1]
                 self.scheduler._step_index -= 1
 
                 if self.do_classifier_free_guidance:
 
@@ -41,20 +41,47 @@
 
 EXAMPLE_DOC_STRING = """
     Examples:
+        Image conditioning:
+
+        ```python
+        >>> import torch
+        >>> from diffusers import CosmosVideoToWorldPipeline
+        >>> from diffusers.utils import export_to_video, load_image
+
+        >>> model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Video2World"
+        >>> pipe = CosmosVideoToWorldPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+        >>> pipe.to("cuda")
+
+        >>> prompt = "The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered clouds, suggesting a bright, sunny day."
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
+        ... )
+
+        >>> video = pipe(image=image, prompt=prompt).frames[0]
+        >>> export_to_video(video, "output.mp4", fps=30)
+        ```
+
+        Video conditioning:
+
         ```python
         >>> import torch
-        >>> from diffusers import CosmosPipeline
-        >>> from diffusers.utils import export_to_video
+        >>> from diffusers import CosmosVideoToWorldPipeline
+        >>> from diffusers.utils import export_to_video, load_video
 
-        >>> model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Text2World"
-        >>> pipe = CosmosPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-        >>> pipe.vae.enable_tiling()
+        >>> model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Video2World"
+        >>> pipe = CosmosVideoToWorldPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+        >>> pipe.transformer = torch.compile(pipe.transformer)
         >>> pipe.to("cuda")
 
-        >>> prompt = "A sleek, humanoid robot stands in a vast warehouse filled with neatly stacked cardboard boxes on industrial shelves. The robot's metallic body gleams under the bright, even lighting, highlighting its futuristic design and intricate joints. A glowing blue light emanates from its chest, adding a touch of advanced technology. The background is dominated by rows of boxes, suggesting a highly organized storage system. The floor is lined with wooden pallets, enhancing the industrial setting. The camera remains static, capturing the robot's poised stance amidst the orderly environment, with a shallow depth of field that keeps the focus on the robot while subtly blurring the background for a cinematic effect."
+        >>> prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
+        >>> video = load_video(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
+        ... )[
+        ...     :21
+        ... ]  # This example uses only the first 21 frames
 
-        >>> output = pipe(prompt=prompt).frames[0]
-        >>> export_to_video(output, "output.mp4", fps=30)
+        >>> video = pipe(video=video, prompt=prompt).frames[0]
+        >>> export_to_video(video, "output.mp4", fps=30)
         ```
 """
 
@@ -654,6 +681,7 @@ def __call__(
                     return_dict=False,
                 )[0]
 
+                sample = latents
                 if self.do_classifier_free_guidance:
                     current_uncond_indicator = uncond_indicator * 0 if is_augment_sigma_greater else uncond_indicator
                     uncond_noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=torch.float32)
@@ -673,9 +701,10 @@ def __call__(
                         return_dict=False,
                     )[0]
                     noise_pred = torch.cat([noise_pred_uncond, noise_pred])
+                    sample = torch.cat([sample, sample])
 
                 # pred_original_sample (x0)
-                noise_pred = self.scheduler.step(noise_pred, t, latents, return_dict=False)[1]
+                noise_pred = self.scheduler.step(noise_pred, t, sample, return_dict=False)[1]
                 self.scheduler._step_index -= 1
 
                 if self.do_classifier_free_guidance:
 
@@ -86,3 +86,68 @@ def prepare_init_args_and_inputs_for_common(self):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"CosmosTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+
+class CosmosTransformer3DModelVideoToWorldTests(ModelTesterMixin, unittest.TestCase):  # ModelTesterMixin suite for CosmosTransformer3DModel fed an extra condition_mask input
+    model_class = CosmosTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+
+    @property
+    def dummy_input(self):  # returns a dict of small forward-pass inputs; tensors are moved to torch_device
+        batch_size = 1
+        num_channels = 4
+        num_frames = 1
+        height = 16
+        width = 16
+        text_embed_dim = 16  # same value as "text_embed_dim" in the init dict below
+        sequence_length = 12
+        fps = 30  # passed through as a plain int, not a tensor
+
+        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, text_embed_dim)).to(torch_device)
+        attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device)
+        condition_mask = torch.ones(batch_size, 1, num_frames, height, width).to(torch_device)  # all-ones, single channel, same frame/spatial size as hidden_states
+        padding_mask = torch.zeros(batch_size, 1, height, width).to(torch_device)  # all-zeros, single channel, no frame dimension
+
+        return {
+            "hidden_states": hidden_states,
+            "timestep": timestep,
+            "encoder_hidden_states": encoder_hidden_states,
+            "attention_mask": attention_mask,
+            "fps": fps,
+            "condition_mask": condition_mask,
+            "padding_mask": padding_mask,
+        }
+
+    @property
+    def input_shape(self):  # shape of dummy_input's hidden_states without the batch dim: (channels, frames, height, width)
+        return (4, 1, 16, 16)
+
+    @property
+    def output_shape(self):  # same tuple as input_shape
+        return (4, 1, 16, 16)
+
+    def prepare_init_args_and_inputs_for_common(self):  # returns (model init kwargs, forward inputs)
+        init_dict = {
+            "in_channels": 4 + 1,  # 4 matches num_channels in dummy_input; the +1 presumably covers the single-channel condition_mask - TODO confirm
+            "out_channels": 4,
+            "num_attention_heads": 2,
+            "attention_head_dim": 12,
+            "num_layers": 2,
+            "mlp_ratio": 2,
+            "text_embed_dim": 16,
+            "adaln_lora_dim": 4,
+            "max_size": (4, 32, 32),
+            "patch_size": (1, 2, 2),
+            "rope_scale": (2.0, 1.0, 1.0),
+            "concat_padding_mask": True,
+            "extra_pos_embed_type": "learnable",
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_gradient_checkpointing_is_applied(self):  # forwards the expected class-name set to the parent implementation
+        expected_set = {"CosmosTransformer3DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)