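Update LTX image-to-video docstring example to use LTXImageToVideoPipeline with an input image; rename MochiTransformerTests to LTXTransformerTests and switch its dummy inputs to flattened (batch, num_frames * height * width, num_channels) hidden states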

a-r-r-o-w · a-r-r-o-w · commit 606e6b25bed9 · 2024-11-29T13:43:53.000+01:00
diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
@@ -44,16 +44,20 @@
     Examples:
         ```py
         >>> import torch
-        >>> from diffusers import LTXPipeline
-        >>> from diffusers.utils import export_to_video
+        >>> from diffusers import LTXImageToVideoPipeline
+        >>> from diffusers.utils import export_to_video, load_image
 
-        >>> pipe = LTXPipeline.from_pretrained("a-r-r-o-w/LTX-Video-diffusers", torch_dtype=torch.bfloat16)
+        >>> pipe = LTXImageToVideoPipeline.from_pretrained("a-r-r-o-w/LTX-Video-diffusers", torch_dtype=torch.bfloat16)
         >>> pipe.to("cuda")
 
-        >>> prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
+        ... )
+        >>> prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background. Flames engulf the structure, with smoke billowing into the air. Firefighters in protective gear rush to the scene, a fire truck labeled '38' visible behind them. The girl's neutral expression contrasts sharply with the chaos of the fire, creating a poignant and emotionally charged scene."
         >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
 
         >>> video = pipe(
+        ...     image=image,
         ...     prompt=prompt,
         ...     negative_prompt=negative_prompt,
         ...     width=704,
diff --git a/tests/models/transformers/test_models_transformer_ltx.py b/tests/models/transformers/test_models_transformer_ltx.py
@@ -26,7 +26,7 @@
 enable_full_determinism()
 
 
-class MochiTransformerTests(ModelTesterMixin, unittest.TestCase):
+class LTXTransformerTests(ModelTesterMixin, unittest.TestCase):
     model_class = LTXTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -41,7 +41,7 @@ def dummy_input(self):
         embedding_dim = 16
         sequence_length = 16
 
-        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        hidden_states = torch.randn((batch_size, num_frames * height * width, num_channels)).to(torch_device)
         encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
         encoder_attention_mask = torch.ones((batch_size, sequence_length)).bool().to(torch_device)
         timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
@@ -51,15 +51,18 @@ def dummy_input(self):
             "encoder_hidden_states": encoder_hidden_states,
             "timestep": timestep,
             "encoder_attention_mask": encoder_attention_mask,
+            "num_frames": num_frames,
+            "height": height,
+            "width": width,
         }
 
     @property
     def input_shape(self):
-        return (4, 2, 16, 16)
+        return (512, 4)
 
     @property
     def output_shape(self):
-        return (4, 2, 16, 16)
+        return (512, 4)
 
     def prepare_init_args_and_inputs_for_common(self):
         init_dict = {