
Commit f5b69b0

resolve conflicts.

Merge commit f5b69b0 · 2 parents: 41ea4c8 + 3be6706

19 files changed · +1945 -93 lines changed


.github/workflows/pr_tests_gpu.yml

Lines changed: 44 additions & 0 deletions
@@ -28,7 +28,51 @@ env:
   PIPELINE_USAGE_CUTOFF: 1000000000 # set high cutoff so that only always-test pipelines run
 
 jobs:
+  check_code_quality:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[quality]
+      - name: Check quality
+        run: make quality
+      - name: Check if failure
+        if: ${{ failure() }}
+        run: |
+          echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY
+
+  check_repository_consistency:
+    needs: check_code_quality
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[quality]
+      - name: Check repo consistency
+        run: |
+          python utils/check_copies.py
+          python utils/check_dummies.py
+          python utils/check_support_list.py
+          make deps_table_check_updated
+      - name: Check if failure
+        if: ${{ failure() }}
+        run: |
+          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
+
   setup_torch_cuda_pipeline_matrix:
+    needs: [check_code_quality, check_repository_consistency]
     name: Setup Torch Pipelines CUDA Slow Tests Matrix
     runs-on:
       group: aws-general-8-plus

docs/source/en/api/pipelines/ltx_video.md

Lines changed: 6 additions & 0 deletions
@@ -196,6 +196,12 @@ export_to_video(video, "ship.mp4", fps=24)
   - all
   - __call__
 
+## LTXConditionPipeline
+
+[[autodoc]] LTXConditionPipeline
+  - all
+  - __call__
+
 ## LTXPipelineOutput
 
 [[autodoc]] pipelines.ltx.pipeline_output.LTXPipelineOutput

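The new docs entry above only adds autodoc directives, so as a point of reference, here is a minimal usage sketch for the newly exported `LTXConditionPipeline`. It is not part of this commit; the checkpoint id and call parameters below are assumptions chosen to mirror the existing LTX examples.

```python
# Hypothetical sketch, not from this commit: drive LTXConditionPipeline end to end.
# The checkpoint id and the generation parameters are assumptions.
import torch
from diffusers import LTXConditionPipeline
from diffusers.utils import export_to_video

pipe = LTXConditionPipeline.from_pretrained(
    "Lightricks/LTX-Video-0.9.5",  # assumed 0.9.5 checkpoint id
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")

video = pipe(
    prompt="A ship sailing through a stormy sea at night",
    negative_prompt="worst quality, blurry, jittery",
    width=704,
    height=480,
    num_frames=121,
    num_inference_steps=40,
).frames[0]
export_to_video(video, "ship.mp4", fps=24)
```
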
examples/research_projects/pytorch_xla/inference/flux/README.md

Lines changed: 8 additions & 7 deletions
@@ -1,8 +1,6 @@
 # Generating images using Flux and PyTorch/XLA
 
-The `flux_inference` script shows how to do image generation using Flux on TPU devices using PyTorch/XLA. It uses the pallas kernel for flash attention for faster generation.
-
-It has been tested on [Trillium](https://cloud.google.com/blog/products/compute/introducing-trillium-6th-gen-tpus) TPU versions. No other TPU types have been tested.
+The `flux_inference` script shows how to do image generation using Flux on TPU devices using PyTorch/XLA. It uses the pallas kernel for flash attention for faster generation using custom flash block sizes for better performance on [Trillium](https://cloud.google.com/blog/products/compute/introducing-trillium-6th-gen-tpus) TPU versions. No other TPU types have been tested.
 
 ## Create TPU
 
@@ -23,20 +21,23 @@ Verify that PyTorch and PyTorch/XLA were installed correctly:
 python3 -c "import torch; import torch_xla;"
 ```
 
-Install dependencies
+Clone the diffusers repo and install dependencies
 
 ```bash
+git clone https://github.com/huggingface/diffusers.git
+cd diffusers
 pip install transformers accelerate sentencepiece structlog
-pushd ../../..
 pip install .
-popd
+cd examples/research_projects/pytorch_xla/inference/flux/
 ```
 
 ## Run the inference job
 
 ### Authenticate
 
-Run the following command to authenticate your token in order to download Flux weights.
+**Gated Model**
+
+As the model is gated, before using it with diffusers you first need to go to the [FLUX.1 [dev] Hugging Face page](https://huggingface.co/black-forest-labs/FLUX.1-dev), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
 
 ```bash
 huggingface-cli login

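For orientation only, the steps above boil down to something like the following. This is a hypothetical minimal sketch rather than the `flux_inference` script itself: the prompt, step count, and guidance value are placeholders, and a real TPU run would also use the custom flash-attention block sizes mentioned above.

```python
# Hypothetical minimal sketch of Flux inference on a TPU via PyTorch/XLA.
# Not the flux_inference script; parameters are placeholders.
import torch
import torch_xla.core.xla_model as xm
from diffusers import FluxPipeline

device = xm.xla_device()  # the TPU device exposed by torch_xla

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
)
pipe.to(device)

image = pipe(
    prompt="A photo of a ship sailing at sunset",
    num_inference_steps=28,
    guidance_scale=3.5,
).images[0]
image.save("flux_tpu_sample.png")
```
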
scripts/convert_ltx_to_diffusers.py

Lines changed: 89 additions & 15 deletions
@@ -74,17 +74,39 @@ def remove_keys_(key: str, state_dict: Dict[str, Any]):
     "last_scale_shift_table": "scale_shift_table",
 }
 
+VAE_095_RENAME_DICT = {
+    # decoder
+    "up_blocks.0": "mid_block",
+    "up_blocks.1": "up_blocks.0.upsamplers.0",
+    "up_blocks.2": "up_blocks.0",
+    "up_blocks.3": "up_blocks.1.upsamplers.0",
+    "up_blocks.4": "up_blocks.1",
+    "up_blocks.5": "up_blocks.2.upsamplers.0",
+    "up_blocks.6": "up_blocks.2",
+    "up_blocks.7": "up_blocks.3.upsamplers.0",
+    "up_blocks.8": "up_blocks.3",
+    # encoder
+    "down_blocks.0": "down_blocks.0",
+    "down_blocks.1": "down_blocks.0.downsamplers.0",
+    "down_blocks.2": "down_blocks.1",
+    "down_blocks.3": "down_blocks.1.downsamplers.0",
+    "down_blocks.4": "down_blocks.2",
+    "down_blocks.5": "down_blocks.2.downsamplers.0",
+    "down_blocks.6": "down_blocks.3",
+    "down_blocks.7": "down_blocks.3.downsamplers.0",
+    "down_blocks.8": "mid_block",
+    # common
+    "last_time_embedder": "time_embedder",
+    "last_scale_shift_table": "scale_shift_table",
+}
+
 VAE_SPECIAL_KEYS_REMAP = {
     "per_channel_statistics.channel": remove_keys_,
     "per_channel_statistics.mean-of-means": remove_keys_,
     "per_channel_statistics.mean-of-stds": remove_keys_,
     "model.diffusion_model": remove_keys_,
 }
 
-VAE_091_SPECIAL_KEYS_REMAP = {
-    "timestep_scale_multiplier": remove_keys_,
-}
-
 
 def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
     state_dict = saved_dict
@@ -104,12 +126,16 @@ def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, new_key:
 def convert_transformer(
     ckpt_path: str,
     dtype: torch.dtype,
+    version: str = "0.9.0",
 ):
     PREFIX_KEY = "model.diffusion_model."
 
     original_state_dict = get_state_dict(load_file(ckpt_path))
+    config = {}
+    if version == "0.9.5":
+        config["_use_causal_rope_fix"] = True
     with init_empty_weights():
-        transformer = LTXVideoTransformer3DModel()
+        transformer = LTXVideoTransformer3DModel(**config)
 
     for key in list(original_state_dict.keys()):
         new_key = key[:]
@@ -161,12 +187,19 @@ def get_vae_config(version: str) -> Dict[str, Any]:
             "out_channels": 3,
             "latent_channels": 128,
             "block_out_channels": (128, 256, 512, 512),
+            "down_block_types": (
+                "LTXVideoDownBlock3D",
+                "LTXVideoDownBlock3D",
+                "LTXVideoDownBlock3D",
+                "LTXVideoDownBlock3D",
+            ),
             "decoder_block_out_channels": (128, 256, 512, 512),
             "layers_per_block": (4, 3, 3, 3, 4),
             "decoder_layers_per_block": (4, 3, 3, 3, 4),
             "spatio_temporal_scaling": (True, True, True, False),
             "decoder_spatio_temporal_scaling": (True, True, True, False),
             "decoder_inject_noise": (False, False, False, False, False),
+            "downsample_type": ("conv", "conv", "conv", "conv"),
             "upsample_residual": (False, False, False, False),
             "upsample_factor": (1, 1, 1, 1),
             "patch_size": 4,
@@ -183,12 +216,19 @@ def get_vae_config(version: str) -> Dict[str, Any]:
             "out_channels": 3,
             "latent_channels": 128,
             "block_out_channels": (128, 256, 512, 512),
+            "down_block_types": (
+                "LTXVideoDownBlock3D",
+                "LTXVideoDownBlock3D",
+                "LTXVideoDownBlock3D",
+                "LTXVideoDownBlock3D",
+            ),
             "decoder_block_out_channels": (256, 512, 1024),
             "layers_per_block": (4, 3, 3, 3, 4),
             "decoder_layers_per_block": (5, 6, 7, 8),
             "spatio_temporal_scaling": (True, True, True, False),
             "decoder_spatio_temporal_scaling": (True, True, True),
             "decoder_inject_noise": (True, True, True, False),
+            "downsample_type": ("conv", "conv", "conv", "conv"),
             "upsample_residual": (True, True, True),
             "upsample_factor": (2, 2, 2),
             "timestep_conditioning": True,
@@ -200,7 +240,38 @@ def get_vae_config(version: str) -> Dict[str, Any]:
             "decoder_causal": False,
         }
         VAE_KEYS_RENAME_DICT.update(VAE_091_RENAME_DICT)
-        VAE_SPECIAL_KEYS_REMAP.update(VAE_091_SPECIAL_KEYS_REMAP)
+    elif version == "0.9.5":
+        config = {
+            "in_channels": 3,
+            "out_channels": 3,
+            "latent_channels": 128,
+            "block_out_channels": (128, 256, 512, 1024, 2048),
+            "down_block_types": (
+                "LTXVideo095DownBlock3D",
+                "LTXVideo095DownBlock3D",
+                "LTXVideo095DownBlock3D",
+                "LTXVideo095DownBlock3D",
+            ),
+            "decoder_block_out_channels": (256, 512, 1024),
+            "layers_per_block": (4, 6, 6, 2, 2),
+            "decoder_layers_per_block": (5, 5, 5, 5),
+            "spatio_temporal_scaling": (True, True, True, True),
+            "decoder_spatio_temporal_scaling": (True, True, True),
+            "decoder_inject_noise": (False, False, False, False),
+            "downsample_type": ("spatial", "temporal", "spatiotemporal", "spatiotemporal"),
+            "upsample_residual": (True, True, True),
+            "upsample_factor": (2, 2, 2),
+            "timestep_conditioning": True,
+            "patch_size": 4,
+            "patch_size_t": 1,
+            "resnet_norm_eps": 1e-6,
+            "scaling_factor": 1.0,
+            "encoder_causal": True,
+            "decoder_causal": False,
+            "spatial_compression_ratio": 32,
+            "temporal_compression_ratio": 8,
+        }
+        VAE_KEYS_RENAME_DICT.update(VAE_095_RENAME_DICT)
     return config
 
 
@@ -223,7 +294,7 @@ def get_args():
     parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
     parser.add_argument("--dtype", default="fp32", help="Torch dtype to save the model in.")
     parser.add_argument(
-        "--version", type=str, default="0.9.0", choices=["0.9.0", "0.9.1"], help="Version of the LTX model"
+        "--version", type=str, default="0.9.0", choices=["0.9.0", "0.9.1", "0.9.5"], help="Version of the LTX model"
     )
     return parser.parse_args()
 
@@ -277,14 +348,17 @@ def get_args():
     for param in text_encoder.parameters():
         param.data = param.data.contiguous()
 
-    scheduler = FlowMatchEulerDiscreteScheduler(
-        use_dynamic_shifting=True,
-        base_shift=0.95,
-        max_shift=2.05,
-        base_image_seq_len=1024,
-        max_image_seq_len=4096,
-        shift_terminal=0.1,
-    )
+    if args.version == "0.9.5":
+        scheduler = FlowMatchEulerDiscreteScheduler(use_dynamic_shifting=False)
+    else:
+        scheduler = FlowMatchEulerDiscreteScheduler(
+            use_dynamic_shifting=True,
+            base_shift=0.95,
+            max_shift=2.05,
+            base_image_seq_len=1024,
+            max_image_seq_len=4096,
+            shift_terminal=0.1,
+        )
 
     pipe = LTXPipeline(
         scheduler=scheduler,

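After conversion, the directory written to `--output_path` should load like any other diffusers checkpoint. A hedged sketch (the local path is a placeholder for whatever was passed to the script; bfloat16 is just an example dtype):

```python
# Hypothetical follow-up: load the pipeline saved by convert_ltx_to_diffusers.py.
# "path/to/converted-ltx" stands in for the --output_path given to the script.
import torch
from diffusers import LTXPipeline

pipe = LTXPipeline.from_pretrained("path/to/converted-ltx", torch_dtype=torch.bfloat16)
pipe.to("cuda")
```
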
src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -402,6 +402,7 @@
         "LDMTextToImagePipeline",
         "LEditsPPPipelineStableDiffusion",
         "LEditsPPPipelineStableDiffusionXL",
+        "LTXConditionPipeline",
         "LTXImageToVideoPipeline",
         "LTXPipeline",
         "Lumina2Pipeline",
@@ -947,6 +948,7 @@
         LDMTextToImagePipeline,
         LEditsPPPipelineStableDiffusion,
         LEditsPPPipelineStableDiffusionXL,
+        LTXConditionPipeline,
         LTXImageToVideoPipeline,
         LTXPipeline,
         Lumina2Pipeline,

src/diffusers/hooks/group_offloading.py

Lines changed: 19 additions & 11 deletions
@@ -98,12 +98,13 @@ def onload_(self):
                     buffer.data = buffer.data.to(self.onload_device, non_blocking=self.non_blocking)
                     if self.record_stream:
                         buffer.data.record_stream(current_stream)
-            
+
         if self.parameters is not None:
             for param in self.parameters:
                 param.data = param.data.to(self.onload_device, non_blocking=self.non_blocking)
                 if self.record_stream:
                     param.data.record_stream(current_stream)
+
         if self.buffers is not None:
             for buffer in self.buffers:
                 buffer.data = buffer.data.to(self.onload_device, non_blocking=self.non_blocking)
@@ -198,6 +199,13 @@ def __init__(self):
         self._layer_execution_tracker_module_names = set()
 
     def initialize_hook(self, module):
+        def make_execution_order_update_callback(current_name, current_submodule):
+            def callback():
+                logger.debug(f"Adding {current_name} to the execution order")
+                self.execution_order.append((current_name, current_submodule))
+
+            return callback
+
         # To every submodule that contains a group offloading hook (at this point, no prefetching is enabled for any
         # of the groups), we add a layer execution tracker hook that will be used to determine the order in which the
         # layers are executed during the forward pass.
@@ -209,14 +217,8 @@ def initialize_hook(self, module):
             group_offloading_hook = registry.get_hook(_GROUP_OFFLOADING)
 
             if group_offloading_hook is not None:
-
-                def make_execution_order_update_callback(current_name, current_submodule):
-                    def callback():
-                        logger.debug(f"Adding {current_name} to the execution order")
-                        self.execution_order.append((current_name, current_submodule))
-
-                    return callback
-
+                # For the first forward pass, we have to load in a blocking manner
+                group_offloading_hook.group.non_blocking = False
                 layer_tracker_hook = LayerExecutionTrackerHook(make_execution_order_update_callback(name, submodule))
                 registry.register_hook(layer_tracker_hook, _LAYER_EXECUTION_TRACKER)
                 self._layer_execution_tracker_module_names.add(name)
@@ -246,15 +248,21 @@ def post_forward(self, module, output):
         # Remove the layer execution tracker hooks from the submodules
         base_module_registry = module._diffusers_hook
         registries = [submodule._diffusers_hook for _, submodule in self.execution_order]
+        group_offloading_hooks = [registry.get_hook(_GROUP_OFFLOADING) for registry in registries]
 
         for i in range(num_executed):
             registries[i].remove_hook(_LAYER_EXECUTION_TRACKER, recurse=False)
 
         # Remove the current lazy prefetch group offloading hook so that it doesn't interfere with the next forward pass
         base_module_registry.remove_hook(_LAZY_PREFETCH_GROUP_OFFLOADING, recurse=False)
 
-        # Apply lazy prefetching by setting required attributes
-        group_offloading_hooks = [registry.get_hook(_GROUP_OFFLOADING) for registry in registries]
+        # LazyPrefetchGroupOffloadingHook is only used with streams, so we know that non_blocking should be True.
+        # We disable non_blocking for the first forward pass, but need to enable it for the subsequent passes to
+        # see the benefits of prefetching.
+        for hook in group_offloading_hooks:
+            hook.group.non_blocking = True
+
+        # Set required attributes for prefetching
         if num_executed > 0:
             base_module_group_offloading_hook = base_module_registry.get_hook(_GROUP_OFFLOADING)
             base_module_group_offloading_hook.next_group = group_offloading_hooks[0].group

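To make the intent of the `non_blocking` toggling above concrete, here is a small self-contained sketch of the same idea, not diffusers code: the first forward pass loads each group synchronously while the execution order is recorded, and later passes switch to non-blocking transfers so the recorded order can be used for prefetching.

```python
# Toy illustration of the blocking-first, non-blocking-later pattern (not diffusers code).
class ToyOffloadGroup:
    def __init__(self, name):
        self.name = name
        self.non_blocking = False  # first pass: load synchronously

    def onload(self):
        mode = "non-blocking" if self.non_blocking else "blocking"
        print(f"onloading {self.name} ({mode})")


execution_order = []


def run_forward(groups, first_pass):
    for group in groups:
        group.onload()
        if first_pass:
            # Mirrors the layer execution tracker: record the order in which
            # groups are used during the first (blocking) pass.
            execution_order.append(group.name)


groups = [ToyOffloadGroup("blocks.0"), ToyOffloadGroup("blocks.1")]
run_forward(groups, first_pass=True)

# After the first pass, re-enable non-blocking transfers so later passes can
# prefetch the next group (per execution_order) while the current one computes.
for group in groups:
    group.non_blocking = True
run_forward(groups, first_pass=False)
```
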