
Commit 78b269a

Merge branch 'main' into sd3-xformers

2 parents: 155a846 + fe79489

57 files changed: +375 −200 lines

.github/ISSUE_TEMPLATE/bug-report.yml

Lines changed: 1 addition & 1 deletion

@@ -73,7 +73,7 @@ body:
       - ControlNet @sayakpaul @yiyixuxu @DN6
       - T2I Adapter @sayakpaul @yiyixuxu @DN6
       - IF @DN6
-      - Text-to-Video / Video-to-Video @DN6 @sayakpaul
+      - Text-to-Video / Video-to-Video @DN6 @a-r-r-o-w
       - Wuerstchen @DN6
       - Other: @yiyixuxu @DN6
       - Improving generation quality: @asomoza

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 0 deletions

@@ -49,6 +49,7 @@ Core library:
 Integrations:

 - deepspeed: HF Trainer/Accelerate: @SunMarc
+- PEFT: @sayakpaul @BenjaminBossan

 HF projects:
Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+name: SSH into PR runners
+
+on:
+  workflow_dispatch:
+    inputs:
+      docker_image:
+        description: 'Name of the Docker image'
+        required: true
+
+env:
+  IS_GITHUB_CI: "1"
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  HF_HOME: /mnt/cache
+  DIFFUSERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  RUN_SLOW: yes
+
+jobs:
+  ssh_runner:
+    name: "SSH"
+    runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci]
+    container:
+      image: ${{ github.event.inputs.docker_image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --privileged
+
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+
+      - name: Tailscale  # In order to be able to SSH when a test fails
+        uses: huggingface/tailscale-action@main
+        with:
+          authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
+          slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
+          slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+          waitForSSH: true
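This new workflow is started manually through `workflow_dispatch`, with the Docker image as its one required input. As an illustrative sketch only (not part of this commit), it could also be dispatched programmatically through GitHub's REST API; the repository, workflow file name, and image name below are assumptions, since none of them are shown in this diff view.

```python
# Sketch: trigger the "SSH into PR runners" workflow via GitHub's
# workflow_dispatch REST endpoint. Names marked as assumptions below
# are illustrative, not taken from this commit.
import os
import requests

REPO = "huggingface/diffusers"       # assumption: target repository
WORKFLOW_FILE = "ssh-pr-runner.yml"  # assumption: the new workflow's file name

resp = requests.post(
    f"https://api.github.com/repos/{REPO}/actions/workflows/{WORKFLOW_FILE}/dispatches",
    headers={
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
        "Accept": "application/vnd.github+json",
    },
    json={
        "ref": "main",
        # `docker_image` is the required input declared by the workflow
        "inputs": {"docker_image": "diffusers/diffusers-pytorch-cpu"},  # example value
    },
    timeout=30,
)
resp.raise_for_status()  # GitHub returns 204 No Content on success
```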

.github/workflows/ssh-runner.yml

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-name: SSH into runners
+name: SSH into GPU runners

 on:
   workflow_dispatch:

docker/diffusers-pytorch-compile-cuda/Dockerfile

Lines changed: 7 additions & 7 deletions

@@ -16,24 +16,24 @@ RUN apt install -y bash \
     ca-certificates \
     libsndfile1-dev \
     libgl1 \
-    python3.9 \
-    python3.9-dev \
+    python3.10 \
+    python3.10-dev \
     python3-pip \
-    python3.9-venv && \
+    python3.10-venv && \
     rm -rf /var/lib/apt/lists

 # make sure to use venv
-RUN python3.9 -m venv /opt/venv
+RUN python3.10 -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"

 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-RUN python3.9 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
-    python3.9 -m uv pip install --no-cache-dir \
+RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3.10 -m uv pip install --no-cache-dir \
     torch \
     torchvision \
     torchaudio \
     invisible_watermark && \
-    python3.9 -m pip install --no-cache-dir \
+    python3.10 -m pip install --no-cache-dir \
     accelerate \
     datasets \
     hf-doc-builder \

docker/diffusers-pytorch-cpu/Dockerfile

Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@ RUN apt install -y bash \
     ca-certificates \
     libsndfile1-dev \
     python3.10 \
+    python3.10-dev \
     python3-pip \
     libgl1 \
     python3.10-venv && \

docker/diffusers-pytorch-cuda/Dockerfile

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@ RUN apt install -y bash \
     libsndfile1-dev \
     libgl1 \
     python3.10 \
+    python3.10-dev \
     python3-pip \
     python3.10-venv && \
     rm -rf /var/lib/apt/lists

docker/diffusers-pytorch-xformers-cuda/Dockerfile

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@ RUN apt install -y bash \
     libsndfile1-dev \
     libgl1 \
     python3.10 \
+    python3.10-dev \
     python3-pip \
     python3.10-venv && \
     rm -rf /var/lib/apt/lists

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions

@@ -332,6 +332,8 @@
     title: Latent Consistency Models
   - local: api/pipelines/latent_diffusion
     title: Latent Diffusion
+  - local: api/pipelines/latte
+    title: Latte
   - local: api/pipelines/ledits_pp
     title: LEDITS++
   - local: api/pipelines/lumina
docs/source/en/api/pipelines/latte.md

Lines changed: 75 additions & 0 deletions

@@ -0,0 +1,75 @@
<!-- # Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License. -->

# Latte

![latte text-to-video](https://github.com/Vchitect/Latte/blob/52bc0029899babbd6e9250384c83d8ed2670ff7a/visuals/latte.gif?raw=true)

[Latte: Latent Diffusion Transformer for Video Generation](https://arxiv.org/abs/2401.03048) from Monash University, Shanghai AI Lab, Nanjing University, and Nanyang Technological University.

The abstract from the paper is:

*We propose a novel Latent Diffusion Transformer, namely Latte, for video generation. Latte first extracts spatio-temporal tokens from input videos and then adopts a series of Transformer blocks to model video distribution in the latent space. In order to model a substantial number of tokens extracted from videos, four efficient variants are introduced from the perspective of decomposing the spatial and temporal dimensions of input videos. To improve the quality of generated videos, we determine the best practices of Latte through rigorous experimental analysis, including video clip patch embedding, model variants, timestep-class information injection, temporal positional embedding, and learning strategies. Our comprehensive evaluation demonstrates that Latte achieves state-of-the-art performance across four standard video generation datasets, i.e., FaceForensics, SkyTimelapse, UCF101, and Taichi-HD. In addition, we extend Latte to text-to-video generation (T2V) task, where Latte achieves comparable results compared to recent T2V models. We strongly believe that Latte provides valuable insights for future research on incorporating Transformers into diffusion models for video generation.*

**Highlights**: Latte is a latent diffusion transformer proposed as a backbone for modeling different modalities (trained for text-to-video generation here). It achieves state-of-the-art performance across four standard video benchmarks: [FaceForensics](https://arxiv.org/abs/1803.09179), [SkyTimelapse](https://arxiv.org/abs/1709.07592), [UCF101](https://arxiv.org/abs/1212.0402), and [Taichi-HD](https://arxiv.org/abs/2003.00196). To prepare and download the datasets for evaluation, please refer to [these instructions](https://github.com/Vchitect/Latte/blob/main/docs/datasets_evaluation.md).

<Tip>

Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.

</Tip>

### Inference

Use [`torch.compile`](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) to reduce the inference latency.

First, load the pipeline:

```python
import torch
from diffusers import LattePipeline

pipeline = LattePipeline.from_pretrained(
    "maxin-cn/Latte-1", torch_dtype=torch.float16
).to("cuda")
```
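If GPU memory is tight, the standard diffusers offloading API can be used when loading the pipeline instead of moving everything to CUDA up front. A minimal sketch, assuming the generic `DiffusionPipeline.enable_model_cpu_offload` behavior (requires `accelerate`), nothing Latte-specific:

```python
import torch
from diffusers import LattePipeline

# Sketch: load in fp16 and let accelerate move each submodule to the GPU
# only while it is needed, trading some speed for lower VRAM use.
pipeline = LattePipeline.from_pretrained(
    "maxin-cn/Latte-1", torch_dtype=torch.float16
)
pipeline.enable_model_cpu_offload()
```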
Then change the memory layout of the pipeline's `transformer` and `vae` components to `torch.channels_last`:

```python
pipeline.transformer.to(memory_format=torch.channels_last)
pipeline.vae.to(memory_format=torch.channels_last)
```
Finally, compile the components and run inference:

```python
pipeline.transformer = torch.compile(pipeline.transformer)
pipeline.vae.decode = torch.compile(pipeline.vae.decode)

video = pipeline(prompt="A dog wearing sunglasses floating in space, surreal, nebulae in background").frames[0]
```
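The call above returns a list of PIL frames. To write them to disk, the export helper in `diffusers.utils` can be used (a short sketch; the output file name is arbitrary):

```python
from diffusers.utils import export_to_gif

# `video` is the list of PIL frames returned by the pipeline call above
export_to_gif(video, "latte.gif")
```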
The [benchmark](https://gist.github.com/a-r-r-o-w/4e1694ca46374793c0361d740a99ff19) results on an 80GB A100 machine are:

```
Without torch.compile(): Average inference time: 16.246 seconds.
With torch.compile(): Average inference time: 14.573 seconds.
```
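The linked gist contains the full benchmark script; a minimal version of such a timing loop might look like the sketch below. This is illustrative only (it assumes a CUDA-resident pipeline and is not the gist's exact code):

```python
import time
import torch

def average_inference_time(pipeline, prompt, n_runs=5):
    # Warmup run so torch.compile's one-time compilation cost is excluded
    pipeline(prompt=prompt)
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(n_runs):
        pipeline(prompt=prompt)
    # Wait for all queued GPU work to finish before stopping the clock
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / n_runs
```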
## LattePipeline

[[autodoc]] LattePipeline
  - all
  - __call__
