From de30cbadf24a6b2ee5a6476783d3e765714c38ae Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 17 Mar 2025 14:03:24 +0530
Subject: [PATCH 1/6] test for better torch.compile stuff.

---
 tests/pipelines/test_pipelines_common.py | 41 +++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index a98de5c9eaf9..0a9e6791b72c 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -13,6 +13,7 @@
 import torch.nn as nn
 from huggingface_hub import ModelCard, delete_repo
 from huggingface_hub.utils import is_jinja_available
+from torch._dynamo.utils import counters
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
 import diffusers
@@ -45,6 +46,7 @@
 from diffusers.utils.source_code_parsing_utils import ReturnNameVisitor
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
     require_accelerate_version_greater,
     require_accelerator,
     require_hf_hub_version_greater,
@@ -52,6 +54,7 @@
     require_torch_gpu,
     require_transformers_version_greater,
     skip_mps,
+    slow,
     torch_device,
 )
 
@@ -1113,8 +1116,9 @@ def setUp(self):
     def tearDown(self):
         # clean up the VRAM after each test in case of CUDA runtime errors
         super().tearDown()
+        torch._dynamo.reset()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_save_load_local(self, expected_max_difference=5e-4):
         components = self.get_dummy_components()
@@ -2153,6 +2157,41 @@ def test_StableDiffusionMixin_component(self):
                 )
             )
 
+    @require_torch_gpu
+    @slow
+    def test_torch_compile_recompilation(self):
+        inputs = self.get_dummy_inputs()
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components).to(torch_device)
+        if getattr(pipe, "unet", None) is None:
+            pipe.unet = torch.compile(pipe.unet, fullgraph=True)
+        else:
+            pipe.transformer = torch.compile(pipe.transformer, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True):
+            _ = pipe(**inputs)
+
+    @require_torch_gpu
+    @slow
+    def test_torch_compile_graph_breaks(self):
+        # Inspired by:
+        # https://github.com/pytorch/pytorch/blob/916e8979d3e0d651a9091732ce3e59da32e72b0e/test/dynamo/test_higher_order_ops.py#L138
+        counters.clear()
+
+        inputs = self.get_dummy_inputs()
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components).to(torch_device)
+        if getattr(pipe, "unet", None) is None:
+            pipe.unet = torch.compile(pipe.unet, fullgraph=True)
+        else:
+            pipe.transformer = torch.compile(pipe.transformer, fullgraph=True)
+
+        _ = pipe(**inputs)
+        num_graph_breaks = len(counters["graph_break"].keys())
+        self.assertEqual(num_graph_breaks, 0)
+
     @require_hf_hub_version_greater("0.26.5")
     @require_transformers_version_greater("4.47.1")
     def test_save_load_dduf(self, atol=1e-4, rtol=1e-4):
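A note on the testing pattern patch 1 introduces (patch 2 below fixes its inverted unet check and missing torch_device argument): compiling with fullgraph=True makes torch.compile raise on any graph break, and torch._dynamo.config.patch(error_on_recompile=True) turns silent recompilations into hard errors, so a denoiser that recompiles between identical calls fails loudly. A minimal, self-contained sketch of the same idea, using a toy module rather than a diffusers pipeline (module and shapes here are illustrative):

    import torch
    import torch.nn as nn

    # fullgraph=True: a graph break raises instead of silently splitting the graph
    model = torch.compile(nn.Linear(4, 4), fullgraph=True)

    x = torch.randn(2, 4)
    with torch._dynamo.config.patch(error_on_recompile=True):
        _ = model(x)  # first call compiles; initial compilation is not a "recompile"
        _ = model(x)  # same shape and dtype: must reuse the compiled graph
        # a call with a new input shape here may trigger a recompile and raise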
From f389a4d5eb933249b7fc5349d081af3b3adf6ecc Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 17 Mar 2025 14:15:31 +0530
Subject: [PATCH 2/6] fixes

---
 tests/pipelines/test_pipelines_common.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 0a9e6791b72c..cf3eab43aa9b 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -2160,11 +2160,11 @@ def test_StableDiffusionMixin_component(self):
     @require_torch_gpu
     @slow
     def test_torch_compile_recompilation(self):
-        inputs = self.get_dummy_inputs()
+        inputs = self.get_dummy_inputs(torch_device)
         components = self.get_dummy_components()
 
         pipe = self.pipeline_class(**components).to(torch_device)
-        if getattr(pipe, "unet", None) is None:
+        if getattr(pipe, "unet", None) is not None:
             pipe.unet = torch.compile(pipe.unet, fullgraph=True)
         else:
             pipe.transformer = torch.compile(pipe.transformer, fullgraph=True)
@@ -2179,11 +2179,11 @@ def test_torch_compile_graph_breaks(self):
         # https://github.com/pytorch/pytorch/blob/916e8979d3e0d651a9091732ce3e59da32e72b0e/test/dynamo/test_higher_order_ops.py#L138
         counters.clear()
 
-        inputs = self.get_dummy_inputs()
+        inputs = self.get_dummy_inputs(torch_device)
         components = self.get_dummy_components()
 
         pipe = self.pipeline_class(**components).to(torch_device)
-        if getattr(pipe, "unet", None) is None:
+        if getattr(pipe, "unet", None) is not None:
             pipe.unet = torch.compile(pipe.unet, fullgraph=True)
         else:
             pipe.transformer = torch.compile(pipe.transformer, fullgraph=True)

From 6791037c6451409de879b3fa3c73dc2b04d94100 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Fri, 21 Mar 2025 08:53:04 +0530
Subject: [PATCH 3/6] recompilation and graph break.

---
 tests/pipelines/test_pipelines_common.py | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index cf3eab43aa9b..f51048f150d9 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -13,7 +13,6 @@
 import torch.nn as nn
 from huggingface_hub import ModelCard, delete_repo
 from huggingface_hub.utils import is_jinja_available
-from torch._dynamo.utils import counters
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
 import diffusers
@@ -2159,7 +2158,7 @@ def test_StableDiffusionMixin_component(self):
 
     @require_torch_gpu
     @slow
-    def test_torch_compile_recompilation(self):
+    def test_torch_compile_recompilation_and_graph_break(self):
         inputs = self.get_dummy_inputs(torch_device)
         components = self.get_dummy_components()
 
@@ -2172,26 +2171,6 @@ def test_torch_compile_recompilation(self):
         with torch._dynamo.config.patch(error_on_recompile=True):
             _ = pipe(**inputs)
 
-    @require_torch_gpu
-    @slow
-    def test_torch_compile_graph_breaks(self):
-        # Inspired by:
-        # https://github.com/pytorch/pytorch/blob/916e8979d3e0d651a9091732ce3e59da32e72b0e/test/dynamo/test_higher_order_ops.py#L138
-        counters.clear()
-
-        inputs = self.get_dummy_inputs(torch_device)
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components).to(torch_device)
-        if getattr(pipe, "unet", None) is not None:
-            pipe.unet = torch.compile(pipe.unet, fullgraph=True)
-        else:
-            pipe.transformer = torch.compile(pipe.transformer, fullgraph=True)
-
-        _ = pipe(**inputs)
-        num_graph_breaks = len(counters["graph_break"].keys())
-        self.assertEqual(num_graph_breaks, 0)
-
     @require_hf_hub_version_greater("0.26.5")
     @require_transformers_version_greater("4.47.1")
     def test_save_load_dduf(self, atol=1e-4, rtol=1e-4):
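Patch 3 above folds the two tests into one: with fullgraph=True a graph break already surfaces as a hard error at compile time, so the separate counters-based assertion (and the counters import) is redundant. For reference, the dropped counter pattern remains useful when compiling without fullgraph, where breaks silently split the graph; a small sketch with a deliberate break (toy function, not the diffusers test):

    import torch
    from torch._dynamo.utils import counters

    def f(x):
        x = x.sin()
        torch._dynamo.graph_break()  # deliberate break, for illustration only
        return x.cos()

    counters.clear()
    compiled = torch.compile(f)  # no fullgraph: breaks are tolerated
    _ = compiled(torch.randn(4))
    # counters["graph_break"] maps break reasons to hit counts; the removed
    # test asserted this length was zero (expect 1 here)
    num_graph_breaks = len(counters["graph_break"].keys())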
From c7f153a2a23c69322039dec77f322ade3ec9afa3 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 14 Apr 2025 13:28:30 +0530
Subject: [PATCH 4/6] clear compilation cache.

---
 tests/pipelines/test_pipelines_common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 71b67c6f5d30..eeb245b485de 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -1112,6 +1112,7 @@ def callback_cfg_params(self) -> frozenset:
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
+        torch._dynamo.reset()
        gc.collect()
         backend_empty_cache(torch_device)
 
@@ -2167,6 +2168,7 @@ def test_StableDiffusionMixin_component(self):
     @require_torch_gpu
     @slow
     def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
         inputs = self.get_dummy_inputs(torch_device)
         components = self.get_dummy_components()
 

From e0566e6890660d8523f32b310ca031e7a04fdfe0 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 14 Apr 2025 17:29:54 +0530
Subject: [PATCH 5/6] change to modeling level test.

---
 tests/models/test_modeling_common.py         | 29 +++++++++++++++++++
 .../test_models_transformer_flux.py          |  4 +--
 tests/pipelines/test_pipelines_common.py     | 17 -----------
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index f82a2407f333..493405219c4e 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1714,6 +1714,35 @@ def test_push_to_hub_library_name(self):
         delete_repo(self.repo_id, token=TOKEN)
 
 
+class TorchCompileTesterMixin:
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        torch._dynamo.reset()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def tearDown(self):
+        # clean up the VRAM after each test in case of CUDA runtime errors
+        super().tearDown()
+        torch._dynamo.reset()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    @require_torch_gpu
+    @require_torch_2
+    @slow
+    def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+            _ = model(**inputs_dict)
+
+
 @slow
 @require_torch_2
 @require_torch_accelerator
diff --git a/tests/models/transformers/test_models_transformer_flux.py b/tests/models/transformers/test_models_transformer_flux.py
index c88b3dac8216..f767d2196e7c 100644
--- a/tests/models/transformers/test_models_transformer_flux.py
+++ b/tests/models/transformers/test_models_transformer_flux.py
@@ -22,7 +22,7 @@
 from diffusers.models.embeddings import ImageProjection
 from diffusers.utils.testing_utils import enable_full_determinism, torch_device
 
-from ..test_modeling_common import ModelTesterMixin
+from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
 
 
 enable_full_determinism()
@@ -78,7 +78,7 @@ def create_flux_ip_adapter_state_dict(model):
     return ip_state_dict
 
 
-class FluxTransformerTests(ModelTesterMixin, unittest.TestCase):
+class FluxTransformerTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
     model_class = FluxTransformer2DModel
     main_input_name = "hidden_states"
     # We override the items here because the transformer under consideration is small.
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index eeb245b485de..eb420d1d2f12 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -56,7 +56,6 @@
     require_torch_gpu,
     require_transformers_version_greater,
     skip_mps,
-    slow,
     torch_device,
 )
 
@@ -2165,22 +2164,6 @@ def test_StableDiffusionMixin_component(self):
                 )
             )
 
-    @require_torch_gpu
-    @slow
-    def test_torch_compile_recompilation_and_graph_break(self):
-        torch._dynamo.reset()
-        inputs = self.get_dummy_inputs(torch_device)
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components).to(torch_device)
-        if getattr(pipe, "unet", None) is not None:
-            pipe.unet = torch.compile(pipe.unet, fullgraph=True)
-        else:
-            pipe.transformer = torch.compile(pipe.transformer, fullgraph=True)
-
-        with torch._dynamo.config.patch(error_on_recompile=True):
-            _ = pipe(**inputs)
-
     @require_hf_hub_version_greater("0.26.5")
     @require_transformers_version_greater("4.47.1")
     def test_save_load_dduf(self, atol=1e-4, rtol=1e-4):
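Patch 5 above moves the check from pipeline level to model level, which keeps the compiled surface small (only the denoiser is compiled) and lets any model suite opt in by listing TorchCompileTesterMixin as a base, provided it already defines model_class and prepare_init_args_and_inputs_for_common(). A hypothetical adoption mirroring the Flux change (the test class below is illustrative and not part of this patch; SD3Transformer2DModel is an existing diffusers class):

    import unittest

    from diffusers import SD3Transformer2DModel

    from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin


    class SD3TransformerCompileTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
        model_class = SD3Transformer2DModel
        main_input_name = "hidden_states"
        # prepare_init_args_and_inputs_for_common() would be defined as in the
        # other model test suites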
From 87d957dace94c354eaf1deb80e4ed8160f0a8b9e Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 15 Apr 2025 07:42:28 +0530
Subject: [PATCH 6/6] allow running compilation tests during nightlies.

---
 .github/workflows/nightly_tests.yml      | 49 ++++++++++++++++++++++++
 .github/workflows/release_tests_fast.yml |  2 +-
 tests/models/test_modeling_common.py     |  2 +
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml
index 88343a128bb1..7696852ecd44 100644
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -180,6 +180,55 @@ jobs:
         pip install slack_sdk tabulate
         python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
 
+  run_torch_compile_tests:
+    name: PyTorch Compile CUDA tests
+
+    runs-on:
+      group: aws-g4dn-2xlarge
+
+    container:
+      image: diffusers/diffusers-pytorch-compile-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: |
+        nvidia-smi
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test,training]
+    - name: Environment
+      run: |
+        python utils/print_env.py
+    - name: Run torch compile tests on GPU
+      env:
+        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        RUN_COMPILE: yes
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_compile_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: torch_compile_test_reports
+        path: reports
+
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
   run_big_gpu_torch_tests:
     name: Torch tests on big GPU
     strategy:
diff --git a/.github/workflows/release_tests_fast.yml b/.github/workflows/release_tests_fast.yml
index 27bd9bd9bb42..9d65db2f0dee 100644
--- a/.github/workflows/release_tests_fast.yml
+++ b/.github/workflows/release_tests_fast.yml
@@ -335,7 +335,7 @@ jobs:
     - name: Environment
       run: |
         python utils/print_env.py
-    - name: Run example tests on GPU
+    - name: Run torch compile tests on GPU
      env:
         HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
         RUN_COMPILE: yes
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 493405219c4e..a7a42368a84d 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1731,6 +1731,7 @@ def tearDown(self):
 
     @require_torch_gpu
     @require_torch_2
+    @is_torch_compile
     @slow
     def test_torch_compile_recompilation_and_graph_break(self):
         torch._dynamo.reset()
@@ -1741,6 +1742,7 @@ def test_torch_compile_recompilation_and_graph_break(self):
 
         with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
             _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
 
 
 @slow
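With patch 6, the compile tests become doubly opt-in: the nightly job selects them by name with -k "compile" and exports RUN_COMPILE=yes, which the newly applied @is_torch_compile decorator presumably consults. A sketch of that gating pattern (an assumption about the decorator's behavior, not the actual diffusers source):

    import os
    import unittest

    # assumed gate: the nightly workflow above exports RUN_COMPILE=yes
    _run_compile_tests = os.getenv("RUN_COMPILE", "no").lower() in ("1", "yes", "true")

    def is_torch_compile(test_case):
        return unittest.skipUnless(_run_compile_tests, "compile tests are gated by RUN_COMPILE")(test_case)

Locally, the same selection the workflow uses reduces to something like RUN_COMPILE=yes python -m pytest -s -v -k "compile" tests/.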