From de30cbadf24a6b2ee5a6476783d3e765714c38ae Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 17 Mar 2025 14:03:24 +0530
Subject: [PATCH 1/6] test for better torch.compile stuff.

---
 tests/pipelines/test_pipelines_common.py | 41 +++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index a98de5c9eaf9..0a9e6791b72c 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -13,6 +13,7 @@
 import torch.nn as nn
 from huggingface_hub import ModelCard, delete_repo
 from huggingface_hub.utils import is_jinja_available
+from torch._dynamo.utils import counters
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
 import diffusers
@@ -45,6 +46,7 @@
 from diffusers.utils.source_code_parsing_utils import ReturnNameVisitor
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
     require_accelerate_version_greater,
     require_accelerator,
     require_hf_hub_version_greater,
@@ -52,6 +54,7 @@
     require_torch_gpu,
     require_transformers_version_greater,
     skip_mps,
+    slow,
     torch_device,
 )
 
@@ -1113,8 +1116,9 @@ def setUp(self):
     def tearDown(self):
         # clean up the VRAM after each test in case of CUDA runtime errors
         super().tearDown()
+        torch._dynamo.reset()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_save_load_local(self, expected_max_difference=5e-4):
         components = self.get_dummy_components()
@@ -2153,6 +2157,41 @@ def test_StableDiffusionMixin_component(self):
                 )
             )
 
+    @require_torch_gpu
+    @slow
+    def test_torch_compile_recompilation(self):
+        inputs = self.get_dummy_inputs()
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components).to(torch_device)
+        if getattr(pipe, "unet", None) is None:
+            pipe.unet = torch.compile(pipe.unet, fullgraph=True)
+        else:
+            pipe.transformer = torch.compile(pipe.transformer, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True):
+            _ = pipe(**inputs)
+
+    @require_torch_gpu
+    @slow
+    def test_torch_compile_graph_breaks(self):
+        # Inspired by:
+        # https://github.com/pytorch/pytorch/blob/916e8979d3e0d651a9091732ce3e59da32e72b0e/test/dynamo/test_higher_order_ops.py#L138
+        counters.clear()
+
+        inputs = self.get_dummy_inputs()
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components).to(torch_device)
+        if getattr(pipe, "unet", None) is None:
+            pipe.unet = torch.compile(pipe.unet, fullgraph=True)
+        else:
+            pipe.transformer = torch.compile(pipe.transformer, fullgraph=True)
+
+        _ = pipe(**inputs)
+        num_graph_breaks = len(counters["graph_break"].keys())
+        self.assertEqual(num_graph_breaks, 0)
+
     @require_hf_hub_version_greater("0.26.5")
     @require_transformers_version_greater("4.47.1")
     def test_save_load_dduf(self, atol=1e-4, rtol=1e-4):
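A note on the testing pattern patch 1 introduces (patch 2 below fixes its inverted unet check and missing torch_device argument): compiling with fullgraph=True makes torch.compile raise on any graph break, and torch._dynamo.config.patch(error_on_recompile=True) turns silent recompilations into hard errors, so a denoiser that recompiles between identical calls fails loudly. A minimal, self-contained sketch of the same idea, using a toy module rather than a diffusers pipeline (module and shapes here are illustrative):

    import torch
    import torch.nn as nn

    # fullgraph=True: a graph break raises instead of silently splitting the graph
    model = torch.compile(nn.Linear(4, 4), fullgraph=True)

    x = torch.randn(2, 4)
    with torch._dynamo.config.patch(error_on_recompile=True):
        _ = model(x)  # first call compiles; initial compilation is not a "recompile"
        _ = model(x)  # same shape and dtype: must reuse the compiled graph
        # a call with a new input shape here may trigger a recompile and raise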
From f389a4d5eb933249b7fc5349d081af3b3adf6ecc Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 17 Mar 2025 14:15:31 +0530
Subject: [PATCH 2/6] fixes

---
 tests/pipelines/test_pipelines_common.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 0a9e6791b72c..cf3eab43aa9b 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -2160,11 +2160,11 @@ def test_StableDiffusionMixin_component(self):
     @require_torch_gpu
     @slow
     def test_torch_compile_recompilation(self):
-        inputs = self.get_dummy_inputs()
+        inputs = self.get_dummy_inputs(torch_device)
         components = self.get_dummy_components()
 
         pipe = self.pipeline_class(**components).to(torch_device)
-        if getattr(pipe, "unet", None) is None:
+        if getattr(pipe, "unet", None) is not None:
             pipe.unet = torch.compile(pipe.unet, fullgraph=True)
         else:
             pipe.transformer = torch.compile(pipe.transformer, fullgraph=True)
@@ -2179,11 +2179,11 @@ def test_torch_compile_graph_breaks(self):
         # https://github.com/pytorch/pytorch/blob/916e8979d3e0d651a9091732ce3e59da32e72b0e/test/dynamo/test_higher_order_ops.py#L138
         counters.clear()
 
-        inputs = self.get_dummy_inputs()
+        inputs = self.get_dummy_inputs(torch_device)
         components = self.get_dummy_components()
 
         pipe = self.pipeline_class(**components).to(torch_device)
-        if getattr(pipe, "unet", None) is None:
+        if getattr(pipe, "unet", None) is not None:
             pipe.unet = torch.compile(pipe.unet, fullgraph=True)
         else:
             pipe.transformer = torch.compile(pipe.transformer, fullgraph=True)

From 6791037c6451409de879b3fa3c73dc2b04d94100 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Fri, 21 Mar 2025 08:53:04 +0530
Subject: [PATCH 3/6] recompilation and graph break.

---
 tests/pipelines/test_pipelines_common.py | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index cf3eab43aa9b..f51048f150d9 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -13,7 +13,6 @@
 import torch.nn as nn
 from huggingface_hub import ModelCard, delete_repo
 from huggingface_hub.utils import is_jinja_available
-from torch._dynamo.utils import counters
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
 import diffusers
@@ -2159,7 +2158,7 @@ def test_StableDiffusionMixin_component(self):
 
     @require_torch_gpu
     @slow
-    def test_torch_compile_recompilation(self):
+    def test_torch_compile_recompilation_and_graph_break(self):
         inputs = self.get_dummy_inputs(torch_device)
         components = self.get_dummy_components()
 
@@ -2172,26 +2171,6 @@ def test_torch_compile_recompilation(self):
         with torch._dynamo.config.patch(error_on_recompile=True):
             _ = pipe(**inputs)
 
-    @require_torch_gpu
-    @slow
-    def test_torch_compile_graph_breaks(self):
-        # Inspired by:
-        # https://github.com/pytorch/pytorch/blob/916e8979d3e0d651a9091732ce3e59da32e72b0e/test/dynamo/test_higher_order_ops.py#L138
-        counters.clear()
-
-        inputs = self.get_dummy_inputs(torch_device)
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components).to(torch_device)
-        if getattr(pipe, "unet", None) is not None:
-            pipe.unet = torch.compile(pipe.unet, fullgraph=True)
-        else:
-            pipe.transformer = torch.compile(pipe.transformer, fullgraph=True)
-
-        _ = pipe(**inputs)
-        num_graph_breaks = len(counters["graph_break"].keys())
-        self.assertEqual(num_graph_breaks, 0)
-
     @require_hf_hub_version_greater("0.26.5")
     @require_transformers_version_greater("4.47.1")
     def test_save_load_dduf(self, atol=1e-4, rtol=1e-4):
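Patch 3 above folds the two tests into one: with fullgraph=True a graph break already surfaces as a hard error at compile time, so the separate counters-based assertion (and the counters import) is redundant. For reference, the dropped counter pattern remains useful when compiling without fullgraph, where breaks silently split the graph; a small sketch with a deliberate break (toy function, not the diffusers test):

    import torch
    from torch._dynamo.utils import counters

    def f(x):
        x = x.sin()
        torch._dynamo.graph_break()  # deliberate break, for illustration only
        return x.cos()

    counters.clear()
    compiled = torch.compile(f)  # no fullgraph: breaks are tolerated
    _ = compiled(torch.randn(4))
    # counters["graph_break"] maps break reasons to hit counts; the removed
    # test asserted this length was zero (expect 1 here)
    num_graph_breaks = len(counters["graph_break"].keys())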
From c7f153a2a23c69322039dec77f322ade3ec9afa3 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 14 Apr 2025 13:28:30 +0530
Subject: [PATCH 4/6] clear compilation cache.

---
 tests/pipelines/test_pipelines_common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 71b67c6f5d30..eeb245b485de 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -1112,6 +1112,7 @@ def callback_cfg_params(self) -> frozenset:
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
+        torch._dynamo.reset()
        gc.collect()
         backend_empty_cache(torch_device)
 
@@ -2167,6 +2168,7 @@ def test_StableDiffusionMixin_component(self):
     @require_torch_gpu
     @slow
     def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
         inputs = self.get_dummy_inputs(torch_device)
         components = self.get_dummy_components()
 

From e0566e6890660d8523f32b310ca031e7a04fdfe0 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 14 Apr 2025 17:29:54 +0530
Subject: [PATCH 5/6] change to modeling level test.

---
 tests/models/test_modeling_common.py         | 29 +++++++++++++++++++
 .../test_models_transformer_flux.py          |  4 +--
 tests/pipelines/test_pipelines_common.py     | 17 -----------
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index f82a2407f333..493405219c4e 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1714,6 +1714,35 @@ def test_push_to_hub_library_name(self):
         delete_repo(self.repo_id, token=TOKEN)
 
 
+class TorchCompileTesterMixin:
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        torch._dynamo.reset()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def tearDown(self):
+        # clean up the VRAM after each test in case of CUDA runtime errors
+        super().tearDown()
+        torch._dynamo.reset()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    @require_torch_gpu
+    @require_torch_2
+    @slow
+    def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+            _ = model(**inputs_dict)
+
+
 @slow
 @require_torch_2
 @require_torch_accelerator
diff --git a/tests/models/transformers/test_models_transformer_flux.py b/tests/models/transformers/test_models_transformer_flux.py
index c88b3dac8216..f767d2196e7c 100644
--- a/tests/models/transformers/test_models_transformer_flux.py
+++ b/tests/models/transformers/test_models_transformer_flux.py
@@ -22,7 +22,7 @@
 from diffusers.models.embeddings import ImageProjection
 from diffusers.utils.testing_utils import enable_full_determinism, torch_device
 
-from ..test_modeling_common import ModelTesterMixin
+from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
 
 
 enable_full_determinism()
@@ -78,7 +78,7 @@ def create_flux_ip_adapter_state_dict(model):
     return ip_state_dict
 
 
-class FluxTransformerTests(ModelTesterMixin, unittest.TestCase):
+class FluxTransformerTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
     model_class = FluxTransformer2DModel
     main_input_name = "hidden_states"
     # We override the items here because the transformer under consideration is small.
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index eeb245b485de..eb420d1d2f12 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -56,7 +56,6 @@
     require_torch_gpu,
     require_transformers_version_greater,
     skip_mps,
-    slow,
     torch_device,
 )
 
@@ -2165,22 +2164,6 @@ def test_StableDiffusionMixin_component(self):
                 )
             )
 
-    @require_torch_gpu
-    @slow
-    def test_torch_compile_recompilation_and_graph_break(self):
-        torch._dynamo.reset()
-        inputs = self.get_dummy_inputs(torch_device)
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components).to(torch_device)
-        if getattr(pipe, "unet", None) is not None:
-            pipe.unet = torch.compile(pipe.unet, fullgraph=True)
-        else:
-            pipe.transformer = torch.compile(pipe.transformer, fullgraph=True)
-
-        with torch._dynamo.config.patch(error_on_recompile=True):
-            _ = pipe(**inputs)
-
     @require_hf_hub_version_greater("0.26.5")
     @require_transformers_version_greater("4.47.1")
     def test_save_load_dduf(self, atol=1e-4, rtol=1e-4):
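Patch 5 above moves the check from pipeline level to model level, which keeps the compiled surface small (only the denoiser is compiled) and lets any model suite opt in by listing TorchCompileTesterMixin as a base, provided it already defines model_class and prepare_init_args_and_inputs_for_common(). A hypothetical adoption mirroring the Flux change (the test class below is illustrative and not part of this patch; SD3Transformer2DModel is an existing diffusers class):

    import unittest

    from diffusers import SD3Transformer2DModel

    from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin


    class SD3TransformerCompileTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
        model_class = SD3Transformer2DModel
        main_input_name = "hidden_states"
        # prepare_init_args_and_inputs_for_common() would be defined as in the
        # other model test suites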
From 87d957dace94c354eaf1deb80e4ed8160f0a8b9e Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 15 Apr 2025 07:42:28 +0530
Subject: [PATCH 6/6] allow running compilation tests during nightlies.

---
 .github/workflows/nightly_tests.yml      | 49 ++++++++++++++++++++++++
 .github/workflows/release_tests_fast.yml |  2 +-
 tests/models/test_modeling_common.py     |  2 +
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml
index 88343a128bb1..7696852ecd44 100644
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -180,6 +180,55 @@ jobs:
         pip install slack_sdk tabulate
         python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
 
+  run_torch_compile_tests:
+    name: PyTorch Compile CUDA tests
+
+    runs-on:
+      group: aws-g4dn-2xlarge
+
+    container:
+      image: diffusers/diffusers-pytorch-compile-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: |
+        nvidia-smi
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test,training]
+    - name: Environment
+      run: |
+        python utils/print_env.py
+    - name: Run torch compile tests on GPU
+      env:
+        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        RUN_COMPILE: yes
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_compile_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: torch_compile_test_reports
+        path: reports
+
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
   run_big_gpu_torch_tests:
     name: Torch tests on big GPU
     strategy:
diff --git a/.github/workflows/release_tests_fast.yml b/.github/workflows/release_tests_fast.yml
index 27bd9bd9bb42..9d65db2f0dee 100644
--- a/.github/workflows/release_tests_fast.yml
+++ b/.github/workflows/release_tests_fast.yml
@@ -335,7 +335,7 @@ jobs:
     - name: Environment
       run: |
         python utils/print_env.py
-    - name: Run example tests on GPU
+    - name: Run torch compile tests on GPU
      env:
         HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
         RUN_COMPILE: yes
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 493405219c4e..a7a42368a84d 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1731,6 +1731,7 @@ def tearDown(self):
 
     @require_torch_gpu
     @require_torch_2
+    @is_torch_compile
     @slow
     def test_torch_compile_recompilation_and_graph_break(self):
         torch._dynamo.reset()
@@ -1741,6 +1742,7 @@ def test_torch_compile_recompilation_and_graph_break(self):
 
         with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
             _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
 
 
 @slow
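With patch 6, the compile tests become doubly opt-in: the nightly job selects them by name with -k "compile" and exports RUN_COMPILE=yes, which the newly applied @is_torch_compile decorator presumably consults. A sketch of that gating pattern (an assumption about the decorator's behavior, not the actual diffusers source):

    import os
    import unittest

    # assumed gate: the nightly workflow above exports RUN_COMPILE=yes
    _run_compile_tests = os.getenv("RUN_COMPILE", "no").lower() in ("1", "yes", "true")

    def is_torch_compile(test_case):
        return unittest.skipUnless(_run_compile_tests, "compile tests are gated by RUN_COMPILE")(test_case)

Locally, the same selection the workflow uses reduces to something like RUN_COMPILE=yes python -m pytest -s -v -k "compile" tests/.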