diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 696097fd5473..ff915e046946 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -23,7 +23,7 @@ jobs:
     runs-on:
       group: aws-g6-4xlarge-plus
     container:
-      image: diffusers/diffusers-pytorch-cuda
+      image: diffusers/diffusers-pytorch-compile-cuda
       options: --shm-size "16gb" --ipc host --gpus 0
     steps:
       - name: Checkout diffusers
diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml
index b73faea231dc..340d8a19e17a 100644
--- a/.github/workflows/build_docker_images.yml
+++ b/.github/workflows/build_docker_images.yml
@@ -41,12 +41,6 @@ jobs:
         run: |
           CHANGED_FILES="${{ steps.file_changes.outputs.all }}"
           for FILE in $CHANGED_FILES; do
-            # skip anything that isn’t still on disk
-            if [[ ! -f "$FILE" ]]; then
-              echo "Skipping removed file $FILE"
-              continue
-            fi
-
             if [[ "$FILE" == docker/*Dockerfile ]]; then
               DOCKER_PATH="${FILE%/Dockerfile}"
               DOCKER_TAG=$(basename "$DOCKER_PATH")
@@ -71,7 +65,7 @@ jobs:
         image-name:
           - diffusers-pytorch-cpu
           - diffusers-pytorch-cuda
-          - diffusers-pytorch-cuda
+          - diffusers-pytorch-compile-cuda
           - diffusers-pytorch-xformers-cuda
           - diffusers-pytorch-minimum-cuda
           - diffusers-flax-cpu
diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml
index b4c973711e9d..4f92717df8b7 100644
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -188,7 +188,7 @@ jobs:
       group: aws-g4dn-2xlarge

     container:
-      image: diffusers/diffusers-pytorch-cuda
+      image: diffusers/diffusers-pytorch-compile-cuda
       options: --gpus 0 --shm-size "16gb" --ipc host

     steps:
diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 7cab08b44fcd..abf825eaa7a0 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -262,7 +262,7 @@ jobs:
       group: aws-g4dn-2xlarge

     container:
-      image: diffusers/diffusers-pytorch-cuda
+      image: diffusers/diffusers-pytorch-compile-cuda
       options: --gpus 0 --shm-size "16gb" --ipc host

     steps:
diff --git a/.github/workflows/release_tests_fast.yml b/.github/workflows/release_tests_fast.yml
index a464381ba48a..9d65db2f0dee 100644
--- a/.github/workflows/release_tests_fast.yml
+++ b/.github/workflows/release_tests_fast.yml
@@ -316,7 +316,7 @@ jobs:
       group: aws-g4dn-2xlarge

     container:
-      image: diffusers/diffusers-pytorch-cuda
+      image: diffusers/diffusers-pytorch-compile-cuda
       options: --gpus 0 --shm-size "16gb" --ipc host

     steps:
diff --git a/docker/diffusers-pytorch-compile-cuda/Dockerfile b/docker/diffusers-pytorch-compile-cuda/Dockerfile
new file mode 100644
index 000000000000..cb4a9c0f9896
--- /dev/null
+++ b/docker/diffusers-pytorch-compile-cuda/Dockerfile
@@ -0,0 +1,50 @@
+FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get -y update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN apt install -y bash \
+    build-essential \
+    git \
+    git-lfs \
+    curl \
+    ca-certificates \
+    libsndfile1-dev \
+    libgl1 \
+    python3.10 \
+    python3.10-dev \
+    python3-pip \
+    python3.10-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3.10 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3.10 -m uv pip install --no-cache-dir \
+        torch \
+        torchvision \
+        torchaudio \
+        invisible_watermark && \
+    python3.10 -m pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        hf_transfer \
+        Jinja2 \
+        librosa \
+        numpy==1.26.4 \
+        scipy \
+        tensorboard \
+        transformers \
+        hf_transfer
+
+CMD ["/bin/bash"]
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 8de26212a247..0b17d7977a41 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1748,14 +1748,14 @@ class TorchCompileTesterMixin:
     def setUp(self):
         # clean up the VRAM before each test
        super().setUp()
-        torch.compiler.reset()
+        torch._dynamo.reset()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test in case of CUDA runtime errors
        super().tearDown()
-        torch.compiler.reset()
+        torch._dynamo.reset()
        gc.collect()
        backend_empty_cache(torch_device)

@@ -1764,17 +1764,13 @@ def tearDown(self):
     @is_torch_compile
     @slow
     def test_torch_compile_recompilation_and_graph_break(self):
-        torch.compiler.reset()
+        torch._dynamo.reset()
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()

         model = self.model_class(**init_dict).to(torch_device)
         model = torch.compile(model, fullgraph=True)

-        with (
-            torch._inductor.utils.fresh_inductor_cache(),
-            torch._dynamo.config.patch(error_on_recompile=True),
-            torch.no_grad(),
-        ):
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
             _ = model(**inputs_dict)
             _ = model(**inputs_dict)

@@ -1802,7 +1798,7 @@ def tearDown(self):
         # It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
         # there will be recompilation errors, as torch caches the model when run in the same process.
         super().tearDown()
-        torch.compiler.reset()
+        torch._dynamo.reset()
         gc.collect()
         backend_empty_cache(torch_device)

@@ -1919,7 +1915,7 @@ def test_hotswapping_model(self, rank0, rank1):
     def test_hotswapping_compiled_model_linear(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+        with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
@@ -1929,7 +1925,7 @@ def test_hotswapping_compiled_model_conv2d(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["conv", "conv1", "conv2"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+        with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
@@ -1939,7 +1935,7 @@ def test_hotswapping_compiled_model_both_linear_and_conv2d(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["to_q", "conv"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+        with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
diff --git a/tests/models/transformers/test_models_transformer_hunyuan_video.py b/tests/models/transformers/test_models_transformer_hunyuan_video.py
index 5c83d22ab6aa..0a917352164c 100644
--- a/tests/models/transformers/test_models_transformer_hunyuan_video.py
+++ b/tests/models/transformers/test_models_transformer_hunyuan_video.py
@@ -19,16 +19,20 @@
 from diffusers import HunyuanVideoTransformer3DModel
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
+    is_torch_compile,
+    require_torch_2,
+    require_torch_gpu,
+    slow,
     torch_device,
 )

-from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
+from ..test_modeling_common import ModelTesterMixin


 enable_full_determinism()


-class HunyuanVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
+class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
     model_class = HunyuanVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -92,8 +96,23 @@ def test_gradient_checkpointing_is_applied(self):
         expected_set = {"HunyuanVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

+    @require_torch_gpu
+    @require_torch_2
+    @is_torch_compile
+    @slow
+    def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()

-class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
+
+
+class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
     model_class = HunyuanVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -160,8 +179,23 @@ def test_gradient_checkpointing_is_applied(self):
         expected_set = {"HunyuanVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

+    @require_torch_gpu
+    @require_torch_2
+    @is_torch_compile
+    @slow
+    def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
+

-class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
+class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
     model_class = HunyuanVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -226,10 +260,23 @@ def test_gradient_checkpointing_is_applied(self):
         expected_set = {"HunyuanVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

+    @require_torch_gpu
+    @require_torch_2
+    @is_torch_compile
+    @slow
+    def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()

-class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(
-    ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase
-):
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
+
+
+class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
     model_class = HunyuanVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -295,3 +342,18 @@ def test_output(self):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"HunyuanVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+    @require_torch_gpu
+    @require_torch_2
+    @is_torch_compile
+    @slow
+    def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
diff --git a/tests/models/transformers/test_models_transformer_wan.py b/tests/models/transformers/test_models_transformer_wan.py
index 4eadb892364a..8270c2ee21b0 100644
--- a/tests/models/transformers/test_models_transformer_wan.py
+++ b/tests/models/transformers/test_models_transformer_wan.py
@@ -19,16 +19,20 @@
 from diffusers import WanTransformer3DModel
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
+    is_torch_compile,
+    require_torch_2,
+    require_torch_gpu,
+    slow,
     torch_device,
 )

-from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
+from ..test_modeling_common import ModelTesterMixin


 enable_full_determinism()


-class WanTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
+class WanTransformer3DTests(ModelTesterMixin, unittest.TestCase):
     model_class = WanTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -82,3 +86,18 @@ def prepare_init_args_and_inputs_for_common(self):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"WanTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+    @require_torch_gpu
+    @require_torch_2
+    @is_torch_compile
+    @slow
+    def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py
index a2951a8b4673..bb21c9ac8dcb 100644
--- a/tests/pipelines/controlnet/test_controlnet.py
+++ b/tests/pipelines/controlnet/test_controlnet.py
@@ -15,6 +15,7 @@

 import gc
 import tempfile
+import traceback
 import unittest

 import numpy as np
@@ -38,9 +39,13 @@
     backend_reset_max_memory_allocated,
     backend_reset_peak_memory_stats,
     enable_full_determinism,
+    get_python_version,
+    is_torch_compile,
     load_image,
     load_numpy,
+    require_torch_2,
     require_torch_accelerator,
+    run_test_in_subprocess,
     slow,
     torch_device,
 )
@@ -63,6 +68,52 @@
 enable_full_determinism()


+# Will be run via run_test_in_subprocess
+def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        _ = in_queue.get(timeout=timeout)
+
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        pipe.controlnet.to(memory_format=torch.channels_last)
+        pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "bird"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ).resize((512, 512))
+
+        output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np")
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
+        )
+        expected_image = np.resize(expected_image, (512, 512, 3))
+
+        assert np.abs(expected_image - image).max() < 1.0
+
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
 class ControlNetPipelineFastTests(
     IPAdapterTesterMixin,
     PipelineLatentTesterMixin,
@@ -1002,6 +1053,15 @@ def test_canny_guess_mode_euler(self):
         expected_slice = np.array([0.1655, 0.1721, 0.1623, 0.1685, 0.1711, 0.1646, 0.1651, 0.1631, 0.1494])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

+    @is_torch_compile
+    @require_torch_2
+    @unittest.skipIf(
+        get_python_version() == (3, 12),
+        reason="Torch Dynamo isn't yet supported for Python 3.12.",
+    )
+    def test_stable_diffusion_compile(self):
+        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)
+
     def test_v11_shuffle_global_pool_conditions(self):
         controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle")

diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs.py b/tests/pipelines/controlnet_xs/test_controlnetxs.py
index 6f8422797cce..74af4b6775cc 100644
--- a/tests/pipelines/controlnet_xs/test_controlnetxs.py
+++ b/tests/pipelines/controlnet_xs/test_controlnetxs.py
@@ -14,6 +14,7 @@
 # limitations under the License.

 import gc
+import traceback
 import unittest

 import numpy as np
@@ -35,9 +36,13 @@
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
     enable_full_determinism,
+    is_torch_compile,
     load_image,
+    load_numpy,
     require_accelerator,
+    require_torch_2,
     require_torch_accelerator,
+    run_test_in_subprocess,
     slow,
     torch_device,
 )
@@ -73,6 +78,53 @@ def to_np(tensor):
     return tensor


+# Will be run via run_test_in_subprocess
+def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        _ = in_queue.get(timeout=timeout)
+
+        controlnet = ControlNetXSAdapter.from_pretrained(
+            "UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16
+        )
+        pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-1-base",
+            controlnet=controlnet,
+            safety_checker=None,
+            torch_dtype=torch.float16,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "bird"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ).resize((512, 512))
+
+        output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np")
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
+        )
+        expected_image = np.resize(expected_image, (512, 512, 3))
+
+        assert np.abs(expected_image - image).max() < 1.0
+
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
 class ControlNetXSPipelineFastTests(
     PipelineLatentTesterMixin,
     PipelineKarrasSchedulerTesterMixin,
@@ -350,3 +402,8 @@ def test_depth(self):
         original_image = image[-3:, -3:, -1].flatten()
         expected_image = np.array([0.4844, 0.4937, 0.4956, 0.4663, 0.5039, 0.5044, 0.4565, 0.4883, 0.4941])
         assert np.allclose(original_image, expected_image, atol=1e-04)
+
+    @is_torch_compile
+    @require_torch_2
+    def test_stable_diffusion_compile(self):
+        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 2c6739c8ef9f..3b5c7a24b4ca 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -17,6 +17,7 @@
 import gc
 import tempfile
 import time
+import traceback
 import unittest

 import numpy as np
@@ -48,12 +49,16 @@
     backend_reset_max_memory_allocated,
     backend_reset_peak_memory_stats,
     enable_full_determinism,
+    is_torch_compile,
+    load_image,
     load_numpy,
     nightly,
     numpy_cosine_similarity_distance,
     require_accelerate_version_greater,
+    require_torch_2,
     require_torch_accelerator,
     require_torch_multi_accelerator,
+    run_test_in_subprocess,
     skip_mps,
     slow,
     torch_device,
@@ -76,6 +81,39 @@
 enable_full_determinism()


+# Will be run via run_test_in_subprocess
+def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        torch_device = inputs.pop("torch_device")
+        seed = inputs.pop("seed")
+        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+
+        sd_pipe.unet.to(memory_format=torch.channels_last)
+        sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
+
+        assert np.abs(image_slice - expected_slice).max() < 5e-3
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
 class StableDiffusionPipelineFastTests(
     IPAdapterTesterMixin,
     PipelineLatentTesterMixin,
@@ -1186,6 +1224,40 @@ def test_stable_diffusion_textual_inversion_with_sequential_cpu_offload(self):
         max_diff = np.abs(expected_image - image).max()
         assert max_diff < 8e-1

+    @is_torch_compile
+    @require_torch_2
+    def test_stable_diffusion_compile(self):
+        seed = 0
+        inputs = self.get_inputs(torch_device, seed=seed)
+        # Can't pickle a Generator object
+        del inputs["generator"]
+        inputs["torch_device"] = torch_device
+        inputs["seed"] = seed
+        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=inputs)
+
+    def test_stable_diffusion_lcm(self):
+        unet = UNet2DConditionModel.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", subfolder="unet")
+        sd_pipe = StableDiffusionPipeline.from_pretrained("Lykon/dreamshaper-7", unet=unet).to(torch_device)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 6
+        inputs["output_type"] = "pil"
+
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_full/stable_diffusion_lcm.png"
+        )
+
+        image = sd_pipe.image_processor.pil_to_numpy(image)
+        expected_image = sd_pipe.image_processor.pil_to_numpy(expected_image)
+
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+
+        assert max_diff < 1e-2
+

 @slow
 @require_torch_accelerator
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 094e98d09ef9..82b01a74869a 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -15,6 +15,7 @@

 import gc
 import random
+import traceback
 import unittest

 import numpy as np
@@ -40,10 +41,13 @@
     backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
+    is_torch_compile,
     load_image,
     load_numpy,
     nightly,
+    require_torch_2,
     require_torch_accelerator,
+    run_test_in_subprocess,
     skip_mps,
     slow,
     torch_device,
@@ -66,6 +70,38 @@
 enable_full_determinism()


+# Will be run via run_test_in_subprocess
+def _test_img2img_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        torch_device = inputs.pop("torch_device")
+        seed = inputs.pop("seed")
+        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 768, 3)
+        expected_slice = np.array([0.0606, 0.0570, 0.0805, 0.0579, 0.0628, 0.0623, 0.0843, 0.1115, 0.0806])
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
 class StableDiffusionImg2ImgPipelineFastTests(
     IPAdapterTesterMixin,
     PipelineLatentTesterMixin,
@@ -618,6 +654,17 @@ def test_img2img_safety_checker_works(self):
         assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}"
         assert np.abs(out.images[0]).sum() < 1e-5  # should be all zeros

+    @is_torch_compile
+    @require_torch_2
+    def test_img2img_compile(self):
+        seed = 0
+        inputs = self.get_inputs(torch_device, seed=seed)
+        # Can't pickle a Generator object
+        del inputs["generator"]
+        inputs["torch_device"] = torch_device
+        inputs["seed"] = seed
+        run_test_in_subprocess(test_case=self, target_func=_test_img2img_compile, inputs=inputs)
+

 @nightly
 @require_torch_accelerator
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 8456994d6f81..e028b4017860 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -15,6 +15,7 @@

 import gc
 import random
+import traceback
 import unittest

 import numpy as np
@@ -43,10 +44,13 @@
     backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
+    is_torch_compile,
     load_image,
     load_numpy,
     nightly,
+    require_torch_2,
     require_torch_accelerator,
+    run_test_in_subprocess,
     slow,
     torch_device,
 )
@@ -67,6 +71,40 @@
 enable_full_determinism()


+# Will be run via run_test_in_subprocess
+def _test_inpaint_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        torch_device = inputs.pop("torch_device")
+        seed = inputs.pop("seed")
+        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.unet.set_default_attn_processor()
+        pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        image = pipe(**inputs).images
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0689, 0.0699, 0.0790, 0.0536, 0.0470, 0.0488, 0.041, 0.0508, 0.04179])
+        assert np.abs(expected_slice - image_slice).max() < 3e-3
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
 class StableDiffusionInpaintPipelineFastTests(
     IPAdapterTesterMixin,
     PipelineLatentTesterMixin,
@@ -689,6 +727,17 @@ def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
         # make sure that less than 2.2 GB is allocated
         assert mem_bytes < 2.2 * 10**9

+    @is_torch_compile
+    @require_torch_2
+    def test_inpaint_compile(self):
+        seed = 0
+        inputs = self.get_inputs(torch_device, seed=seed)
+        # Can't pickle a Generator object
+        del inputs["generator"]
+        inputs["torch_device"] = torch_device
+        inputs["seed"] = seed
+        run_test_in_subprocess(test_case=self, target_func=_test_inpaint_compile, inputs=inputs)
+
     def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
@@ -915,6 +964,11 @@ def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
         # make sure that less than 2.45 GB is allocated
         assert mem_bytes < 2.45 * 10**9

+    @is_torch_compile
+    @require_torch_2
+    def test_inpaint_compile(self):
+        pass
+
     def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
         vae = AsymmetricAutoencoderKL.from_pretrained(
             "cross-attention/asymmetric-autoencoder-kl-x-1-5",
diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index f1d9d244e546..ef35ea2678db 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -1994,9 +1994,7 @@ def test_from_save_pretrained(self):
         reason="Torch Dynamo isn't yet supported for Python 3.12.",
     )
     def test_from_save_pretrained_dynamo(self):
-        torch.compiler.reset()
-        with torch._inductor.utils.fresh_inductor_cache():
-            run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None)
+        run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None)

     def test_from_pretrained_hub(self):
         model_path = "google/ddpm-cifar10-32"
@@ -2208,7 +2206,7 @@ def tearDown(self):
         # It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
         # there will be recompilation errors, as torch caches the model when run in the same process.
         super().tearDown()
-        torch.compiler.reset()
+        torch._dynamo.reset()
         gc.collect()
         backend_empty_cache(torch_device)

@@ -2333,21 +2331,21 @@ def test_hotswapping_pipeline(self, rank0, rank1):
     def test_hotswapping_compiled_pipline_linear(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+        with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
     def test_hotswapping_compiled_pipline_conv2d(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["conv", "conv1", "conv2"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+        with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
     def test_hotswapping_compiled_pipline_both_linear_and_conv2d(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["to_q", "conv"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+        with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     def test_enable_lora_hotswap_called_after_adapter_added_raises(self):
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 2b915b9ebba5..af3a832d31a6 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -1111,14 +1111,14 @@ def callback_cfg_params(self) -> frozenset:
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
-        torch.compiler.reset()
+        torch._dynamo.reset()
         gc.collect()
         backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test in case of CUDA runtime errors
         super().tearDown()
-        torch.compiler.reset()
+        torch._dynamo.reset()
         gc.collect()
         backend_empty_cache(torch_device)

diff --git a/tests/pipelines/unidiffuser/test_unidiffuser.py b/tests/pipelines/unidiffuser/test_unidiffuser.py
index dccb1a85008b..b1216a091c8b 100644
--- a/tests/pipelines/unidiffuser/test_unidiffuser.py
+++ b/tests/pipelines/unidiffuser/test_unidiffuser.py
@@ -1,5 +1,6 @@
 import gc
 import random
+import traceback
 import unittest

 import numpy as np
@@ -26,7 +27,9 @@
     floats_tensor,
     load_image,
     nightly,
+    require_torch_2,
     require_torch_accelerator,
+    run_test_in_subprocess,
     torch_device,
 )
 from diffusers.utils.torch_utils import randn_tensor
@@ -42,6 +45,38 @@
 enable_full_determinism()


+# Will be run via run_test_in_subprocess
+def _test_unidiffuser_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        torch_device = inputs.pop("torch_device")
+        seed = inputs.pop("seed")
+        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
+        # pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe = pipe.to(torch_device)
+
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520])
+        assert np.abs(image_slice - expected_slice).max() < 1e-1
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
 class UniDiffuserPipelineFastTests(
     PipelineTesterMixin, PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
 ):
@@ -655,6 +690,19 @@ def test_unidiffuser_default_img2text_v1(self):
         expected_text_prefix = "An astronaut"
         assert text[0][: len(expected_text_prefix)] == expected_text_prefix

+    @unittest.skip(reason="Skip torch.compile test to speed up the slow test suite.")
+    @require_torch_2
+    def test_unidiffuser_compile(self, seed=0):
+        inputs = self.get_inputs(torch_device, seed=seed, generate_latents=True)
+        # Delete prompt and image for joint inference.
+        del inputs["prompt"]
+        del inputs["image"]
+        # Can't pickle a Generator object
+        del inputs["generator"]
+        inputs["torch_device"] = torch_device
+        inputs["seed"] = seed
+        run_test_in_subprocess(test_case=self, target_func=_test_unidiffuser_compile, inputs=inputs)
+

 @nightly
 @require_torch_accelerator