
Commit c34ce42

Merge remote-tracking branch 'upstream/main' into dduf

2 parents: f62527f + f7cb595

File tree: 10 files changed, +115 −12 lines


examples/research_projects/sd3_lora_colab/train_dreambooth_lora_sd3_miniature.py

Lines changed: 1 addition & 1 deletion
@@ -765,7 +765,7 @@ def load_model_hook(models, input_dir):
         lora_state_dict = StableDiffusion3Pipeline.lora_state_dict(input_dir)
 
         transformer_state_dict = {
-            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")
+            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.")
         }
         transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict)
         incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default")
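This fixes a copy-paste bug: SD3 LoRA checkpoints key their transformer weights under the `transformer.` prefix, so filtering on `unet.` would match nothing and yield an empty state dict when resuming. A minimal sketch of the remapping, using hypothetical keys rather than real tensors:

# hypothetical LoRA state dict illustrating the prefix filter (not from the commit)
lora_state_dict = {
    "transformer.x_embedder.lora_A.weight": "tensor-a",
    "text_encoder.q_proj.lora_A.weight": "tensor-b",
}

transformer_state_dict = {
    k.replace("transformer.", ""): v
    for k, v in lora_state_dict.items()
    if k.startswith("transformer.")  # the old filter used "unet." and matched nothing
}
assert transformer_state_dict == {"x_embedder.lora_A.weight": "tensor-a"}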

setup.py

Lines changed: 2 additions & 0 deletions
@@ -135,6 +135,7 @@
     "transformers>=4.41.2",
     "urllib3<=2.0.0",
     "black",
+    "phonemizer",
 ]
 
 # this is a lookup table with items like:
@@ -227,6 +228,7 @@ def run(self):
     "scipy",
     "torchvision",
     "transformers",
+    "phonemizer",
 )
 extras["torch"] = deps_list("torch", "accelerate")
 
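The "lookup table" mentioned in the context comment maps bare package names to pinned requirement strings, and `deps_list` resolves names through it; adding `phonemizer` to `_deps` is what makes it available to the test extras (presumably for the text-to-speech paths exercised by the AudioLDM2 tests). A sketch of that pattern, assuming the shape the comment describes:

import re

# assumed shape of the lookup: bare package name -> pinned requirement string
_deps = ["transformers>=4.41.2", "urllib3<=2.0.0", "black", "phonemizer"]
deps = {re.match(r"^[^!=<>~ ]+", d).group(0): d for d in _deps}

def deps_list(*pkgs):
    # resolve bare names back to their pinned requirement strings
    return [deps[pkg] for pkg in pkgs]

assert deps_list("phonemizer", "urllib3") == ["phonemizer", "urllib3<=2.0.0"]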

src/diffusers/dependency_versions_table.py

Lines changed: 1 addition & 0 deletions
@@ -43,4 +43,5 @@
     "transformers": "transformers>=4.41.2",
     "urllib3": "urllib3<=2.0.0",
     "black": "black",
+    "phonemizer": "phonemizer",
 }
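This table mirrors the `_deps` entries in setup.py; if memory serves it is autogenerated, so the usual workflow is to edit setup.py and regenerate the table (e.g. via `make deps_table_update`) rather than editing both by hand, which is why the two changes appear together in this commit.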

src/diffusers/loaders/single_file_utils.py

Lines changed: 8 additions & 3 deletions
@@ -186,6 +186,7 @@
     "inpainting": 512,
     "inpainting_v2": 512,
     "controlnet": 512,
+    "instruct-pix2pix": 512,
     "v2": 768,
     "v1": 512,
 }
@@ -605,10 +606,14 @@ def infer_diffusers_model_type(checkpoint):
     if any(
         g in checkpoint for g in ["guidance_in.in_layer.bias", "model.diffusion_model.guidance_in.in_layer.bias"]
     ):
-        if checkpoint["img_in.weight"].shape[1] == 384:
-            model_type = "flux-fill"
+        if "model.diffusion_model.img_in.weight" in checkpoint:
+            key = "model.diffusion_model.img_in.weight"
+        else:
+            key = "img_in.weight"
 
-        elif checkpoint["img_in.weight"].shape[1] == 128:
+        if checkpoint[key].shape[1] == 384:
+            model_type = "flux-fill"
+        elif checkpoint[key].shape[1] == 128:
             model_type = "flux-depth"
         else:
             model_type = "flux-dev"

src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py

Lines changed: 15 additions & 3 deletions
@@ -237,7 +237,7 @@ def disable_vae_slicing(self):
         """
         self.vae.disable_slicing()
 
-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
@@ -249,11 +249,23 @@ def enable_model_cpu_offload(self, gpu_id=0):
         else:
             raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
 
-        device = torch.device(f"cuda:{gpu_id}")
+        torch_device = torch.device(device)
+        device_index = torch_device.index
+
+        if gpu_id is not None and device_index is not None:
+            raise ValueError(
+                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
+                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
+            )
+
+        device_type = torch_device.type
+        device = torch.device(f"{device_type}:{gpu_id or torch_device.index}")
 
         if self.device.type != "cpu":
             self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+            device_mod = getattr(torch, device.type, None)
+            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
+                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
 
         model_sequence = [
             self.text_encoder.text_model,
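A usage sketch of the widened signature, assuming an instantiated AudioLDM2 pipeline as `pipe`: either the legacy `gpu_id` or a `device` argument selects the accelerator, and `empty_cache` is now dispatched through the matching torch backend module instead of being hard-coded to CUDA.

pipe.enable_model_cpu_offload(gpu_id=1)         # legacy form -> offloads to cuda:1
pipe.enable_model_cpu_offload(device="cuda:1")  # new form, equivalent
pipe.enable_model_cpu_offload(gpu_id=1, device="cuda:0")  # ValueError: ambiguous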

src/diffusers/pipelines/latte/pipeline_latte.py

Lines changed: 9 additions & 1 deletion
@@ -30,6 +30,7 @@
 from ...utils import (
     BACKENDS_MAPPING,
     BaseOutput,
+    deprecate,
     is_bs4_available,
     is_ftfy_available,
     is_torch_xla_available,
@@ -848,7 +849,14 @@ def __call__(
         if XLA_AVAILABLE:
             xm.mark_step()
 
-        if not output_type == "latents":
+        if output_type == "latents":
+            deprecation_message = (
+                "Passing `output_type='latents'` is deprecated. Please pass `output_type='latent'` instead."
+            )
+            deprecate("output_type_latents", "1.0.0", deprecation_message, standard_warn=False)
+            output_type = "latent"
+
+        if not output_type == "latent":
             video = self.decode_latents(latents, video_length, decode_chunk_size=14)
             video = self.video_processor.postprocess_video(video=video, output_type=output_type)
         else:
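A usage sketch of the shim, assuming a constructed `LattePipeline` as `pipe`: the misspelled value is rewritten before the decode branch, so old callers keep working but see a deprecation warning until it is removed in 1.0.0.

out = pipe(prompt="a cat", output_type="latents")  # warns, behaves as "latent"
out = pipe(prompt="a cat", output_type="latent")   # preferred; skips decoding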

tests/pipelines/audioldm2/test_audioldm2.py

Lines changed: 2 additions & 2 deletions
@@ -471,8 +471,8 @@ def test_xformers_attention_forwardGenerator_pass(self):
         pass
 
     def test_dict_tuple_outputs_equivalent(self):
-        # increase tolerance from 1e-4 -> 2e-4 to account for large composite model
-        super().test_dict_tuple_outputs_equivalent(expected_max_difference=2e-4)
+        # increase tolerance from 1e-4 -> 3e-4 to account for large composite model
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-4)
 
     def test_inference_batch_single_identical(self):
         # increase tolerance from 1e-4 -> 2e-4 to account for large composite model

tests/single_file/single_file_testing_utils.py

Lines changed: 4 additions & 2 deletions
@@ -47,6 +47,8 @@ def download_diffusers_config(repo_id, tmpdir):
 
 
 class SDSingleFileTesterMixin:
+    single_file_kwargs = {}
+
     def _compare_component_configs(self, pipe, single_file_pipe):
         for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items():
             if param_name in ["torch_dtype", "architectures", "_name_or_path"]:
@@ -154,7 +156,7 @@ def test_single_file_components_with_original_config_local_files_only(
         self._compare_component_configs(pipe, single_file_pipe)
 
     def test_single_file_format_inference_is_same_as_pretrained(self, expected_max_diff=1e-4):
-        sf_pipe = self.pipeline_class.from_single_file(self.ckpt_path, safety_checker=None)
+        sf_pipe = self.pipeline_class.from_single_file(self.ckpt_path, safety_checker=None, **self.single_file_kwargs)
         sf_pipe.unet.set_attn_processor(AttnProcessor())
         sf_pipe.enable_model_cpu_offload(device=torch_device)
@@ -170,7 +172,7 @@ def test_single_file_format_inference_is_same_as_pretrained(self, expected_max_diff=1e-4):
 
         max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten())
 
-        assert max_diff < expected_max_diff
+        assert max_diff < expected_max_diff, f"{image.flatten()} != {image_single_file.flatten()}"
 
     def test_single_file_components_with_diffusers_config(
         self,
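The class-level `single_file_kwargs` dict is a hook: subclasses can forward extra keyword arguments to `from_single_file` without overriding the whole test method (the instruct-pix2pix suite below uses it to pass `extract_ema=True`). A hypothetical subclass illustrating the pattern; the pipeline class and checkpoint path here are placeholders, not from the commit:

class MyPipelineSingleFileTests(SDSingleFileTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionPipeline        # hypothetical choice
    ckpt_path = "https://huggingface.co/.../model.safetensors"  # placeholder
    single_file_kwargs = {"extract_ema": True}      # forwarded to from_single_file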
tests/single_file/test_model_flux_transformer_single_file.py (new file)

Lines changed: 72 additions & 0 deletions

@@ -0,0 +1,72 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import torch
+
+from diffusers import (
+    FluxTransformer2DModel,
+)
+from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    require_torch_accelerator,
+    torch_device,
+)
+
+
+enable_full_determinism()
+
+
+@require_torch_accelerator
+class FluxTransformer2DModelSingleFileTests(unittest.TestCase):
+    model_class = FluxTransformer2DModel
+    ckpt_path = "https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors"
+    alternate_keys_ckpt_paths = ["https://huggingface.co/Comfy-Org/flux1-dev/blob/main/flux1-dev-fp8.safetensors"]
+
+    repo_id = "black-forest-labs/FLUX.1-dev"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def test_single_file_components(self):
+        model = self.model_class.from_pretrained(self.repo_id, subfolder="transformer")
+        model_single_file = self.model_class.from_single_file(self.ckpt_path)
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between single file loading and pretrained loading"
+
+    def test_checkpoint_loading(self):
+        for ckpt_path in self.alternate_keys_ckpt_paths:
+            torch.cuda.empty_cache()
+            model = self.model_class.from_single_file(ckpt_path)
+
+            del model
+            gc.collect()
+            torch.cuda.empty_cache()
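The new test covers single-file loading of the Flux transformer directly from a checkpoint URL, including an alternate checkpoint whose keys carry the `model.diffusion_model.` prefix (see the single_file_utils change above). A minimal standalone sketch of the same call; the dtype choice is illustrative, not from the commit:

import torch
from diffusers import FluxTransformer2DModel

ckpt = "https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors"
transformer = FluxTransformer2DModel.from_single_file(ckpt, torch_dtype=torch.bfloat16)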

tests/single_file/test_stable_diffusion_single_file.py

Lines changed: 1 addition & 0 deletions
@@ -132,6 +132,7 @@ class StableDiffusionInstructPix2PixPipelineSingleFileSlowTests(unittest.TestCase):
         "https://raw.githubusercontent.com/timothybrooks/instruct-pix2pix/refs/heads/main/configs/generate.yaml"
     )
     repo_id = "timbrooks/instruct-pix2pix"
+    single_file_kwargs = {"extract_ema": True}
 
     def setUp(self):
         super().setUp()
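The original instruct-pix2pix checkpoint ships both EMA and non-EMA UNet weights; `extract_ema=True` presumably selects the EMA set so the single-file load matches the converted `timbrooks/instruct-pix2pix` repo. Equivalent direct call as a sketch:

# ckpt_path is the class attribute defined earlier in this test file (not shown in this hunk)
pipe = StableDiffusionInstructPix2PixPipeline.from_single_file(ckpt_path, extract_ema=True)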
