Merge branch 'main' into test-sana-lora-training

sayakpaul · web-flow · commit 8eb948765667 · 2024-12-19T10:21:36.000+05:30
diff --git a/docs/source/en/quantization/gguf.md b/docs/source/en/quantization/gguf.md
@@ -25,9 +25,9 @@ pip install -U gguf
 
 Since GGUF is a single file format, use [`~FromSingleFileMixin.from_single_file`] to load the model and pass in the [`GGUFQuantizationConfig`].
 
-When using GGUF checkpoints, the quantized weights remain in a low memory `dtype`(typically `torch.unint8`) and are dynamically dequantized and cast to the configured `compute_dtype` during each module's forward pass through the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype`. 
+When using GGUF checkpoints, the quantized weights remain in a low memory `dtype`(typically `torch.uint8`) and are dynamically dequantized and cast to the configured `compute_dtype` during each module's forward pass through the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype`.
 
-The functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF), who created the Pytorch ports of the original (`numpy`)[https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/quants.py] implementation by [compilade](https://github.com/compilade).
+The functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF), who created the Pytorch ports of the original [`numpy`](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/quants.py) implementation by [compilade](https://github.com/compilade).
 
 ```python
 import torch
diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md
@@ -33,8 +33,8 @@ If you are new to the quantization field, we recommend you to check out these be
 ## When to use what?
 
 Diffusers currently supports the following quantization methods.
-- [BitsandBytes]()
-- [TorchAO]()
-- [GGUF]()
+- [BitsandBytes](./bitsandbytes.md)
+- [TorchAO](./torchao.md)
+- [GGUF](./gguf.md)
 
 [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
@@ -151,6 +151,8 @@
     "animatediff_scribble": {"pretrained_model_name_or_path": "guoyww/animatediff-sparsectrl-scribble"},
     "animatediff_rgb": {"pretrained_model_name_or_path": "guoyww/animatediff-sparsectrl-rgb"},
     "flux-dev": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev"},
+    "flux-fill": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-Fill-dev"},
+    "flux-depth": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-Depth-dev"},
     "flux-schnell": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-schnell"},
     "ltx-video": {"pretrained_model_name_or_path": "Lightricks/LTX-Video"},
     "autoencoder-dc-f128c512": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f128c512-mix-1.0-diffusers"},
@@ -587,7 +589,13 @@ def infer_diffusers_model_type(checkpoint):
         if any(
             g in checkpoint for g in ["guidance_in.in_layer.bias", "model.diffusion_model.guidance_in.in_layer.bias"]
         ):
-            model_type = "flux-dev"
+            if checkpoint["img_in.weight"].shape[1] == 384:
+                model_type = "flux-fill"
+
+            elif checkpoint["img_in.weight"].shape[1] == 128:
+                model_type = "flux-depth"
+            else:
+                model_type = "flux-dev"
         else:
             model_type = "flux-schnell"
 
diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
@@ -691,7 +691,7 @@ def _get_positional_embeddings(
             output_type="pt",
         )
         pos_embedding = pos_embedding.flatten(0, 1)
-        joint_pos_embedding = torch.zeros(
+        joint_pos_embedding = pos_embedding.new_zeros(
             1, self.max_text_seq_length + num_patches, self.embed_dim, requires_grad=False
         )
         joint_pos_embedding.data[:, self.max_text_seq_length :].copy_(pos_embedding)
diff --git a/src/diffusers/pipelines/mochi/pipeline_mochi.py b/src/diffusers/pipelines/mochi/pipeline_mochi.py
@@ -188,6 +188,7 @@ def __init__(
         text_encoder: T5EncoderModel,
         tokenizer: T5TokenizerFast,
         transformer: MochiTransformer3DModel,
+        force_zeros_for_empty_prompt: bool = False,
     ):
         super().__init__()
 
@@ -205,10 +206,11 @@ def __init__(
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_spatial_scale_factor)
         self.tokenizer_max_length = (
-            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 256
         )
         self.default_height = 480
         self.default_width = 848
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
 
     def _get_t5_prompt_embeds(
         self,
@@ -236,7 +238,11 @@ def _get_t5_prompt_embeds(
         text_input_ids = text_inputs.input_ids
         prompt_attention_mask = text_inputs.attention_mask
         prompt_attention_mask = prompt_attention_mask.bool().to(device)
-        if prompt == "" or prompt[-1] == "":
+
+        # The original Mochi implementation zeros out empty negative prompts
+        # but this can lead to overflow when placing the entire pipeline under the autocast context
+        # adding this here so that we can enable zeroing prompts if necessary
+        if self.config.force_zeros_for_empty_prompt and (prompt == "" or prompt[-1] == ""):
             text_input_ids = torch.zeros_like(text_input_ids, device=device)
             prompt_attention_mask = torch.zeros_like(prompt_attention_mask, dtype=torch.bool, device=device)
 
diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py
@@ -289,6 +289,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
             sigmas = 1.0 - alphas
             sigmas = np.flip(self.config.flow_shift * sigmas / (1 + (self.config.flow_shift - 1) * sigmas))[:-1].copy()
             timesteps = (sigmas * self.config.num_train_timesteps).copy()
+            sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32)
         else:
             sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
             sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py
@@ -291,14 +291,17 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
         elif self.config.use_exponential_sigmas:
             sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
             timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+            sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32)
         elif self.config.use_beta_sigmas:
             sigmas = self._convert_to_beta(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
             timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+            sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32)
         elif self.config.use_flow_sigmas:
             alphas = np.linspace(1, 1 / self.config.num_train_timesteps, num_inference_steps + 1)
             sigmas = 1.0 - alphas
             sigmas = np.flip(self.config.flow_shift * sigmas / (1 + (self.config.flow_shift - 1) * sigmas))[:-1].copy()
             timesteps = (sigmas * self.config.num_train_timesteps).copy()
+            sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32)
         else:
             sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
             sigma_max = (
diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -318,6 +318,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
             sigmas = 1.0 - alphas
             sigmas = np.flip(self.config.flow_shift * sigmas / (1 + (self.config.flow_shift - 1) * sigmas))[:-1].copy()
             timesteps = (sigmas * self.config.num_train_timesteps).copy()
+            sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32)
         else:
             sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
             sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py
@@ -381,6 +381,15 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
             sigmas = 1.0 - alphas
             sigmas = np.flip(self.config.flow_shift * sigmas / (1 + (self.config.flow_shift - 1) * sigmas))[:-1].copy()
             timesteps = (sigmas * self.config.num_train_timesteps).copy()
+            if self.config.final_sigmas_type == "sigma_min":
+                sigma_last = sigmas[-1]
+            elif self.config.final_sigmas_type == "zero":
+                sigma_last = 0
+            else:
+                raise ValueError(
+                    f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
+                )
+            sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
         else:
             sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
             if self.config.final_sigmas_type == "sigma_min":
diff --git a/tests/lora/test_lora_layers_cogvideox.py b/tests/lora/test_lora_layers_cogvideox.py
@@ -29,17 +29,13 @@
 )
 from diffusers.utils.testing_utils import (
     floats_tensor,
-    is_peft_available,
     is_torch_version,
     require_peft_backend,
     skip_mps,
     torch_device,
 )
 
 
-if is_peft_available():
-    pass
-
 sys.path.append(".")
 
 from utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set  # noqa: E402
diff --git a/tests/lora/test_lora_layers_mochi.py b/tests/lora/test_lora_layers_mochi.py
@@ -23,17 +23,13 @@
 from diffusers import AutoencoderKLMochi, FlowMatchEulerDiscreteScheduler, MochiPipeline, MochiTransformer3DModel
 from diffusers.utils.testing_utils import (
     floats_tensor,
-    is_peft_available,
     is_torch_version,
     require_peft_backend,
     skip_mps,
     torch_device,
 )
 
 
-if is_peft_available():
-    pass
-
 sys.path.append(".")
 
 from utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set  # noqa: E402
diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py
@@ -29,17 +29,13 @@
 from diffusers.utils import load_image
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
-    is_peft_available,
     numpy_cosine_similarity_distance,
     require_peft_backend,
     require_torch_gpu,
     torch_device,
 )
 
 
-if is_peft_available():
-    pass
-
 sys.path.append(".")
 
 from utils import PeftLoraLoaderMixinTests  # noqa: E402
diff --git a/tests/pipelines/mochi/test_mochi.py b/tests/pipelines/mochi/test_mochi.py
@@ -275,7 +275,7 @@ def tearDown(self):
         gc.collect()
         torch.cuda.empty_cache()
 
-    def test_cogvideox(self):
+    def test_mochi(self):
         generator = torch.Generator("cpu").manual_seed(0)
 
         pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview", torch_dtype=torch.float16)

Original file line number	Diff line number	Diff line change
`@@ -691,7 +691,7 @@ def _get_positional_embeddings(`
`691`	`691`	`output_type="pt",`
`692`	`692`	`)`
`693`	`693`	`pos_embedding = pos_embedding.flatten(0, 1)`
`694`		`- joint_pos_embedding = torch.zeros(`
	`694`	`+ joint_pos_embedding = pos_embedding.new_zeros(`
`695`	`695`	`1, self.max_text_seq_length + num_patches, self.embed_dim, requires_grad=False`
`696`	`696`	`)`
`697`	`697`	`joint_pos_embedding.data[:, self.max_text_seq_length :].copy_(pos_embedding)`