Commit 263422a

Merge branch 'main' into lora-bnb-tests-peft

2 parents 9e9f057 + be54a95

7 files changed: +252 −65 lines

examples/controlnet/train_controlnet_sd3.py
Lines changed: 2 additions & 2 deletions

```diff
@@ -1283,8 +1283,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                 noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise

                 # Get the text embedding for conditioning
-                prompt_embeds = batch["prompt_embeds"]
-                pooled_prompt_embeds = batch["pooled_prompt_embeds"]
+                prompt_embeds = batch["prompt_embeds"].to(dtype=weight_dtype)
+                pooled_prompt_embeds = batch["pooled_prompt_embeds"].to(dtype=weight_dtype)

                 # controlnet(s) inference
                 controlnet_image = batch["conditioning_pixel_values"].to(dtype=weight_dtype)
```
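Note: this matters for mixed-precision training, where precomputed prompt embeddings come out of the dataloader in float32 while the model weights have been cast down. A minimal sketch of the failure mode, assuming a float16 `weight_dtype` (the tensors and layer below are stand-ins, not the training script's):

```python
import torch

# Stand-ins for the training setup: weights cast to half precision,
# dataloader tensors still float32.
weight_dtype = torch.float16
proj = torch.nn.Linear(64, 64).to(dtype=weight_dtype)
prompt_embeds = torch.randn(2, 77, 64)  # float32, as loaded from the batch

# proj(prompt_embeds) would raise a dtype-mismatch error here;
# the explicit cast mirrors what the diff adds.
prompt_embeds = prompt_embeds.to(dtype=weight_dtype)
out = proj(prompt_embeds)
print(out.dtype)  # torch.float16
```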

examples/research_projects/pytorch_xla/inference/flux/README.md

Lines changed: 109 additions & 44 deletions
Large diffs are not rendered by default.

examples/research_projects/pytorch_xla/inference/flux/flux_inference.py
Lines changed: 21 additions & 7 deletions

```diff
@@ -9,6 +9,7 @@
 import torch_xla.debug.profiler as xp
 import torch_xla.distributed.xla_multiprocessing as xmp
 import torch_xla.runtime as xr
+from torch_xla.experimental.custom_kernel import FlashAttention

 from diffusers import FluxPipeline

@@ -36,6 +37,19 @@ def _main(index, args, text_pipe, ckpt_id):
         ckpt_id, text_encoder=None, tokenizer=None, text_encoder_2=None, tokenizer_2=None, torch_dtype=torch.bfloat16
     ).to(device0)
     flux_pipe.transformer.enable_xla_flash_attention(partition_spec=("data", None, None, None), is_flux=True)
+    FlashAttention.DEFAULT_BLOCK_SIZES = {
+        "block_q": 1536,
+        "block_k_major": 1536,
+        "block_k": 1536,
+        "block_b": 1536,
+        "block_q_major_dkv": 1536,
+        "block_k_major_dkv": 1536,
+        "block_q_dkv": 1536,
+        "block_k_dkv": 1536,
+        "block_q_dq": 1536,
+        "block_k_dq": 1536,
+        "block_k_major_dq": 1536,
+    }

     prompt = "photograph of an electronics chip in the shape of a race car with trillium written on its side"
     width = args.width
@@ -69,14 +83,14 @@ def _main(index, args, text_pipe, ckpt_id):
     xm.set_rng_state(seed=unique_seed, device=device0)
     times = []
     logger.info("starting inference run...")
+    with torch.no_grad():
+        prompt_embeds, pooled_prompt_embeds, text_ids = text_pipe.encode_prompt(
+            prompt=prompt, prompt_2=None, max_sequence_length=512
+        )
+        prompt_embeds = prompt_embeds.to(device0)
+        pooled_prompt_embeds = pooled_prompt_embeds.to(device0)
     for _ in range(args.itters):
         ts = perf_counter()
-        with torch.no_grad():
-            prompt_embeds, pooled_prompt_embeds, text_ids = text_pipe.encode_prompt(
-                prompt=prompt, prompt_2=None, max_sequence_length=512
-            )
-            prompt_embeds = prompt_embeds.to(device0)
-            pooled_prompt_embeds = pooled_prompt_embeds.to(device0)

         if args.profile:
             xp.trace_detached(f"localhost:{profiler_port}", str(profile_path), duration_ms=profile_duration)
@@ -92,7 +106,7 @@ def _main(index, args, text_pipe, ckpt_id):
         if index == 0:
             logger.info(f"inference time: {inference_time}")
         times.append(inference_time)
-    logger.info(f"avg. inference over {args.itters} iterations took {sum(times)/len(times)} sec.")
+    logger.info(f"avg. inference over {args.itters} iterations took {sum(times) / len(times)} sec.")
     image.save(f"/tmp/inference_out-{index}.png")
     if index == 0:
         metrics_report = met.metrics_report()
```
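Note: two independent changes here. The FlashAttention block sizes are raised to 1536, presumably tiling tuned for this TPU workload, and prompt encoding is hoisted out of the timed loop so each iteration measures only the denoising run. The hoisting pattern in isolation, as a sketch (the `encode_prompt`/`denoise` callables are placeholders):

```python
from time import perf_counter

def benchmark(encode_prompt, denoise, iters):
    # Loop-invariant work runs once, outside the timed region.
    cond = encode_prompt()
    times = []
    for _ in range(iters):
        ts = perf_counter()
        denoise(cond)  # only the per-iteration work is timed
        times.append(perf_counter() - ts)
    return sum(times) / len(times)
```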

src/diffusers/loaders/lora_conversion_utils.py
Lines changed: 10 additions & 7 deletions

```diff
@@ -1355,6 +1355,7 @@ def _convert_non_diffusers_wan_lora_to_diffusers(state_dict):
     original_state_dict = {k[len("diffusion_model.") :]: v for k, v in state_dict.items()}

     num_blocks = len({k.split("blocks.")[1].split(".")[0] for k in original_state_dict})
+    is_i2v_lora = any("k_img" in k for k in original_state_dict) and any("v_img" in k for k in original_state_dict)

     for i in range(num_blocks):
         # Self-attention
@@ -1374,13 +1375,15 @@ def _convert_non_diffusers_wan_lora_to_diffusers(state_dict):
             converted_state_dict[f"blocks.{i}.attn2.{c}.lora_B.weight"] = original_state_dict.pop(
                 f"blocks.{i}.cross_attn.{o}.lora_B.weight"
             )
-        for o, c in zip(["k_img", "v_img"], ["add_k_proj", "add_v_proj"]):
-            converted_state_dict[f"blocks.{i}.attn2.{c}.lora_A.weight"] = original_state_dict.pop(
-                f"blocks.{i}.cross_attn.{o}.lora_A.weight"
-            )
-            converted_state_dict[f"blocks.{i}.attn2.{c}.lora_B.weight"] = original_state_dict.pop(
-                f"blocks.{i}.cross_attn.{o}.lora_B.weight"
-            )
+
+        if is_i2v_lora:
+            for o, c in zip(["k_img", "v_img"], ["add_k_proj", "add_v_proj"]):
+                converted_state_dict[f"blocks.{i}.attn2.{c}.lora_A.weight"] = original_state_dict.pop(
+                    f"blocks.{i}.cross_attn.{o}.lora_A.weight"
+                )
+                converted_state_dict[f"blocks.{i}.attn2.{c}.lora_B.weight"] = original_state_dict.pop(
+                    f"blocks.{i}.cross_attn.{o}.lora_B.weight"
+                )

         # FFN
         for o, c in zip(["ffn.0", "ffn.2"], ["net.0.proj", "net.2"]):
```
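Note: the new `is_i2v_lora` flag keeps the converter from `pop()`-ing `k_img`/`v_img` keys that only exist in image-to-video Wan checkpoints; on a text-to-video LoRA those pops would raise `KeyError`. A small self-contained check of the detection logic, with toy keys:

```python
def is_i2v(state_dict):
    # Image-to-video Wan LoRAs carry extra image cross-attention projections.
    return any("k_img" in k for k in state_dict) and any("v_img" in k for k in state_dict)

t2v_keys = {"blocks.0.cross_attn.k.lora_A.weight": None}
i2v_keys = {
    **t2v_keys,
    "blocks.0.cross_attn.k_img.lora_A.weight": None,
    "blocks.0.cross_attn.v_img.lora_A.weight": None,
}
print(is_i2v(t2v_keys), is_i2v(i2v_keys))  # False True
```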

src/diffusers/models/attention_processor.py
Lines changed: 3 additions & 1 deletion

```diff
@@ -2339,7 +2339,9 @@ def __call__(
             query = apply_rotary_emb(query, image_rotary_emb)
             key = apply_rotary_emb(key, image_rotary_emb)

-        hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )

         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states = hidden_states.to(query.dtype)
```
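Note: before this change the processor silently dropped any `attention_mask` it was handed; `attn_mask=None` reproduces the old behavior. A minimal standalone demonstration of the now-forwarded argument (shapes are arbitrary; a boolean mask marks which key positions may be attended to):

```python
import torch
import torch.nn.functional as F

q = torch.randn(1, 8, 16, 64)  # (batch, heads, query_len, head_dim)
k = torch.randn(1, 8, 16, 64)
v = torch.randn(1, 8, 16, 64)

# Boolean mask broadcastable to (batch, heads, query_len, key_len);
# True = attend, False = ignore (e.g. padding tokens).
mask = torch.ones(1, 1, 16, 16, dtype=torch.bool)
mask[..., 8:] = False

out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
print(out.shape)  # torch.Size([1, 8, 16, 64])
```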

src/diffusers/pipelines/pipeline_utils.py
Lines changed: 5 additions & 3 deletions

```diff
@@ -1610,7 +1610,7 @@ def _get_signature_keys(cls, obj):
                 expected_modules.add(name)
                 optional_parameters.remove(name)

-        return expected_modules, optional_parameters
+        return sorted(expected_modules), sorted(optional_parameters)

     @classmethod
     def _get_signature_types(cls):
@@ -1652,10 +1652,12 @@ def components(self) -> Dict[str, Any]:
             k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters
         }

-        if set(components.keys()) != expected_modules:
+        actual = sorted(set(components.keys()))
+        expected = sorted(expected_modules)
+        if actual != expected:
             raise ValueError(
                 f"{self} has been incorrectly initialized or {self.__class__} is incorrectly implemented. Expected"
-                f" {expected_modules} to be defined, but {components.keys()} are defined."
+                f" {expected} to be defined, but {actual} are defined."
             )

         return components
```
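Note: the motivation is determinism. Python sets compare equal regardless of order but iterate and print in arbitrary order, so the old error message (and the return order of `_get_signature_keys`) could differ between runs; and since the helper now returns sorted lists rather than sets, the comparison in `components` must be rewritten to compare sorted lists as well. In miniature (illustrative names only):

```python
expected_modules = sorted({"vae", "unet", "text_encoder"})  # now a list, not a set
components = {"unet": object(), "text_encoder": object()}

actual = sorted(set(components.keys()))
if actual != expected_modules:
    # Stable ordering -> stable, diff-able error message.
    print(f"Expected {expected_modules} to be defined, but {actual} are defined.")
```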

tests/pipelines/test_pipeline_utils.py
Lines changed: 102 additions & 1 deletion

```diff
@@ -19,7 +19,7 @@
     UNet2DConditionModel,
 )
 from diffusers.pipelines.pipeline_loading_utils import is_safetensors_compatible, variant_compatible_siblings
-from diffusers.utils.testing_utils import torch_device
+from diffusers.utils.testing_utils import require_torch_gpu, torch_device


 class IsSafetensorsCompatibleTests(unittest.TestCase):
@@ -826,3 +826,104 @@ def test_video_to_video(self):
         with io.StringIO() as stderr, contextlib.redirect_stderr(stderr):
             _ = pipe(**inputs)
         self.assertTrue(stderr.getvalue() == "", "Progress bar should be disabled")
+
+
+@require_torch_gpu
+class PipelineDeviceAndDtypeStabilityTests(unittest.TestCase):
+    expected_pipe_device = torch.device("cuda:0")
+    expected_pipe_dtype = torch.float64
+
+    def get_dummy_components_image_generation(self):
+        cross_attention_dim = 8
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=1,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=cross_attention_dim,
+            intermediate_size=16,
+            layer_norm_eps=1e-05,
+            num_attention_heads=2,
+            num_hidden_layers=2,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def test_deterministic_device(self):
+        components = self.get_dummy_components_image_generation()
+
+        pipe = StableDiffusionPipeline(**components)
+        pipe.to(device=torch_device, dtype=torch.float32)
+
+        pipe.unet.to(device="cpu")
+        pipe.vae.to(device="cuda")
+        pipe.text_encoder.to(device="cuda:0")
+
+        pipe_device = pipe.device
+
+        self.assertEqual(
+            self.expected_pipe_device,
+            pipe_device,
+            f"Wrong expected device. Expected {self.expected_pipe_device}. Got {pipe_device}.",
+        )
+
+    def test_deterministic_dtype(self):
+        components = self.get_dummy_components_image_generation()
+
+        pipe = StableDiffusionPipeline(**components)
+        pipe.to(device=torch_device, dtype=torch.float32)
+
+        pipe.unet.to(dtype=torch.float16)
+        pipe.vae.to(dtype=torch.float32)
+        pipe.text_encoder.to(dtype=torch.float64)
+
+        pipe_dtype = pipe.dtype
+
+        self.assertEqual(
+            self.expected_pipe_dtype,
+            pipe_dtype,
+            f"Wrong expected dtype. Expected {self.expected_pipe_dtype}. Got {pipe_dtype}.",
+        )
```
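Note: the assertions encode the resolution rule the pipeline-level properties are expected to follow when submodules disagree: an accelerator placement wins over cpu, and the widest floating-point dtype wins. A hypothetical reduction matching those expectations (illustrative only, not the pipeline's actual implementation):

```python
import torch

def resolve_device(devices):
    # Any non-CPU placement takes precedence over cpu (cuda:0 in the test).
    non_cpu = [d for d in devices if d.type != "cpu"]
    return non_cpu[0] if non_cpu else torch.device("cpu")

def resolve_dtype(dtypes):
    # Widest floating-point type wins: float64 > float32 > float16.
    return max(dtypes, key=lambda d: torch.finfo(d).bits)

print(resolve_device([torch.device("cpu"), torch.device("cuda:0")]))  # cuda:0
print(resolve_dtype([torch.float16, torch.float32, torch.float64]))   # torch.float64
```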
