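Detect SD3/SD3.5 single-file checkpoints whose keys lack the `model.diffusion_model.` prefix; add SD3.5 GGUF tests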

DN6 · DN6 · commit e9303a0198b3 · 2024-12-05T10:30:46.000+01:00
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
@@ -81,8 +81,14 @@
     "open_clip_sd3": "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight",
     "stable_cascade_stage_b": "down_blocks.1.0.channelwise.0.weight",
     "stable_cascade_stage_c": "clip_txt_mapper.weight",
-    "sd3": "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias",
-    "sd35_large": "model.diffusion_model.joint_blocks.37.x_block.mlp.fc1.weight",
+    "sd3": [
+        "joint_blocks.0.context_block.adaLN_modulation.1.bias",
+        "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias",
+    ],
+    "sd35_large": [
+        "joint_blocks.37.x_block.mlp.fc1.weight",
+        "model.diffusion_model.joint_blocks.37.x_block.mlp.fc1.weight",
+    ],
     "animatediff": "down_blocks.0.motion_modules.0.temporal_transformer.transformer_blocks.0.attention_blocks.0.pos_encoder.pe",
     "animatediff_v2": "mid_block.motion_modules.0.temporal_transformer.norm.bias",
     "animatediff_sdxl_beta": "up_blocks.2.motion_modules.0.temporal_transformer.norm.weight",
@@ -529,13 +535,20 @@ def infer_diffusers_model_type(checkpoint):
     ):
         model_type = "stable_cascade_stage_b"
 
-    elif CHECKPOINT_KEY_NAMES["sd3"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["sd3"]].shape[-1] == 9216:
-        if checkpoint["model.diffusion_model.pos_embed"].shape[1] == 36864:
+    elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["sd3"]) and any(
+        checkpoint[key].shape[-1] == 9216 if key in checkpoint else False for key in CHECKPOINT_KEY_NAMES["sd3"]
+    ):
+        if "model.diffusion_model.pos_embed" in checkpoint:
+            key = "model.diffusion_model.pos_embed"
+        else:
+            key = "pos_embed"
+
+        if checkpoint[key].shape[1] == 36864:
             model_type = "sd3"
-        elif checkpoint["model.diffusion_model.pos_embed"].shape[1] == 147456:
+        elif checkpoint[key].shape[1] == 147456:
             model_type = "sd35_medium"
 
-    elif CHECKPOINT_KEY_NAMES["sd35_large"] in checkpoint:
+    elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["sd35_large"]):
         model_type = "sd35_large"
 
     elif CHECKPOINT_KEY_NAMES["animatediff"] in checkpoint:
diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py
@@ -3,7 +3,7 @@
 
 import torch
 
-from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig
+from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig, SD3Transformer2DModel
 from diffusers.utils.testing_utils import (
     is_gguf_available,
     nightly,
@@ -22,45 +22,16 @@
 @require_big_gpu_with_torch_cuda
 @require_accelerate
 @require_gguf_version_greater_or_equal("0.10.0")
-class GGUFSingleFileTests(unittest.TestCase):
-    ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"
+class GGUFSingleFileTesterMixin:
+    ckpt_path = None
+    model_cls = None
     torch_dtype = torch.bfloat16
-
-    def setUp(self):
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def tearDown(self):
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def get_dummy_inputs(self):
-        return {
-            "hidden_states": torch.randn((1, 4096, 64), generator=torch.Generator("cpu").manual_seed(0)).to(
-                torch_device, self.torch_dtype
-            ),
-            "encoder_hidden_states": torch.randn(
-                (1, 512, 4096),
-                generator=torch.Generator("cpu").manual_seed(0),
-            ).to(torch_device, self.torch_dtype),
-            "pooled_projections": torch.randn(
-                (1, 768),
-                generator=torch.Generator("cpu").manual_seed(0),
-            ).to(torch_device, self.torch_dtype),
-            "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype),
-            "img_ids": torch.randn((4096, 3), generator=torch.Generator("cpu").manual_seed(0)).to(
-                torch_device, self.torch_dtype
-            ),
-            "txt_ids": torch.randn((512, 3), generator=torch.Generator("cpu").manual_seed(0)).to(
-                torch_device, self.torch_dtype
-            ),
-            "guidance": torch.tensor([3.5]).to(torch_device, self.torch_dtype),
-        }
+    expected_memory_use_in_gb = 5
 
     def test_gguf_parameters(self):
         quant_storage_type = torch.uint8
         quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype)
-        model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config)
+        model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config)
 
         for param_name, param in model.named_parameters():
             if isinstance(param, GGUFParameter):
@@ -69,7 +40,7 @@ def test_gguf_parameters(self):
 
     def test_gguf_linear_layers(self):
         quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype)
-        model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config)
+        model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config)
 
         for name, module in model.named_modules():
             if isinstance(module, torch.nn.Linear) and hasattr(module.weight, "quant_type"):
@@ -78,29 +49,29 @@ def test_gguf_linear_layers(self):
     def test_gguf_memory_usage(self):
         quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype)
 
-        model = FluxTransformer2DModel.from_single_file(
+        model = self.model_cls.from_single_file(
             self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype
         )
         model.to("cuda")
-        assert (model.get_memory_footprint() / 1024**3) < 5
+        assert (model.get_memory_footprint() / 1024**3) < self.expected_memory_use_in_gb
         inputs = self.get_dummy_inputs()
 
         torch.cuda.reset_peak_memory_stats()
         torch.cuda.empty_cache()
         with torch.no_grad():
             model(**inputs)
         max_memory = torch.cuda.max_memory_allocated()
-        assert (max_memory / 1024**3) < 5
+        assert (max_memory / 1024**3) < self.expected_memory_use_in_gb
 
     def test_keep_modules_in_fp32(self):
         r"""
         A simple tests to check if the modules under `_keep_in_fp32_modules` are kept in fp32.
         Also ensures if inference works.
         """
-        FluxTransformer2DModel._keep_in_fp32_modules = ["proj_out"]
+        self.model_cls._keep_in_fp32_modules = ["proj_out"]
 
         quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype)
-        model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config)
+        model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config)
 
         for name, module in model.named_modules():
             if isinstance(module, torch.nn.Linear):
@@ -109,7 +80,7 @@ def test_keep_modules_in_fp32(self):
 
     def test_dtype_assignment(self):
         quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype)
-        model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config)
+        model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config)
 
         with self.assertRaises(ValueError):
             # Tries with a `dtype`
@@ -129,3 +100,103 @@ def test_dtype_assignment(self):
 
         # This should work
         model.to("cuda")
+
+
+class FluxGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase):
+    """Runs the checks inherited from `GGUFSingleFileTesterMixin` on `FluxTransformer2DModel`."""
+
+    model_cls = FluxTransformer2DModel
+    ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"
+    torch_dtype = torch.bfloat16
+    expected_memory_use_in_gb = 5
+
+    def setUp(self):
+        # Collect garbage and release cached CUDA memory before each test.
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # Same cleanup once the test has finished.
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_dummy_inputs(self):
+        """Return the keyword arguments for one forward pass of the model."""
+
+        def seeded_randn(shape):
+            # Every tensor is drawn from its own CPU generator seeded with 0,
+            # then moved to the test device in the class dtype.
+            generator = torch.Generator("cpu").manual_seed(0)
+            return torch.randn(shape, generator=generator).to(torch_device, self.torch_dtype)
+
+        return {
+            "hidden_states": seeded_randn((1, 4096, 64)),
+            "encoder_hidden_states": seeded_randn((1, 512, 4096)),
+            "pooled_projections": seeded_randn((1, 768)),
+            "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype),
+            "img_ids": seeded_randn((4096, 3)),
+            "txt_ids": seeded_randn((512, 3)),
+            "guidance": torch.tensor([3.5]).to(torch_device, self.torch_dtype),
+        }
+
+
+class SD35LargeGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase):
+    """Runs the checks inherited from `GGUFSingleFileTesterMixin` on the SD3.5 large GGUF checkpoint."""
+
+    model_cls = SD3Transformer2DModel
+    ckpt_path = "https://huggingface.co/city96/stable-diffusion-3.5-large-gguf/blob/main/sd3.5_large-Q4_0.gguf"
+    torch_dtype = torch.bfloat16
+    expected_memory_use_in_gb = 5
+
+    def setUp(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_dummy_inputs(self):
+        """Return forward-pass kwargs; random tensors come from CPU generators seeded with 0."""
+
+        def seeded_randn(shape):
+            generator = torch.Generator("cpu").manual_seed(0)
+            return torch.randn(shape, generator=generator).to(torch_device, self.torch_dtype)
+
+        return {
+            "hidden_states": seeded_randn((1, 16, 64, 64)),
+            "encoder_hidden_states": seeded_randn((1, 512, 4096)),
+            "pooled_projections": seeded_randn((1, 2048)),
+            "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype),
+        }
+
+
+class SD35MediumGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase):
+    """Runs the checks inherited from `GGUFSingleFileTesterMixin` on the SD3.5 medium GGUF checkpoint."""
+
+    model_cls = SD3Transformer2DModel
+    ckpt_path = "https://huggingface.co/city96/stable-diffusion-3.5-medium-gguf/blob/main/sd3.5_medium-Q3_K_M.gguf"
+    torch_dtype = torch.bfloat16
+    expected_memory_use_in_gb = 2
+
+    def setUp(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_dummy_inputs(self):
+        """Return forward-pass kwargs; random tensors come from CPU generators seeded with 0."""
+
+        def seeded_randn(shape):
+            generator = torch.Generator("cpu").manual_seed(0)
+            return torch.randn(shape, generator=generator).to(torch_device, self.torch_dtype)
+
+        return {
+            "hidden_states": seeded_randn((1, 16, 64, 64)),
+            "encoder_hidden_states": seeded_randn((1, 512, 4096)),
+            "pooled_projections": seeded_randn((1, 2048)),
+            "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype),
+        }