@@ -629,12 +629,8 @@ def _test_quant_type(self, quantization_config, expected_slice):
         output = pipe(**inputs)[0].flatten()
         output_slice = np.concatenate((output[:16], output[-16:]))

-        for weight in [
-            pipe.transformer.x_embedder.weight,
-            pipe.transformer.transformer_blocks[0].ff.net[2].weight,
-            pipe.transformer.transformer_blocks[-1].ff.net[2].weight,
-        ]:
-            self.assertTrue(isinstance(weight, AffineQuantizedTensor))
+        weight = pipe.transformer.x_embedder.weight
+        self.assertTrue(isinstance(weight, AffineQuantizedTensor))
         self.assertTrue(np.allclose(output_slice, expected_slice, atol=1e-3, rtol=1e-3))

     def test_quantization(self):
@@ -643,7 +639,7 @@ def test_quantization(self):
             ("int8wo", np.array([0.0505, 0.0742, 0.1367, 0.0429, 0.0585, 0.1386, 0.0585, 0.0703, 0.1367, 0.0566, 0.0703, 0.1464, 0.0546, 0.0703, 0.1425, 0.0546, 0.3535, 0.7578, 0.5000, 0.4062, 0.7656, 0.5117, 0.4121, 0.7656, 0.5117, 0.3984, 0.7578, 0.5234, 0.4023, 0.7382, 0.5390, 0.4570])),
             ("int8dq", np.array([0.0546, 0.0761, 0.1386, 0.0488, 0.0644, 0.1425, 0.0605, 0.0742, 0.1406, 0.0625, 0.0722, 0.1523, 0.0625, 0.0742, 0.1503, 0.0605, 0.3886, 0.7968, 0.5507, 0.4492, 0.7890, 0.5351, 0.4316, 0.8007, 0.5390, 0.4179, 0.8281, 0.5820, 0.4531, 0.7812, 0.5703, 0.4921])),
         ]
-
+  
         if TorchAoConfig._is_cuda_capability_atleast_8_9():
             QUANTIZATION_TYPES_TO_TEST.extend([
                 ("float8wo_e4m3", np.array([0.0546, 0.0722, 0.1328, 0.0468, 0.0585, 0.1367, 0.0605, 0.0703, 0.1328, 0.0625, 0.0703, 0.1445, 0.0585, 0.0703, 0.1406, 0.0605, 0.3496, 0.7109, 0.4843, 0.4042, 0.7226, 0.5000, 0.4160, 0.7031, 0.4824, 0.3886, 0.6757, 0.4667, 0.3710, 0.6679, 0.4902, 0.4238])),
@@ -672,10 +668,41 @@ def test_serialization(self):

         with tempfile.TemporaryDirectory() as tmp_dir:
             pipe.save_pretrained(tmp_dir, safe_serialization=False)
-            loaded_pipe = FluxPipeline.from_pretrained(tmp_dir, use_safetensors=False).to(torch_device)
+            del pipe
+            gc.collect()
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+            loaded_pipe = FluxPipeline.from_pretrained(tmp_dir, use_safetensors=False)
+            loaded_pipe.enable_model_cpu_offload()

         weight = loaded_pipe.transformer.x_embedder.weight
         self.assertTrue(isinstance(weight, AffineQuantizedTensor))

         loaded_output = loaded_pipe(**inputs)[0].flatten()
         self.assertTrue(np.allclose(output, loaded_output, atol=1e-3, rtol=1e-3))
+
+    def test_memory_footprint_int4wo(self):
+        # The original checkpoints are in bf16 and about 24 GB
+        expected_memory_in_gb = 6.0
+        quantization_config = TorchAoConfig("int4wo")
+        transformer = FluxTransformer2DModel.from_pretrained(
+            "black-forest-labs/FLUX.1-dev",
+            subfolder="transformer",
+            quantization_config=quantization_config,
+            torch_dtype=torch.bfloat16,
+        )
+        int4wo_memory_in_gb = get_model_size_in_bytes(transformer) / 1024**3
+        self.assertTrue(int4wo_memory_in_gb < expected_memory_in_gb)
+
+    def test_memory_footprint_int8wo(self):
+        # The original checkpoints are in bf16 and about 24 GB
+        expected_memory_in_gb = 12.0
+        quantization_config = TorchAoConfig("int8wo")
+        transformer = FluxTransformer2DModel.from_pretrained(
+            "black-forest-labs/FLUX.1-dev",
+            subfolder="transformer",
+            quantization_config=quantization_config,
+            torch_dtype=torch.bfloat16,
+        )
+        int8wo_memory_in_gb = get_model_size_in_bytes(transformer) / 1024**3
+        self.assertTrue(int8wo_memory_in_gb < expected_memory_in_gb)