Commit 1bb7906

Merge branch 'main' into add-mod-controlnet-tile-sdxl
2 parents 517bc36 + dcd77ce

72 files changed: +742 additions, −505 deletions (large commits hide some content by default; only a subset of the changed files is shown below).

docs/source/en/conceptual/evaluation.md

Lines changed: 5 additions & 0 deletions
@@ -16,6 +16,11 @@ specific language governing permissions and limitations under the License.
 <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
 </a>
 
+> [!TIP]
+> This document has now grown outdated given the emergence of existing evaluation frameworks for diffusion models for image generation. Please check
+> out works like [HEIM](https://crfm.stanford.edu/helm/heim/latest/), [T2I-Compbench](https://arxiv.org/abs/2307.06350),
+> [GenEval](https://arxiv.org/abs/2310.11513).
+
 Evaluation of generative models like [Stable Diffusion](https://huggingface.co/docs/diffusers/stable_diffusion) is subjective in nature. But as practitioners and researchers, we often have to make careful choices amongst many different possibilities. So, when working with different generative models (like GANs, Diffusion, etc.), how do we choose one over the other?
 
 Qualitative evaluation of such models can be error-prone and might incorrectly influence a decision.

src/diffusers/loaders/ip_adapter.py

Lines changed: 7 additions & 5 deletions
@@ -215,7 +215,8 @@ def load_ip_adapter(
                     low_cpu_mem_usage=low_cpu_mem_usage,
                     cache_dir=cache_dir,
                     local_files_only=local_files_only,
-                ).to(self.device, dtype=self.dtype)
+                    torch_dtype=self.dtype,
+                ).to(self.device)
                 self.register_modules(image_encoder=image_encoder)
             else:
                 raise ValueError(
@@ -526,8 +527,9 @@ def load_ip_adapter(
                     low_cpu_mem_usage=low_cpu_mem_usage,
                     cache_dir=cache_dir,
                     local_files_only=local_files_only,
+                    dtype=image_encoder_dtype,
                 )
-                .to(self.device, dtype=image_encoder_dtype)
+                .to(self.device)
                 .eval()
             )
             self.register_modules(image_encoder=image_encoder)
@@ -805,9 +807,9 @@ def load_ip_adapter(
                 feature_extractor=SiglipImageProcessor.from_pretrained(image_encoder_subfolder, **kwargs).to(
                     self.device, dtype=self.dtype
                 ),
-                image_encoder=SiglipVisionModel.from_pretrained(image_encoder_subfolder, **kwargs).to(
-                    self.device, dtype=self.dtype
-                ),
+                image_encoder=SiglipVisionModel.from_pretrained(
+                    image_encoder_subfolder, torch_dtype=self.dtype, **kwargs
+                ).to(self.device),
             )
         else:
             raise ValueError(
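The common thread in these hunks is passing the target dtype to `from_pretrained` (via `torch_dtype` or `dtype`) instead of casting afterwards with `.to(dtype=...)`, so the weights are materialized directly in the target precision rather than loaded in float32 and converted. A minimal sketch of the pattern, with an illustrative repo id and subfolder:

```python
import torch
from transformers import CLIPVisionModelWithProjection

# Loading directly in half precision avoids first materializing fp32
# weights and then casting them, which briefly doubles memory use.
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "h94/IP-Adapter",                  # illustrative repo id
    subfolder="models/image_encoder",  # illustrative subfolder
    torch_dtype=torch.float16,
)
image_encoder = image_encoder.to("cuda")  # only the device move remains
```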

src/diffusers/loaders/lora_conversion_utils.py

Lines changed: 71 additions & 0 deletions
@@ -1276,3 +1276,74 @@ def remap_single_transformer_blocks_(key, state_dict):
         converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)
 
     return converted_state_dict
+
+
+def _convert_non_diffusers_lumina2_lora_to_diffusers(state_dict):
+    # Remove "diffusion_model." prefix from keys.
+    state_dict = {k[len("diffusion_model.") :]: v for k, v in state_dict.items()}
+    converted_state_dict = {}
+
+    def get_num_layers(keys, pattern):
+        layers = set()
+        for key in keys:
+            match = re.search(pattern, key)
+            if match:
+                layers.add(int(match.group(1)))
+        return len(layers)
+
+    def process_block(prefix, index, convert_norm):
+        # Process attention qkv: pop lora_A and lora_B weights.
+        lora_down = state_dict.pop(f"{prefix}.{index}.attention.qkv.lora_A.weight")
+        lora_up = state_dict.pop(f"{prefix}.{index}.attention.qkv.lora_B.weight")
+        for attn_key in ["to_q", "to_k", "to_v"]:
+            converted_state_dict[f"{prefix}.{index}.attn.{attn_key}.lora_A.weight"] = lora_down
+        for attn_key, weight in zip(["to_q", "to_k", "to_v"], torch.split(lora_up, [2304, 768, 768], dim=0)):
+            converted_state_dict[f"{prefix}.{index}.attn.{attn_key}.lora_B.weight"] = weight
+
+        # Process attention out weights.
+        converted_state_dict[f"{prefix}.{index}.attn.to_out.0.lora_A.weight"] = state_dict.pop(
+            f"{prefix}.{index}.attention.out.lora_A.weight"
+        )
+        converted_state_dict[f"{prefix}.{index}.attn.to_out.0.lora_B.weight"] = state_dict.pop(
+            f"{prefix}.{index}.attention.out.lora_B.weight"
+        )
+
+        # Process feed-forward weights for layers 1, 2, and 3.
+        for layer in range(1, 4):
+            converted_state_dict[f"{prefix}.{index}.feed_forward.linear_{layer}.lora_A.weight"] = state_dict.pop(
+                f"{prefix}.{index}.feed_forward.w{layer}.lora_A.weight"
+            )
+            converted_state_dict[f"{prefix}.{index}.feed_forward.linear_{layer}.lora_B.weight"] = state_dict.pop(
+                f"{prefix}.{index}.feed_forward.w{layer}.lora_B.weight"
+            )
+
+        if convert_norm:
+            converted_state_dict[f"{prefix}.{index}.norm1.linear.lora_A.weight"] = state_dict.pop(
+                f"{prefix}.{index}.adaLN_modulation.1.lora_A.weight"
+            )
+            converted_state_dict[f"{prefix}.{index}.norm1.linear.lora_B.weight"] = state_dict.pop(
+                f"{prefix}.{index}.adaLN_modulation.1.lora_B.weight"
+            )
+
+    noise_refiner_pattern = r"noise_refiner\.(\d+)\."
+    num_noise_refiner_layers = get_num_layers(state_dict.keys(), noise_refiner_pattern)
+    for i in range(num_noise_refiner_layers):
+        process_block("noise_refiner", i, convert_norm=True)
+
+    context_refiner_pattern = r"context_refiner\.(\d+)\."
+    num_context_refiner_layers = get_num_layers(state_dict.keys(), context_refiner_pattern)
+    for i in range(num_context_refiner_layers):
+        process_block("context_refiner", i, convert_norm=False)
+
+    core_transformer_pattern = r"layers\.(\d+)\."
+    num_core_transformer_layers = get_num_layers(state_dict.keys(), core_transformer_pattern)
+    for i in range(num_core_transformer_layers):
+        process_block("layers", i, convert_norm=True)
+
+    if len(state_dict) > 0:
+        raise ValueError(f"`state_dict` should be empty at this point but has {state_dict.keys()=}")
+
+    for key in list(converted_state_dict.keys()):
+        converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)
+
+    return converted_state_dict
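To make the key mapping concrete, here is a toy invocation of the new converter with zero-filled rank-4 tensors covering a single `layers.0` block. All shapes are illustrative; the 2304/768/768 split mirrors the `torch.split` call above:

```python
import torch

from diffusers.loaders.lora_conversion_utils import _convert_non_diffusers_lumina2_lora_to_diffusers

rank = 4
p = "diffusion_model.layers.0"
sd = {
    f"{p}.attention.qkv.lora_A.weight": torch.zeros(rank, 2304),
    f"{p}.attention.qkv.lora_B.weight": torch.zeros(2304 + 768 + 768, rank),
    f"{p}.attention.out.lora_A.weight": torch.zeros(rank, 2304),
    f"{p}.attention.out.lora_B.weight": torch.zeros(2304, rank),
    f"{p}.adaLN_modulation.1.lora_A.weight": torch.zeros(rank, 2304),
    f"{p}.adaLN_modulation.1.lora_B.weight": torch.zeros(2304, rank),
}
for i in (1, 2, 3):
    sd[f"{p}.feed_forward.w{i}.lora_A.weight"] = torch.zeros(rank, 2304)
    sd[f"{p}.feed_forward.w{i}.lora_B.weight"] = torch.zeros(2304, rank)

converted = _convert_non_diffusers_lumina2_lora_to_diffusers(sd)
# Keys come back namespaced for diffusers, e.g.
# "transformer.layers.0.attn.to_q.lora_A.weight" and
# "transformer.layers.0.feed_forward.linear_1.lora_B.weight".
```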

src/diffusers/loaders/lora_pipeline.py

Lines changed: 6 additions & 1 deletion
@@ -41,6 +41,7 @@
     _convert_hunyuan_video_lora_to_diffusers,
     _convert_kohya_flux_lora_to_diffusers,
     _convert_non_diffusers_lora_to_diffusers,
+    _convert_non_diffusers_lumina2_lora_to_diffusers,
     _convert_xlabs_flux_lora_to_diffusers,
     _maybe_map_sgm_blocks_to_diffusers,
 )
@@ -3815,7 +3816,6 @@ class Lumina2LoraLoaderMixin(LoraBaseMixin):
 
     @classmethod
     @validate_hf_hub_args
-    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.lora_state_dict
     def lora_state_dict(
         cls,
         pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
@@ -3909,6 +3909,11 @@ def lora_state_dict(
             logger.warning(warn_msg)
         state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}
 
+        # conversion.
+        non_diffusers = any(k.startswith("diffusion_model.") for k in state_dict)
+        if non_diffusers:
+            state_dict = _convert_non_diffusers_lumina2_lora_to_diffusers(state_dict)
+
         return state_dict
 
     # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.load_lora_weights
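The user-visible effect is that Lumina2 LoRAs saved in the original, `diffusion_model.`-prefixed layout now load transparently: `lora_state_dict()` detects the prefix and routes the checkpoint through the converter. A sketch, where the pipeline class and checkpoint names are assumed for illustration:

```python
import torch
from diffusers import Lumina2Text2ImgPipeline  # class name assumed

pipe = Lumina2Text2ImgPipeline.from_pretrained(
    "Alpha-VLLM/Lumina-Image-2.0", torch_dtype=torch.bfloat16  # repo id assumed
)
# A LoRA whose keys start with "diffusion_model." is converted on the fly.
pipe.load_lora_weights("user/lumina2-lora", weight_name="pytorch_lora_weights.safetensors")
```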

src/diffusers/quantizers/quantization_config.py

Lines changed: 13 additions & 1 deletion
@@ -47,6 +47,16 @@ class QuantizationMethod(str, Enum):
     TORCHAO = "torchao"
 
 
+if is_torchao_available():
+    from torchao.quantization.quant_primitives import MappingType
+
+    class TorchAoJSONEncoder(json.JSONEncoder):
+        def default(self, obj):
+            if isinstance(obj, MappingType):
+                return obj.name
+            return super().default(obj)
+
+
 @dataclass
 class QuantizationConfigMixin:
     """
@@ -673,4 +683,6 @@ def __repr__(self):
         ```
         """
         config_dict = self.to_dict()
-        return f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True)}\n"
+        return (
+            f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True, cls=TorchAoJSONEncoder)}\n"
+        )
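This fixes `__repr__` crashing when a torchao config holds a `MappingType` enum member, which `json.dumps` cannot serialize natively. A self-contained sketch of the same encoder pattern, using a stand-in enum since torchao may not be installed:

```python
import json
from enum import Enum


class MappingType(Enum):  # stand-in for torchao.quantization.quant_primitives.MappingType
    SYMMETRIC = 0
    ASYMMETRIC = 1


class EnumNameEncoder(json.JSONEncoder):
    def default(self, obj):
        # Serialize enum members by name; anything else falls through to
        # the base class, which raises TypeError as json expects.
        if isinstance(obj, Enum):
            return obj.name
        return super().default(obj)


print(json.dumps({"mapping_type": MappingType.SYMMETRIC}, cls=EnumNameEncoder))
# {"mapping_type": "SYMMETRIC"}
```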

tests/lora/test_lora_layers_sd.py

Lines changed: 10 additions & 9 deletions
@@ -33,11 +33,12 @@
 )
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     load_image,
     nightly,
     numpy_cosine_similarity_distance,
     require_peft_backend,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -101,7 +102,7 @@ def tearDown(self):
     # Keeping this test here makes sense because it doesn't look any integration
     # (value assertions on logits).
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_integration_move_lora_cpu(self):
         path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
         lora_id = "takuma104/lora-test-text-encoder-lora-target"
@@ -158,7 +159,7 @@ def test_integration_move_lora_cpu(self):
         self.assertTrue(m.weight.device != torch.device("cpu"))
 
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_integration_move_lora_dora_cpu(self):
         from peft import LoraConfig
 
@@ -209,18 +210,18 @@ def test_integration_move_lora_dora_cpu(self):
 
 @slow
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 @require_peft_backend
 class LoraIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_integration_logits_with_scale(self):
         path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
@@ -378,7 +379,7 @@ def test_a1111_with_model_cpu_offload(self):
         generator = torch.Generator().manual_seed(0)
 
         pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
         lora_filename = "light_and_shadow.safetensors"
         pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
@@ -400,7 +401,7 @@ def test_a1111_with_sequential_cpu_offload(self):
         generator = torch.Generator().manual_seed(0)
 
         pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
         lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
         lora_filename = "light_and_shadow.safetensors"
         pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
@@ -656,7 +657,7 @@ def test_sd_load_civitai_empty_network_alpha(self):
         See: https://github.com/huggingface/diffusers/issues/5606
         """
         pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
-        pipeline.enable_sequential_cpu_offload(device=torch_device)
         civitai_path = hf_hub_download("ybelkada/test-ahi-civitai", "ahi_lora_weights.safetensors")
         pipeline.load_lora_weights(civitai_path, adapter_name="ahri")
 
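This file and the SD3 test file below apply the same device-agnostic pattern: CUDA-only decorators and `torch.cuda.empty_cache()` calls are replaced with equivalents that dispatch on `torch_device`, so the suite can also run on other accelerators. Roughly, a `backend_empty_cache`-style helper looks like the sketch below (the real helper lives in `diffusers.utils.testing_utils`; backend availability depends on the PyTorch build):

```python
import torch


def empty_cache_for(device: str) -> None:
    # Best-effort allocator-cache clearing for the active test backend,
    # instead of assuming CUDA. A sketch of the real helper.
    if device == "cuda":
        torch.cuda.empty_cache()
    elif device == "mps":
        torch.mps.empty_cache()
    elif device == "xpu" and hasattr(torch, "xpu"):
        torch.xpu.empty_cache()  # requires a PyTorch build with XPU support
    # cpu: nothing to clear


empty_cache_for("cuda" if torch.cuda.is_available() else "cpu")
```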
tests/lora/test_lora_layers_sd3.py

Lines changed: 6 additions & 5 deletions
@@ -30,12 +30,13 @@
 from diffusers.utils import load_image
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     is_flaky,
     nightly,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
     require_peft_backend,
-    require_torch_gpu,
+    require_torch_accelerator,
     torch_device,
 )
 
@@ -93,7 +94,7 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
     def output_shape(self):
         return (1, 32, 32, 3)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_sd3_lora(self):
         """
         Test loading the loras that are saved with the diffusers and peft formats.
@@ -135,7 +136,7 @@ def test_multiple_wrong_adapter_name_raises_error(self):
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 @require_peft_backend
 @require_big_gpu_with_torch_cuda
 @pytest.mark.big_gpu_with_torch_cuda
@@ -146,12 +147,12 @@ class SD3LoraIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, seed=0):
         init_image = load_image(
