Skip to content

Commit a79876d

Browse files
Merge branch 'main' into lora-hot-swapping
2 parents f14146f + e031caf commit a79876d

File tree

101 files changed

+3414
-630
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

101 files changed

+3414
-630
lines changed

docs/source/en/conceptual/evaluation.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ specific language governing permissions and limitations under the License.
1616
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
1717
</a>
1818

19+
> [!TIP]
20+
> This document has now grown outdated given the emergence of evaluation frameworks for diffusion models for image generation. Please check
21+
> out works like [HEIM](https://crfm.stanford.edu/helm/heim/latest/), [T2I-Compbench](https://arxiv.org/abs/2307.06350),
22+
> [GenEval](https://arxiv.org/abs/2310.11513).
23+
1924
Evaluation of generative models like [Stable Diffusion](https://huggingface.co/docs/diffusers/stable_diffusion) is subjective in nature. But as practitioners and researchers, we often have to make careful choices amongst many different possibilities. So, when working with different generative models (like GANs, Diffusion, etc.), how do we choose one over the other?
2025

2126
Qualitative evaluation of such models can be error-prone and might incorrectly influence a decision.

examples/advanced_diffusion_training/train_dreambooth_lora_flux_advanced.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ def log_validation(
227227
pipeline.set_progress_bar_config(disable=True)
228228

229229
# run inference
230-
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
230+
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed is not None else None
231231
autocast_ctx = nullcontext()
232232

233233
with autocast_ctx:
@@ -880,9 +880,7 @@ def save_embeddings(self, file_path: str):
880880
idx_to_text_encoder_name = {0: "clip_l", 1: "t5"}
881881
for idx, text_encoder in enumerate(self.text_encoders):
882882
train_ids = self.train_ids if idx == 0 else self.train_ids_t5
883-
embeds = (
884-
text_encoder.text_model.embeddings.token_embedding if idx == 0 else text_encoder.encoder.embed_tokens
885-
)
883+
embeds = text_encoder.text_model.embeddings.token_embedding if idx == 0 else text_encoder.shared
886884
assert embeds.weight.data.shape[0] == len(self.tokenizers[idx]), "Tokenizers should be the same."
887885
new_token_embeddings = embeds.weight.data[train_ids]
888886

@@ -904,9 +902,7 @@ def device(self):
904902
@torch.no_grad()
905903
def retract_embeddings(self):
906904
for idx, text_encoder in enumerate(self.text_encoders):
907-
embeds = (
908-
text_encoder.text_model.embeddings.token_embedding if idx == 0 else text_encoder.encoder.embed_tokens
909-
)
905+
embeds = text_encoder.text_model.embeddings.token_embedding if idx == 0 else text_encoder.shared
910906
index_no_updates = self.embeddings_settings[f"index_no_updates_{idx}"]
911907
embeds.weight.data[index_no_updates] = (
912908
self.embeddings_settings[f"original_embeddings_{idx}"][index_no_updates]
@@ -1749,7 +1745,7 @@ def load_model_hook(models, input_dir):
17491745
if args.enable_t5_ti: # whether to do pivotal tuning/textual inversion for T5 as well
17501746
text_lora_parameters_two = []
17511747
for name, param in text_encoder_two.named_parameters():
1752-
if "token_embedding" in name:
1748+
if "shared" in name:
17531749
# ensure that dtype is float32, even if rest of the model that isn't trained is loaded in fp16
17541750
param.data = param.to(dtype=torch.float32)
17551751
param.requires_grad = True

examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1883,7 +1883,11 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
18831883
pipeline.set_progress_bar_config(disable=True)
18841884

18851885
# run inference
1886-
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
1886+
generator = (
1887+
torch.Generator(device=accelerator.device).manual_seed(args.seed)
1888+
if args.seed is not None
1889+
else None
1890+
)
18871891
pipeline_args = {"prompt": args.validation_prompt}
18881892

18891893
if torch.backends.mps.is_available():
@@ -1987,7 +1991,9 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
19871991
)
19881992
# run inference
19891993
pipeline = pipeline.to(accelerator.device)
1990-
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
1994+
generator = (
1995+
torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed is not None else None
1996+
)
19911997
images = [
19921998
pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
19931999
for _ in range(args.num_validation_images)

examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ def log_validation(
269269
pipeline.set_progress_bar_config(disable=True)
270270

271271
# run inference
272-
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
272+
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed is not None else None
273273
# Currently the context determination is a bit hand-wavy. We can improve it in the future if there's a better
274274
# way to condition it. Reference: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
275275
if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:

examples/cogvideo/train_cogvideox_image_to_video_lora.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -722,7 +722,7 @@ def log_validation(
722722
# pipe.set_progress_bar_config(disable=True)
723723

724724
# run inference
725-
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
725+
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed is not None else None
726726

727727
videos = []
728728
for _ in range(args.num_validation_videos):

examples/cogvideo/train_cogvideox_lora.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -739,7 +739,7 @@ def log_validation(
739739
# pipe.set_progress_bar_config(disable=True)
740740

741741
# run inference
742-
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
742+
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed is not None else None
743743

744744
videos = []
745745
for _ in range(args.num_validation_videos):

examples/community/README.md

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
5353
| Stable Diffusion Mixture Tiling Pipeline SD 1.5 | A pipeline generates cohesive images by integrating multiple diffusion processes, each focused on a specific image region and considering boundary effects for smooth blending | [Stable Diffusion Mixture Tiling Pipeline SD 1.5](#stable-diffusion-mixture-tiling-pipeline-sd-15) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/albarji/mixture-of-diffusers) | [Álvaro B Jiménez](https://github.com/albarji/) |
5454
| Stable Diffusion Mixture Canvas Pipeline SD 1.5 | A pipeline generates cohesive images by integrating multiple diffusion processes, each focused on a specific image region and considering boundary effects for smooth blending. Works by defining a list of Text2Image region objects that detail the region of influence of each diffuser. | [Stable Diffusion Mixture Canvas Pipeline SD 1.5](#stable-diffusion-mixture-canvas-pipeline-sd-15) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/albarji/mixture-of-diffusers) | [Álvaro B Jiménez](https://github.com/albarji/) |
5555
| Stable Diffusion Mixture Tiling Pipeline SDXL | A pipeline generates cohesive images by integrating multiple diffusion processes, each focused on a specific image region and considering boundary effects for smooth blending | [Stable Diffusion Mixture Tiling Pipeline SDXL](#stable-diffusion-mixture-tiling-pipeline-sdxl) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/elismasilva/mixture-of-diffusers-sdxl-tiling) | [Eliseu Silva](https://github.com/DEVAIEXP/) |
56+
| Stable Diffusion MoD ControlNet Tile SR Pipeline SDXL | This is an advanced pipeline that leverages ControlNet Tile and Mixture-of-Diffusers techniques, integrating tile diffusion directly into the latent space denoising process. Designed to overcome the limitations of conventional pixel-space tile processing, this pipeline delivers Super Resolution (SR) upscaling for higher-quality images, reduced processing time, and greater adaptability. | [Stable Diffusion MoD ControlNet Tile SR Pipeline SDXL](#stable-diffusion-mod-controlnet-tile-sr-pipeline-sdxl) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/elismasilva/mod-control-tile-upscaler-sdxl) | [Eliseu Silva](https://github.com/DEVAIEXP/) |
5657
| FABRIC - Stable Diffusion with feedback Pipeline | pipeline supports feedback from liked and disliked images | [Stable Diffusion Fabric Pipeline](#stable-diffusion-fabric-pipeline) | [Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/stable_diffusion_fabric.ipynb)| [Shauray Singh](https://shauray8.github.io/about_shauray/) |
5758
| sketch inpaint - Inpainting with non-inpaint Stable Diffusion | sketch inpaint much like in automatic1111 | [Masked Im2Im Stable Diffusion Pipeline](#stable-diffusion-masked-im2im) | - | [Anatoly Belikov](https://github.com/noskill) |
5859
| sketch inpaint xl - Inpainting with non-inpaint Stable Diffusion | sketch inpaint much like in automatic1111 | [Masked Im2Im Stable Diffusion XL Pipeline](#stable-diffusion-xl-masked-im2im) | - | [Anatoly Belikov](https://github.com/noskill) |
@@ -2630,6 +2631,103 @@ image = pipe(
26302631

26312632
![mixture_tiling_results](https://huggingface.co/datasets/elismasilva/results/resolve/main/mixture_of_diffusers_sdxl_1.png)
26322633

2634+
### Stable Diffusion MoD ControlNet Tile SR Pipeline SDXL
2635+
2636+
This pipeline implements the [MoD (Mixture-of-Diffusers)](https://arxiv.org/pdf/2408.06072) tiled diffusion technique and combines it with SDXL's ControlNet Tile process to generate SR images.
2637+
2638+
This works best with 4x scaling, but you can try adjusting the parameters for higher scales.
2639+
2640+
````python
2641+
import torch
2642+
from diffusers import DiffusionPipeline, ControlNetUnionModel, AutoencoderKL, UniPCMultistepScheduler, UNet2DConditionModel
2643+
from diffusers.utils import load_image
2644+
from PIL import Image
2645+
2646+
device = "cuda"
2647+
2648+
# Initialize the models and pipeline
2649+
controlnet = ControlNetUnionModel.from_pretrained(
2650+
"brad-twinkl/controlnet-union-sdxl-1.0-promax", torch_dtype=torch.float16
2651+
).to(device=device)
2652+
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to(device=device)
2653+
2654+
model_id = "SG161222/RealVisXL_V5.0"
2655+
pipe = DiffusionPipeline.from_pretrained(
2656+
model_id,
2657+
torch_dtype=torch.float16,
2658+
vae=vae,
2659+
controlnet=controlnet,
2660+
custom_pipeline="mod_controlnet_tile_sr_sdxl",
2661+
use_safetensors=True,
2662+
variant="fp16",
2663+
).to(device)
2664+
2665+
unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet", variant="fp16", use_safetensors=True)
2666+
2667+
#pipe.enable_model_cpu_offload() # << Enable this if you have limited VRAM
2668+
pipe.enable_vae_tiling() # << Enable this if you have limited VRAM
2669+
pipe.enable_vae_slicing() # << Enable this if you have limited VRAM
2670+
2671+
# Set selected scheduler
2672+
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
2673+
2674+
# Load image
2675+
control_image = load_image("https://huggingface.co/datasets/DEVAIEXP/assets/resolve/main/1.jpg")
2676+
original_height = control_image.height
2677+
original_width = control_image.width
2678+
print(f"Current resolution: H:{original_height} x W:{original_width}")
2679+
2680+
# Pre-upscale image for tiling
2681+
resolution = 4096
2682+
tile_gaussian_sigma = 0.3
2683+
max_tile_size = 1024 # or 1280
2684+
2685+
current_size = max(control_image.size)
2686+
scale_factor = max(2, resolution / current_size)
2687+
new_size = (int(control_image.width * scale_factor), int(control_image.height * scale_factor))
2688+
image = control_image.resize(new_size, Image.LANCZOS)
2689+
2690+
# Update target height and width
2691+
target_height = image.height
2692+
target_width = image.width
2693+
print(f"Target resolution: H:{target_height} x W:{target_width}")
2694+
2695+
# Calculate overlap size
2696+
normal_tile_overlap, border_tile_overlap = pipe.calculate_overlap(target_width, target_height)
2697+
2698+
# Set other params
2699+
tile_weighting_method = pipe.TileWeightingMethod.COSINE.value
2700+
guidance_scale = 4
2701+
num_inference_steps = 35
2702+
denoising_strenght = 0.65
2703+
controlnet_strength = 1.0
2704+
prompt = "high-quality, noise-free edges, high quality, 4k, hd, 8k"
2705+
negative_prompt = "blurry, pixelated, noisy, low resolution, artifacts, poor details"
2706+
2707+
# Image generation
2708+
generated_image = pipe(
2709+
image=image,
2710+
control_image=control_image,
2711+
control_mode=[6],
2712+
controlnet_conditioning_scale=float(controlnet_strength),
2713+
prompt=prompt,
2714+
negative_prompt=negative_prompt,
2715+
normal_tile_overlap=normal_tile_overlap,
2716+
border_tile_overlap=border_tile_overlap,
2717+
height=target_height,
2718+
width=target_width,
2719+
original_size=(original_width, original_height),
2720+
target_size=(target_width, target_height),
2721+
guidance_scale=guidance_scale,
2722+
strength=float(denoising_strenght),
2723+
tile_weighting_method=tile_weighting_method,
2724+
max_tile_size=max_tile_size,
2725+
tile_gaussian_sigma=float(tile_gaussian_sigma),
2726+
num_inference_steps=num_inference_steps,
2727+
)["images"][0]
2728+
````
2729+
![Upscaled](https://huggingface.co/datasets/DEVAIEXP/assets/resolve/main/1_input_4x.png)
2730+
26332731
### TensorRT Inpainting Stable Diffusion Pipeline
26342732

26352733
The TensorRT Pipeline can be used to accelerate the Inpainting Stable Diffusion Inference run.

0 commit comments

Comments
 (0)