
Commit 6af2097

Merge branch 'main' into flux-control-lora
2 parents: 908d151 + c96bfa5

22 files changed: +2,142 −229 lines

docs/source/en/api/pipelines/flux.md

Lines changed: 1 addition & 1 deletion
@@ -148,7 +148,7 @@ image.save("output.png")
 **Note:** `black-forest-labs/Flux.1-Depth-dev` is _not_ a ControlNet model. [`ControlNetModel`] models are a separate component from the UNet/Transformer whose residuals are added to the actual underlying model. Depth Control is an alternate architecture that achieves effectively the same results as a ControlNet model would, by using channel-wise concatenation with input control condition and ensuring the transformer learns structure control by following the condition as closely as possible.

 ```python
-# !pip install git+https://github.com/asomoza/image_gen_aux.git
+# !pip install git+https://github.com/huggingface/image_gen_aux
 import torch
 from diffusers import FluxControlPipeline, FluxTransformer2DModel
 from diffusers.utils import load_image
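The diff context cuts the depth-control snippet off after the imports. As a rough sketch of how such an example typically continues, under stated assumptions: the `DepthPreprocessor` class from `image_gen_aux`, the `control_image` argument, the checkpoint id, and the input image URL below are assumptions for illustration, not part of this commit.

```python
# Hypothetical continuation of the truncated snippet above (a sketch, not the committed docs).
import torch
from diffusers import FluxControlPipeline
from diffusers.utils import load_image
from image_gen_aux import DepthPreprocessor  # assumption: depth preprocessor provided by image_gen_aux

pipe = FluxControlPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Depth-dev", torch_dtype=torch.bfloat16  # assumed checkpoint id
).to("cuda")

# placeholder input image; any RGB image works
control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
processor = DepthPreprocessor.from_pretrained("LiheYoung/depth-anything-large-hf")
control_image = processor(control_image)[0].convert("RGB")  # depth map fed as the control condition

image = pipe(
    prompt="A robot made of exotic candies",
    control_image=control_image,  # concatenated channel-wise with the latents, per the note above
    num_inference_steps=30,
    guidance_scale=10.0,
).images[0]
image.save("output.png")
```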

examples/community/README.md

Lines changed: 108 additions & 10 deletions
@@ -2619,16 +2619,17 @@ for obj in range(bs):

 ### Stable Diffusion XL Reference

-This pipeline uses the Reference. Refer to the [stable_diffusion_reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-reference).
+This pipeline uses the Reference. Refer to the [Stable Diffusion Reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-reference) section for more information.

 ```py
 import torch
-from PIL import Image
+# from diffusers import DiffusionPipeline
 from diffusers.utils import load_image
-from diffusers import DiffusionPipeline
 from diffusers.schedulers import UniPCMultistepScheduler

-input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
+from .stable_diffusion_xl_reference import StableDiffusionXLReferencePipeline
+
+input_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_reference_input_cat.jpg")

 # pipe = DiffusionPipeline.from_pretrained(
 #     "stabilityai/stable-diffusion-xl-base-1.0",
@@ -2646,22 +2647,22 @@ pipe = StableDiffusionXLReferencePipeline.from_pretrained(
 pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

 result_img = pipe(ref_image=input_image,
-      prompt="1girl",
+      prompt="a dog",
       num_inference_steps=20,
       reference_attn=True,
       reference_adain=True).images[0]
 ```

 Reference Image

-![reference_image](https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png)
+![reference_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_reference_input_cat.jpg)

 Output Image

-`prompt: 1 girl`
+`prompt: a dog`

-`reference_attn=True, reference_adain=True, num_inference_steps=20`
-![Output_image](https://github.com/zideliu/diffusers/assets/34944964/743848da-a215-48f9-ae39-b5e2ae49fb13)
+`reference_attn=False, reference_adain=True, num_inference_steps=20`
+![Output_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_reference_adain_dog.png)

 Reference Image
 ![reference_image](https://github.com/huggingface/diffusers/assets/34944964/449bdab6-e744-4fb2-9620-d4068d9a741b)
@@ -2683,6 +2684,88 @@ Output Image
 `reference_attn=True, reference_adain=True, num_inference_steps=20`
 ![output_image](https://github.com/huggingface/diffusers/assets/34944964/9b2f1aca-886f-49c3-89ec-d2031c8e3670)

+### Stable Diffusion XL ControlNet Reference
+
+This pipeline uses Reference Control together with ControlNet. Refer to the [Stable Diffusion ControlNet Reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-controlnet-reference) and [Stable Diffusion XL Reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-xl-reference) sections for more information.
+
+```py
+from diffusers import ControlNetModel, AutoencoderKL
+from diffusers.schedulers import UniPCMultistepScheduler
+from diffusers.utils import load_image
+import numpy as np
+import torch
+
+import cv2
+from PIL import Image
+
+from .stable_diffusion_xl_controlnet_reference import StableDiffusionXLControlNetReferencePipeline
+
+# download an image
+canny_image = load_image(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_reference_input_cat.jpg"
+)
+
+ref_image = load_image(
+    "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
+)
+
+# initialize the models and pipeline
+controlnet_conditioning_scale = 0.5  # recommended for good generalization
+controlnet = ControlNetModel.from_pretrained(
+    "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
+)
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+pipe = StableDiffusionXLControlNetReferencePipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, torch_dtype=torch.float16
+).to("cuda:0")
+
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+# get canny image
+image = np.array(canny_image)
+image = cv2.Canny(image, 100, 200)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+
+# generate image
+image = pipe(
+    prompt="a cat",
+    num_inference_steps=20,
+    controlnet_conditioning_scale=controlnet_conditioning_scale,
+    image=canny_image,
+    ref_image=ref_image,
+    reference_attn=False,
+    reference_adain=True,
+    style_fidelity=1.0,
+    generator=torch.Generator("cuda").manual_seed(42)
+).images[0]
+```
+
+Canny ControlNet Image
+
+![canny_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_reference_input_cat.jpg)
+
+Reference Image
+
+![ref_image](https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png)
+
+Output Image
+
+`prompt: a cat`
+
+`reference_attn=True, reference_adain=True, num_inference_steps=20, style_fidelity=1.0`
+
+![Output_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_reference_attn_adain_canny_cat.png)
+
+`reference_attn=False, reference_adain=True, num_inference_steps=20, style_fidelity=1.0`
+
+![Output_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_reference_adain_canny_cat.png)
+
+`reference_attn=True, reference_adain=False, num_inference_steps=20, style_fidelity=1.0`
+
+![Output_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_reference_attn_canny_cat.png)
+
 ### Stable diffusion fabric pipeline

 FABRIC approach applicable to a wide range of popular diffusion models, which exploits
@@ -3378,6 +3461,20 @@ best quality, 3persons in garden, a boy blue shirt BREAK
 best quality, 3persons in garden, an old man red suit
 ```

+### Use base prompt
+
+You can use a base prompt that is applied to all regions. Set the base prompt by adding `ADDBASE` at the end of it. Base prompts can also be combined with common prompts, but the base prompt must be specified first.
+
+```
+2d animation style ADDBASE
+masterpiece, high quality ADDCOMM
+(blue sky)++ BREAK
+green hair twintail BREAK
+book shelf BREAK
+messy desk BREAK
+orange++ dress and sofa
+```
+
 ### Negative prompt

 Negative prompts are equally effective across all regions, but it is possible to set region-specific prompts for negative prompts as well. The number of BREAKs must be the same as the number of prompts. If the number of prompts does not match, the negative prompts will be used without being divided into regions.
@@ -3408,6 +3505,7 @@ pipe(prompt=prompt, rp_args=rp_args)
 ### Optional Parameters

 - `save_mask`: In `Prompt` mode, choose whether to output the generated mask along with the image. The default is `False`.
+- `base_ratio`: Used with `ADDBASE`. Sets the ratio of the base prompt; if `base_ratio` is set to 0.2, the resulting image is composed of `20%*BASE_PROMPT + 80%*REGION_PROMPT`.

 The Pipeline supports `compel` syntax. Input prompts using the `compel` structure will be automatically applied and processed.

@@ -4696,4 +4794,4 @@ with torch.no_grad():
 ```

 In the folder examples/pixart there is also a script that can be used to train new models.
-Please check the script `train_controlnet_hf_diffusers.sh` on how to start the training.
+Please check the script `train_controlnet_hf_diffusers.sh` on how to start the training.
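To tie the README additions together, here is a minimal usage sketch for the new `ADDBASE` keyword and the `base_ratio` option. It is a sketch under stated assumptions: the base checkpoint id is a placeholder, the region-division settings are only hinted at in a comment, and only `prompt`, `rp_args`, `base_ratio`, and `save_mask` come from the documentation above.

```python
# Minimal sketch (assumptions noted): load the community regional prompting pipeline
# and pass a base prompt plus `base_ratio` through `rp_args`, as documented in the new
# "Use base prompt" and "Optional Parameters" entries above.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",           # placeholder base checkpoint
    custom_pipeline="regional_prompting_stable_diffusion",   # community pipeline in this folder
    torch_dtype=torch.float16,
).to("cuda")

prompt = """
2d animation style ADDBASE
masterpiece, high quality ADDCOMM
blue sky BREAK
green hair twintail BREAK
book shelf BREAK
messy desk BREAK
orange dress and sofa
"""

rp_args = {
    # "mode": "rows", "div": "1;1;1",  # assumed region-division keys from earlier README sections
    "base_ratio": "0.2",  # 20% * BASE_PROMPT + 80% * REGION_PROMPT
    "save_mask": False,
}

image = pipe(prompt=prompt, negative_prompt="lowres, bad anatomy", rp_args=rp_args).images[0]
image.save("regional_base_prompt.png")
```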

examples/community/regional_prompting_stable_diffusion.py

Lines changed: 63 additions & 16 deletions
@@ -3,20 +3,20 @@

 import torch
 import torchvision.transforms.functional as FF
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from diffusers import StableDiffusionPipeline
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import KarrasDiffusionSchedulers
-from diffusers.utils import USE_PEFT_BACKEND


 try:
     from compel import Compel
 except ImportError:
     Compel = None

+KBASE = "ADDBASE"
 KCOMM = "ADDCOMM"
 KBRK = "BREAK"

@@ -34,6 +34,11 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):

     Optional
     rp_args["save_mask"]: True/False (save masks in prompt mode)
+    rp_args["power"]: int (power for attention maps in prompt mode)
+    rp_args["base_ratio"]:
+        float (Sets the ratio of the base prompt)
+        ex) 0.2 (20%*BASE_PROMPT + 80%*REGION_PROMPT)
+        [Use base prompt](https://github.com/hako-mikan/sd-webui-regional-prompter?tab=readme-ov-file#use-base-prompt)

     Pipeline for text-to-image generation using Stable Diffusion.

@@ -70,6 +75,7 @@
         scheduler: KarrasDiffusionSchedulers,
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
         requires_safety_checker: bool = True,
     ):
         super().__init__(
@@ -80,6 +86,7 @@
             scheduler,
             safety_checker,
             feature_extractor,
+            image_encoder,
             requires_safety_checker,
         )
         self.register_modules(
@@ -90,6 +97,7 @@
             scheduler=scheduler,
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
         )

     @torch.no_grad()
@@ -110,17 +118,40 @@ def __call__(
         rp_args: Dict[str, str] = None,
     ):
         active = KBRK in prompt[0] if isinstance(prompt, list) else KBRK in prompt
+        use_base = KBASE in prompt[0] if isinstance(prompt, list) else KBASE in prompt
         if negative_prompt is None:
             negative_prompt = "" if isinstance(prompt, str) else [""] * len(prompt)

         device = self._execution_device
         regions = 0

+        self.base_ratio = float(rp_args["base_ratio"]) if "base_ratio" in rp_args else 0.0
         self.power = int(rp_args["power"]) if "power" in rp_args else 1

         prompts = prompt if isinstance(prompt, list) else [prompt]
-        n_prompts = negative_prompt if isinstance(prompt, str) else [negative_prompt]
+        n_prompts = negative_prompt if isinstance(prompt, list) else [negative_prompt]
         self.batch = batch = num_images_per_prompt * len(prompts)
+
+        if use_base:
+            bases = prompts.copy()
+            n_bases = n_prompts.copy()
+
+            for i, prompt in enumerate(prompts):
+                parts = prompt.split(KBASE)
+                if len(parts) == 2:
+                    bases[i], prompts[i] = parts
+                elif len(parts) > 2:
+                    raise ValueError(f"Multiple instances of {KBASE} found in prompt: {prompt}")
+            for i, prompt in enumerate(n_prompts):
+                n_parts = prompt.split(KBASE)
+                if len(n_parts) == 2:
+                    n_bases[i], n_prompts[i] = n_parts
+                elif len(n_parts) > 2:
+                    raise ValueError(f"Multiple instances of {KBASE} found in negative prompt: {prompt}")
+
+            all_bases_cn, _ = promptsmaker(bases, num_images_per_prompt)
+            all_n_bases_cn, _ = promptsmaker(n_bases, num_images_per_prompt)
+
         all_prompts_cn, all_prompts_p = promptsmaker(prompts, num_images_per_prompt)
         all_n_prompts_cn, _ = promptsmaker(n_prompts, num_images_per_prompt)

@@ -137,8 +168,16 @@ def getcompelembs(prps):

             conds = getcompelembs(all_prompts_cn)
             unconds = getcompelembs(all_n_prompts_cn)
-            embs = getcompelembs(prompts)
-            n_embs = getcompelembs(n_prompts)
+            base_embs = getcompelembs(all_bases_cn) if use_base else None
+            base_n_embs = getcompelembs(all_n_bases_cn) if use_base else None
+            # When using base, it seems more reasonable to use base prompts as prompt_embeddings rather than regional prompts
+            embs = getcompelembs(prompts) if not use_base else base_embs
+            n_embs = getcompelembs(n_prompts) if not use_base else base_n_embs
+
+            if use_base and self.base_ratio > 0:
+                conds = self.base_ratio * base_embs + (1 - self.base_ratio) * conds
+                unconds = self.base_ratio * base_n_embs + (1 - self.base_ratio) * unconds
+
             prompt = negative_prompt = None
         else:
             conds = self.encode_prompt(prompts, device, 1, True)[0]
@@ -147,6 +186,18 @@ def getcompelembs(prps):
                 if equal
                 else self.encode_prompt(all_n_prompts_cn, device, 1, True)[0]
             )
+
+            if use_base and self.base_ratio > 0:
+                base_embs = self.encode_prompt(bases, device, 1, True)[0]
+                base_n_embs = (
+                    self.encode_prompt(n_bases, device, 1, True)[0]
+                    if equal
+                    else self.encode_prompt(all_n_bases_cn, device, 1, True)[0]
+                )
+
+                conds = self.base_ratio * base_embs + (1 - self.base_ratio) * conds
+                unconds = self.base_ratio * base_n_embs + (1 - self.base_ratio) * unconds
+
             embs = n_embs = None

         if not active:
@@ -225,8 +276,6 @@ def forward(

            residual = hidden_states

-           args = () if USE_PEFT_BACKEND else (scale,)
-
            if attn.spatial_norm is not None:
                hidden_states = attn.spatial_norm(hidden_states, temb)

@@ -247,16 +296,15 @@ def forward(
            if attn.group_norm is not None:
                hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

-           args = () if USE_PEFT_BACKEND else (scale,)
-           query = attn.to_q(hidden_states, *args)
+           query = attn.to_q(hidden_states)

            if encoder_hidden_states is None:
                encoder_hidden_states = hidden_states
            elif attn.norm_cross:
                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

-           key = attn.to_k(encoder_hidden_states, *args)
-           value = attn.to_v(encoder_hidden_states, *args)
+           key = attn.to_k(encoder_hidden_states)
+           value = attn.to_v(encoder_hidden_states)

            inner_dim = key.shape[-1]
            head_dim = inner_dim // attn.heads
@@ -283,7 +331,7 @@ def forward(
            hidden_states = hidden_states.to(query.dtype)

            # linear proj
-           hidden_states = attn.to_out[0](hidden_states, *args)
+           hidden_states = attn.to_out[0](hidden_states)
            # dropout
            hidden_states = attn.to_out[1](hidden_states)

@@ -410,9 +458,9 @@ def promptsmaker(prompts, batch):
        add = ""
        if KCOMM in prompt:
            add, prompt = prompt.split(KCOMM)
-           add = add + " "
-       prompts = prompt.split(KBRK)
-       out_p.append([add + p for p in prompts])
+           add = add.strip() + " "
+       prompts = [p.strip() for p in prompt.split(KBRK)]
+       out_p.append([add + p for i, p in enumerate(prompts)])
    out = [None] * batch * len(out_p[0]) * len(out_p)
    for p, prs in enumerate(out_p):  # inputs prompts
        for r, pr in enumerate(prs):  # prompts for regions
@@ -449,7 +497,6 @@ def startend(cells, array):
        add = []
        startend(add, inratios[1:])
        icells.append(add)
-
    return ocells, icells, sum(len(cell) for cell in icells)

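As a quick illustration of what the new code in this file does (a standalone sketch, not part of the commit), the `ADDBASE` keyword splits each prompt into a base part and the regional parts, and when `base_ratio > 0` the base embedding is linearly blended into the regional embeddings. The helper function and the tensor shapes below are illustrative only.

```python
# Standalone sketch of the ADDBASE handling added above (hypothetical helper, illustrative shapes).
import torch

KBASE, KBRK = "ADDBASE", "BREAK"

def split_base(prompt: str):
    # Mirrors the new parsing logic: at most one ADDBASE, splitting base from regional prompts.
    parts = prompt.split(KBASE)
    if len(parts) > 2:
        raise ValueError(f"Multiple instances of {KBASE} found in prompt: {prompt}")
    return (parts[0], parts[1]) if len(parts) == 2 else ("", prompt)

base, regions = split_base("2d animation style ADDBASE blue sky BREAK green hair twintail")
print(base.strip())                               # "2d animation style"
print([p.strip() for p in regions.split(KBRK)])   # ["blue sky", "green hair twintail"]

# Embedding blend applied when base_ratio > 0 (stand-in tensors for encoded prompts):
base_ratio = 0.2
base_embs = torch.randn(2, 77, 768)   # encoded base prompts
conds = torch.randn(2, 77, 768)       # encoded regional prompts
conds = base_ratio * base_embs + (1 - base_ratio) * conds
```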
