
Commit 65b3719

Merge branch 'huggingface:main' into cogview4_control
2 parents: 52d4ebf + 3fab662

28 files changed: +2098 −385 lines

.github/workflows/pr_style_bot.yml

Lines changed: 6 additions & 6 deletions
@@ -53,9 +53,9 @@ jobs:
          HEADREF: ${{ steps.pr_info.outputs.headRef }}
          PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
        run: |
-         echo "PR number: ${{ env.PRNUMBER }}"
-         echo "Head Ref: ${{ env.HEADREF }}"
-         echo "Head Repo Full Name: ${{ env.HEADREPOFULLNAME }}"
+         echo "PR number: $PRNUMBER"
+         echo "Head Ref: $HEADREF"
+         echo "Head Repo Full Name: $HEADREPOFULLNAME"

      - name: Set up Python
        uses: actions/setup-python@v4
@@ -89,20 +89,20 @@ jobs:
          PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
-         echo "HEADREPOFULLNAME: ${{ env.HEADREPOFULLNAME }}, HEADREF: ${{ env.HEADREF }}"
+         echo "HEADREPOFULLNAME: $HEADREPOFULLNAME, HEADREF: $HEADREF"
          # Configure git with the Actions bot user
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"

          # Make sure your 'origin' remote is set to the contributor's fork
-         git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@github.com/${{ env.HEADREPOFULLNAME }}.git"
+         git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@github.com/$HEADREPOFULLNAME.git"

          # If there are changes after running style/quality, commit them
          if [ -n "$(git status --porcelain)" ]; then
            git add .
            git commit -m "Apply style fixes"
            # Push to the original contributor's forked branch
-           git push origin HEAD:${{ env.HEADREF }}
+           git push origin HEAD:$HEADREF
            echo "changes_pushed=true" >> $GITHUB_OUTPUT
          else
            echo "No changes to commit."
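Both hunks make the same hardening change: values that originate from the pull request (PR number, branch name, fork repo name) are read as shell environment variables ($PRNUMBER, $HEADREF, $HEADREPOFULLNAME) instead of being expanded inline with ${{ env.… }}. Inline expansion splices untrusted PR data directly into the script text, a known script-injection vector in GitHub Actions; referencing the env block keeps those values as data rather than code.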

docs/source/en/api/pipelines/marigold.md

Lines changed: 89 additions & 34 deletions
Large diffs are not rendered by default.

docs/source/en/api/pipelines/overview.md

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Latte](latte) | text2image |
 | [LEDITS++](ledits_pp) | image editing |
 | [Lumina-T2X](lumina) | text2image |
-| [Marigold](marigold) | depth |
+| [Marigold](marigold) | depth-estimation, normals-estimation, intrinsic-decomposition |
 | [MultiDiffusion](panorama) | text2image |
 | [MusicLDM](musicldm) | text2audio |
 | [PAG](pag) | text2image |

docs/source/en/using-diffusers/marigold_usage.md

Lines changed: 312 additions & 173 deletions
Large diffs are not rendered by default.

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -346,6 +346,7 @@
     "Lumina2Text2ImgPipeline",
     "LuminaText2ImgPipeline",
     "MarigoldDepthPipeline",
+    "MarigoldIntrinsicsPipeline",
     "MarigoldNormalsPipeline",
     "MochiPipeline",
     "MusicLDMPipeline",
@@ -847,6 +848,7 @@
         Lumina2Text2ImgPipeline,
         LuminaText2ImgPipeline,
         MarigoldDepthPipeline,
+        MarigoldIntrinsicsPipeline,
         MarigoldNormalsPipeline,
         MochiPipeline,
         MusicLDMPipeline,
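With the export in place, the new pipeline is importable from the package root. A minimal usage sketch follows; the checkpoint name and the prediction output field are assumptions patterned after the existing Marigold depth and normals pipelines, not something this diff confirms (the documented invocation lives in the updated marigold_usage.md):

import torch
from diffusers import MarigoldIntrinsicsPipeline
from diffusers.utils import load_image

# Hypothetical checkpoint name, patterned after the other Marigold checkpoints.
pipe = MarigoldIntrinsicsPipeline.from_pretrained(
    "prs-eth/marigold-iid-appearance-v1-1", torch_dtype=torch.float16
).to("cuda")

image = load_image("https://example.com/photo.jpg")  # placeholder input URL
output = pipe(image)
prediction = output.prediction  # assumed to mirror MarigoldDepthPipeline's output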

src/diffusers/loaders/ip_adapter.py

Lines changed: 29 additions & 20 deletions
@@ -23,7 +23,9 @@
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_state_dict
 from ..utils import (
     USE_PEFT_BACKEND,
+    _get_detailed_type,
     _get_model_file,
+    _is_valid_type,
     is_accelerate_available,
     is_torch_version,
     is_transformers_available,
@@ -577,29 +579,36 @@ def LinearStrengthModel(start, finish, size):
         pipeline.set_ip_adapter_scale(ip_strengths)
         ```
         """
-        transformer = self.transformer
-        if not isinstance(scale, list):
-            scale = [[scale] * transformer.config.num_layers]
-        elif isinstance(scale, list) and isinstance(scale[0], int) or isinstance(scale[0], float):
-            if len(scale) != transformer.config.num_layers:
-                raise ValueError(f"Expected list of {transformer.config.num_layers} scales, got {len(scale)}.")
+
+        scale_type = Union[int, float]
+        num_ip_adapters = self.transformer.encoder_hid_proj.num_ip_adapters
+        num_layers = self.transformer.config.num_layers
+
+        # Single value for all layers of all IP-Adapters
+        if isinstance(scale, scale_type):
+            scale = [scale for _ in range(num_ip_adapters)]
+        # List of per-layer scales for a single IP-Adapter
+        elif _is_valid_type(scale, List[scale_type]) and num_ip_adapters == 1:
             scale = [scale]
+        # Invalid scale type
+        elif not _is_valid_type(scale, List[Union[scale_type, List[scale_type]]]):
+            raise TypeError(f"Unexpected type {_get_detailed_type(scale)} for scale.")

-        scale_configs = scale
+        if len(scale) != num_ip_adapters:
+            raise ValueError(f"Cannot assign {len(scale)} scales to {num_ip_adapters} IP-Adapters.")

-        key_id = 0
-        for attn_name, attn_processor in transformer.attn_processors.items():
-            if isinstance(attn_processor, (FluxIPAdapterJointAttnProcessor2_0)):
-                if len(scale_configs) != len(attn_processor.scale):
-                    raise ValueError(
-                        f"Cannot assign {len(scale_configs)} scale_configs to "
-                        f"{len(attn_processor.scale)} IP-Adapter."
-                    )
-                elif len(scale_configs) == 1:
-                    scale_configs = scale_configs * len(attn_processor.scale)
-                for i, scale_config in enumerate(scale_configs):
-                    attn_processor.scale[i] = scale_config[key_id]
-                key_id += 1
+        if any(len(s) != num_layers for s in scale if isinstance(s, list)):
+            invalid_scale_sizes = {len(s) for s in scale if isinstance(s, list)} - {num_layers}
+            raise ValueError(
+                f"Expected list of {num_layers} scales, got {', '.join(str(x) for x in invalid_scale_sizes)}."
+            )
+
+        # Scalars are transformed to lists with length num_layers
+        scale_configs = [[s] * num_layers if isinstance(s, scale_type) else s for s in scale]
+
+        # Set scales. zip over scale_configs prevents going into single transformer layers
+        for attn_processor, *scale in zip(self.transformer.attn_processors.values(), *scale_configs):
+            attn_processor.scale = scale

     def unload_ip_adapter(self):
         """

src/diffusers/models/attention_processor.py

Lines changed: 8 additions & 7 deletions
@@ -2780,9 +2780,8 @@ def __call__(
 
             # IP-adapter
             ip_query = hidden_states_query_proj
-            ip_attn_output = None
-            # for ip-adapter
-            # TODO: support for multiple adapters
+            ip_attn_output = torch.zeros_like(hidden_states)
+
             for current_ip_hidden_states, scale, to_k_ip, to_v_ip in zip(
                 ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip
             ):
@@ -2793,12 +2792,14 @@ def __call__(
                 ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
                 # the output of sdp = (batch, num_heads, seq_len, head_dim)
                 # TODO: add support for attn.scale when we move to Torch 2.1
-                ip_attn_output = F.scaled_dot_product_attention(
+                current_ip_hidden_states = F.scaled_dot_product_attention(
                     ip_query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
                 )
-                ip_attn_output = ip_attn_output.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
-                ip_attn_output = scale * ip_attn_output
-                ip_attn_output = ip_attn_output.to(ip_query.dtype)
+                current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape(
+                    batch_size, -1, attn.heads * head_dim
+                )
+                current_ip_hidden_states = current_ip_hidden_states.to(ip_query.dtype)
+                ip_attn_output += scale * current_ip_hidden_states

             return hidden_states, encoder_hidden_states, ip_attn_output
         else:
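This change closes the deleted "TODO: support for multiple adapters" comment: the old loop reassigned ip_attn_output on every iteration, so only the last IP-Adapter's output survived. The new code starts from a zero tensor and accumulates each adapter's scaled contribution. The pattern in isolation, with toy tensors standing in for the real attention outputs:

import torch

hidden_states = torch.randn(2, 16, 64)  # (batch, seq_len, dim)
adapter_outputs = [torch.randn_like(hidden_states) for _ in range(3)]
scales = [1.0, 0.5, 0.25]

ip_attn_output = torch.zeros_like(hidden_states)  # previously initialized to None
for scale, out in zip(scales, adapter_outputs):
    ip_attn_output += scale * out  # accumulate; the old code overwrote this each pass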

src/diffusers/models/embeddings.py

Lines changed: 5 additions & 0 deletions
@@ -2583,6 +2583,11 @@ def __init__(self, IPAdapterImageProjectionLayers: Union[List[nn.Module], Tuple[
         super().__init__()
         self.image_projection_layers = nn.ModuleList(IPAdapterImageProjectionLayers)

+    @property
+    def num_ip_adapters(self) -> int:
+        """Number of IP-Adapters loaded."""
+        return len(self.image_projection_layers)
+
     def forward(self, image_embeds: List[torch.Tensor]):
         projected_image_embeds = []
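This property backs the validation added to set_ip_adapter_scale above, which reads the adapter count through transformer.encoder_hid_proj.num_ip_adapters instead of measuring the projection-layer module list directly.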

src/diffusers/pipelines/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -261,6 +261,7 @@
     _import_structure["marigold"].extend(
         [
             "MarigoldDepthPipeline",
+            "MarigoldIntrinsicsPipeline",
             "MarigoldNormalsPipeline",
         ]
     )
@@ -603,6 +604,7 @@
         from .lumina2 import Lumina2Text2ImgPipeline
         from .marigold import (
             MarigoldDepthPipeline,
+            MarigoldIntrinsicsPipeline,
             MarigoldNormalsPipeline,
         )
         from .mochi import MochiPipeline

src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py

Lines changed: 5 additions & 3 deletions
@@ -237,6 +237,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
         "add_neg_time_ids",
         "mask",
         "masked_image_latents",
+        "control_image",
     ]

     def __init__(
@@ -743,15 +744,15 @@ def check_inputs(
         if padding_mask_crop is not None:
             if not isinstance(image, PIL.Image.Image):
                 raise ValueError(
-                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
+                    f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
                 )
             if not isinstance(mask_image, PIL.Image.Image):
                 raise ValueError(
                     f"The mask image should be a PIL image when inpainting mask crop, but is of type"
                     f" {type(mask_image)}."
                 )
             if output_type != "pil":
-                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")

         if prompt_embeds is not None and pooled_prompt_embeds is None:
             raise ValueError(
@@ -1644,7 +1645,7 @@ def denoising_value_valid(dnv):
                     f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                     f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                     f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
-                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                    f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
                     " `pipeline.unet` or your `mask_image` or `image` input."
                 )
             elif num_channels_unet != 4:
@@ -1835,6 +1836,7 @@ def denoising_value_valid(dnv):
                     latents = callback_outputs.pop("latents", latents)
                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                     negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    control_image = callback_outputs.pop("control_image", control_image)

                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
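Registering "control_image" as a callback tensor input and popping it from callback_outputs lets a step-end callback inspect or replace the control image between denoising steps. A hedged sketch, assuming an already-constructed pipe of this class plus init_image, mask_image, and control inputs (all placeholder names):

def on_step_end(pipeline, step, timestep, callback_kwargs):
    # The control image tensor is now exposed here and can be modified per step.
    control_image = callback_kwargs["control_image"]
    return {"control_image": control_image}

result = pipe(
    prompt="a photo of a living room",  # placeholder prompt
    image=init_image,
    mask_image=mask_image,
    control_image=control,
    callback_on_step_end=on_step_end,
    callback_on_step_end_tensor_inputs=["control_image"],
)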
