Commit b3a70b5

Merge branch 'main' into animemory
2 parents: 8f51743 + 63b631f

21 files changed, 1500 insertions(+), 14 deletions(-)

docs/source/en/api/pipelines/pag.md

Lines changed: 4 additions & 0 deletions
```diff
@@ -96,6 +96,10 @@ Since RegEx is supported as a way for matching layer identifiers, it is crucial
   - all
   - __call__
 
+## StableDiffusion3PAGImg2ImgPipeline
+[[autodoc]] StableDiffusion3PAGImg2ImgPipeline
+  - all
+  - __call__
 
 ## PixArtSigmaPAGPipeline
 [[autodoc]] PixArtSigmaPAGPipeline
```
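
With the docs entry in place, the new pipeline is used like the other PAG variants. A minimal sketch, assuming `StableDiffusion3PAGImg2ImgPipeline` is registered with `AutoPipelineForImage2Image` the way the existing PAG pipelines are; the checkpoint id, input image, and guidance values are illustrative:

```python
import torch
from diffusers import AutoPipelineForImage2Image
from diffusers.utils import load_image

# enable_pag=True selects the PAG variant of the pipeline, here assumed to
# resolve to StableDiffusion3PAGImg2ImgPipeline.
pipe = AutoPipelineForImage2Image.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    enable_pag=True,
    torch_dtype=torch.float16,
).to("cuda")

init_image = load_image("path/to/init.png")  # placeholder input image
image = pipe(
    prompt="a fantasy landscape, detailed, 8k",
    image=init_image,
    strength=0.6,
    pag_scale=3.0,  # strength of the perturbed-attention guidance term
).images[0]
image.save("out.png")
```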

docs/source/en/tutorials/basic_training.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -75,7 +75,7 @@ For convenience, create a `TrainingConfig` class containing the training hyperpa
 
 ...     push_to_hub = True  # whether to upload the saved model to the HF Hub
 ...     hub_model_id = "<your-username>/<my-awesome-model>"  # the name of the repository to create on the HF Hub
-...     hub_private_repo = False
+...     hub_private_repo = None
 ...     overwrite_output_dir = True  # overwrite the old model when re-running the notebook
 ...     seed = 0
 
```

docs/source/ko/tutorials/basic_training.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -76,7 +76,7 @@ huggingface-cli login
 ...     output_dir = "ddpm-butterflies-128"  # the model name to save locally and on the HF Hub
 
 ...     push_to_hub = True  # whether to upload the saved model to the HF Hub
-...     hub_private_repo = False
+...     hub_private_repo = None
 ...     overwrite_output_dir = True  # whether to overwrite the previous model when re-running the notebook
 ...     seed = 0
 
```

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -341,6 +341,7 @@
         "StableDiffusion3Img2ImgPipeline",
         "StableDiffusion3InpaintPipeline",
         "StableDiffusion3PAGPipeline",
+        "StableDiffusion3PAGImg2ImgPipeline",
         "StableDiffusion3Pipeline",
         "StableDiffusionAdapterPipeline",
         "StableDiffusionAttendAndExcitePipeline",
@@ -810,6 +811,7 @@
         StableDiffusion3ControlNetPipeline,
         StableDiffusion3Img2ImgPipeline,
         StableDiffusion3InpaintPipeline,
+        StableDiffusion3PAGImg2ImgPipeline,
         StableDiffusion3PAGPipeline,
         StableDiffusion3Pipeline,
         StableDiffusionAdapterPipeline,
```
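
With the export in place, the pipeline can also be imported and constructed directly rather than through the auto class. A sketch under the assumption that the pipeline follows the standard SD3 checkpoint layout and accepts the usual PAG `pag_applied_layers` argument; the checkpoint id and layer selection are illustrative:

```python
import torch
from diffusers import StableDiffusion3PAGImg2ImgPipeline

pipe = StableDiffusion3PAGImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",  # illustrative checkpoint
    pag_applied_layers=["blocks.13"],  # transformer blocks to perturb (assumed value)
    torch_dtype=torch.float16,
)
```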

src/diffusers/configuration_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -170,7 +170,7 @@ def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool
 
         if push_to_hub:
             commit_message = kwargs.pop("commit_message", None)
-            private = kwargs.pop("private", False)
+            private = kwargs.pop("private", None)
             create_pr = kwargs.pop("create_pr", False)
             token = kwargs.pop("token", None)
             repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
```
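
The same default change (`private=False` to `private=None`) appears below in `modeling_flax_utils.py` and `modeling_utils.py`, and mirrors the `hub_private_repo = None` edits in the tutorials above. The motivation is that recent `huggingface_hub` releases accept `private=None` in `create_repo` and interpret it as "use the account's default visibility" rather than forcing a public repo. A minimal sketch of the distinction, assuming a recent `huggingface_hub`:

```python
from huggingface_hub import create_repo

# private=True  -> always create a private repo
# private=False -> always create a public repo
# private=None  -> defer to the user's or organization's default visibility setting
repo_url = create_repo("my-username/my-model", private=None, exist_ok=True)
```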

src/diffusers/models/attention_processor.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -1171,6 +1171,7 @@ def __call__(
         attn: Attention,
         hidden_states: torch.FloatTensor,
         encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
     ) -> torch.FloatTensor:
         residual = hidden_states
 
```
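
The new parameter brings this processor's signature in line with the other processors in the file, which already accept a mask. A minimal sketch of how the argument is typically consumed, following the pattern of the existing SDPA-based processors (`attn.heads` and `attn.prepare_attention_mask` are the existing `Attention` helpers; the wrapper function itself is illustrative):

```python
from typing import Optional

import torch
import torch.nn.functional as F


def apply_masked_sdpa(
    attn,  # diffusers.models.attention_processor.Attention
    query: torch.Tensor,  # (batch, heads, q_len, head_dim)
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    batch_size: int,
    sequence_length: int,
) -> torch.Tensor:
    if attention_mask is not None:
        # Expand the mask to (batch, heads, q_len, kv_len), the layout
        # scaled_dot_product_attention expects for an additive mask.
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
        attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
    return F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask)
```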

src/diffusers/models/autoencoders/autoencoder_kl.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -17,6 +17,7 @@
 import torch.nn as nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import PeftAdapterMixin
 from ...loaders.single_file_model import FromOriginalModelMixin
 from ...utils import deprecate
 from ...utils.accelerate_utils import apply_forward_hook
@@ -34,7 +35,7 @@
 from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
 
 
-class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin):
     r"""
     A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
```
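
Adding `PeftAdapterMixin` gives the VAE the standard adapter surface (`add_adapter`, `set_adapter`, `disable_adapters`, ...). A minimal sketch of attaching a LoRA to the autoencoder, assuming `peft` is installed; the rank and `target_modules` are illustrative choices, not values from this commit:

```python
from diffusers import AutoencoderKL
from peft import LoraConfig

vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")

# Illustrative LoRA over the attention projections in the VAE mid-block.
lora_config = LoraConfig(
    r=4,
    lora_alpha=4,
    init_lora_weights="gaussian",
    target_modules=["to_q", "to_k", "to_v", "to_out.0"],
)
vae.add_adapter(lora_config)  # provided by PeftAdapterMixin
```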

src/diffusers/models/embeddings.py

Lines changed: 105 additions & 5 deletions
```diff
@@ -86,12 +86,25 @@ def get_3d_sincos_pos_embed(
     temporal_interpolation_scale: float = 1.0,
 ) -> np.ndarray:
     r"""
+    Creates 3D sinusoidal positional embeddings.
+
     Args:
         embed_dim (`int`):
+            The embedding dimension of inputs. It must be divisible by 16.
         spatial_size (`int` or `Tuple[int, int]`):
+            The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both
+            spatial dimensions (height and width).
         temporal_size (`int`):
+            The temporal dimension of positional embeddings (number of frames).
         spatial_interpolation_scale (`float`, defaults to 1.0):
+            Scale factor for spatial grid interpolation.
         temporal_interpolation_scale (`float`, defaults to 1.0):
+            Scale factor for temporal grid interpolation.
+
+    Returns:
+        `np.ndarray`:
+            The 3D sinusoidal positional embeddings of shape `[temporal_size, spatial_size[0] * spatial_size[1],
+            embed_dim]`.
     """
     if embed_dim % 4 != 0:
         raise ValueError("`embed_dim` must be divisible by 4")
```

```diff
@@ -129,8 +142,24 @@ def get_2d_sincos_pos_embed(
     embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16
 ):
     """
-    grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
-    [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    Creates 2D sinusoidal positional embeddings.
+
+    Args:
+        embed_dim (`int`):
+            The embedding dimension.
+        grid_size (`int`):
+            The size of the grid height and width.
+        cls_token (`bool`, defaults to `False`):
+            Whether or not to add a classification token.
+        extra_tokens (`int`, defaults to `0`):
+            The number of extra tokens to add.
+        interpolation_scale (`float`, defaults to `1.0`):
+            The scale of the interpolation.
+
+    Returns:
+        pos_embed (`np.ndarray`):
+            Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size,
+            embed_dim]` if using cls_token
+    """
     if isinstance(grid_size, int):
         grid_size = (grid_size, grid_size)
```

```diff
@@ -148,6 +177,16 @@ def get_2d_sincos_pos_embed(
 
 
 def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    r"""
+    This function generates 2D sinusoidal positional embeddings from a grid.
+
+    Args:
+        embed_dim (`int`): The embedding dimension.
+        grid (`np.ndarray`): Grid of positions with shape `(H * W,)`.
+
+    Returns:
+        `np.ndarray`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)`.
+    """
     if embed_dim % 2 != 0:
         raise ValueError("embed_dim must be divisible by 2")
 
```

```diff
@@ -161,7 +200,14 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
 
 def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
     """
-    embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
+    This function generates 1D positional embeddings from a grid.
+
+    Args:
+        embed_dim (`int`): The embedding dimension `D`.
+        pos (`numpy.ndarray`): 1D tensor of positions with shape `(M,)`.
+
+    Returns:
+        `numpy.ndarray`: Sinusoidal positional embeddings of shape `(M, D)`.
     """
     if embed_dim % 2 != 0:
         raise ValueError("embed_dim must be divisible by 2")
```

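The `(M, D)` contract documented here is easy to verify with a standalone reimplementation. A sketch of the standard sin/cos recipe (frequency bands 1/10000^(2i/D), first half of the channels sine, second half cosine), assuming NumPy only; it mirrors the common ViT/MAE construction rather than quoting this file's exact code:

```python
import numpy as np


def sincos_1d(embed_dim: int, pos: np.ndarray) -> np.ndarray:
    """Reference 1D sinusoidal embedding: positions (M,) -> embeddings (M, D)."""
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64) / (embed_dim / 2)
    omega = 1.0 / 10000**omega                      # (D/2,) frequency bands
    angles = np.outer(pos.reshape(-1), omega)       # (M, D/2)
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)


print(sincos_1d(64, np.arange(16)).shape)  # (16, 64)
```
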
```diff
@@ -181,7 +227,22 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
 
 
 class PatchEmbed(nn.Module):
-    """2D Image to Patch Embedding with support for SD3 cropping."""
+    """
+    2D Image to Patch Embedding with support for SD3 cropping.
+
+    Args:
+        height (`int`, defaults to `224`): The height of the image.
+        width (`int`, defaults to `224`): The width of the image.
+        patch_size (`int`, defaults to `16`): The size of the patches.
+        in_channels (`int`, defaults to `3`): The number of input channels.
+        embed_dim (`int`, defaults to `768`): The output dimension of the embedding.
+        layer_norm (`bool`, defaults to `False`): Whether or not to use layer normalization.
+        flatten (`bool`, defaults to `True`): Whether or not to flatten the output.
+        bias (`bool`, defaults to `True`): Whether or not to use bias.
+        interpolation_scale (`float`, defaults to `1`): The scale of the interpolation.
+        pos_embed_type (`str`, defaults to `"sincos"`): The type of positional embedding.
+        pos_embed_max_size (`int`, defaults to `None`): The maximum size of the positional embedding.
+    """
 
     def __init__(
         self,
```

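The newly documented defaults can be exercised directly. A small usage sketch, assuming the module is importable from `diffusers.models.embeddings`; the latent size is illustrative:

```python
import torch
from diffusers.models.embeddings import PatchEmbed

# A 32x32, 4-channel latent split into 2x2 patches -> (32/2)**2 = 256 tokens.
patchify = PatchEmbed(height=32, width=32, patch_size=2, in_channels=4, embed_dim=768)
tokens = patchify(torch.randn(1, 4, 32, 32))
print(tokens.shape)  # torch.Size([1, 256, 768])
```
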
```diff
@@ -289,7 +350,15 @@ def forward(self, latent):
 
 
 class LuminaPatchEmbed(nn.Module):
-    """2D Image to Patch Embedding with support for Lumina-T2X"""
+    """
+    2D Image to Patch Embedding with support for Lumina-T2X
+
+    Args:
+        patch_size (`int`, defaults to `2`): The size of the patches.
+        in_channels (`int`, defaults to `4`): The number of input channels.
+        embed_dim (`int`, defaults to `768`): The output dimension of the embedding.
+        bias (`bool`, defaults to `True`): Whether or not to use bias.
+    """
 
     def __init__(self, patch_size=2, in_channels=4, embed_dim=768, bias=True):
         super().__init__()
```

```diff
@@ -675,6 +744,20 @@ def get_2d_rotary_pos_embed(embed_dim, crops_coords, grid_size, use_real=True):
 
 
 def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False):
+    """
+    Get 2D RoPE from grid.
+
+    Args:
+        embed_dim (`int`):
+            The embedding dimension size, corresponding to hidden_size_head.
+        grid (`np.ndarray`):
+            The grid of the positional embedding.
+        use_real (`bool`):
+            If True, return real part and imaginary part separately. Otherwise, return complex numbers.
+
+    Returns:
+        `torch.Tensor`: positional embedding with shape `(grid_size * grid_size, embed_dim/2)`.
+    """
     assert embed_dim % 4 == 0
 
     # use half of dimensions to encode grid_h
```

```diff
@@ -695,6 +778,24 @@ def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False):
 
 
 def get_2d_rotary_pos_embed_lumina(embed_dim, len_h, len_w, linear_factor=1.0, ntk_factor=1.0):
+    """
+    Get 2D RoPE from grid for Lumina-T2X.
+
+    Args:
+        embed_dim (`int`):
+            The embedding dimension size, corresponding to hidden_size_head.
+        len_h (`int`):
+            The height of the positional embedding grid.
+        len_w (`int`):
+            The width of the positional embedding grid.
+        linear_factor (`float`):
+            The linear scaling factor applied to the positional embedding frequencies.
+        ntk_factor (`float`):
+            The NTK scaling factor applied to the positional embedding frequencies.
+
+    Returns:
+        `torch.Tensor`: positional embedding with shape `(len_h * len_w, embed_dim/2)`.
+    """
     assert embed_dim % 4 == 0
 
     emb_h = get_1d_rotary_pos_embed(
```

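Both rotary helpers here follow the same axial scheme: half of the head dimension encodes the row index and half encodes the column index with 1D RoPE, which is why `embed_dim` must be divisible by 4 and the complex output has `embed_dim/2` channels. A toy sketch of that split, assuming plain `torch` rather than this file's exact helpers:

```python
import torch


def rope_1d(dim: int, pos: torch.Tensor) -> torch.Tensor:
    """Complex 1D RoPE: e^{i * pos * theta_k} with theta_k = 10000^(-2k/dim)."""
    theta = 1.0 / 10000 ** (torch.arange(0, dim, 2, dtype=torch.float64) / dim)
    angles = torch.outer(pos.to(torch.float64), theta)      # (len, dim/2)
    return torch.polar(torch.ones_like(angles), angles)     # complex, (len, dim/2)


def rope_2d(embed_dim: int, len_h: int, len_w: int) -> torch.Tensor:
    assert embed_dim % 4 == 0
    emb_h = rope_1d(embed_dim // 2, torch.arange(len_h))    # (len_h, embed_dim/4)
    emb_w = rope_1d(embed_dim // 2, torch.arange(len_w))    # (len_w, embed_dim/4)
    # Broadcast each axis over the full grid, then concatenate channel-wise.
    emb_h = emb_h[:, None, :].expand(len_h, len_w, -1)
    emb_w = emb_w[None, :, :].expand(len_h, len_w, -1)
    return torch.cat([emb_h, emb_w], dim=-1).reshape(len_h * len_w, embed_dim // 2)


print(rope_2d(64, 4, 4).shape)  # torch.Size([16, 32])
```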

src/diffusers/models/modeling_flax_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -530,7 +530,7 @@ def save_pretrained(
 
         if push_to_hub:
             commit_message = kwargs.pop("commit_message", None)
-            private = kwargs.pop("private", False)
+            private = kwargs.pop("private", None)
             create_pr = kwargs.pop("create_pr", False)
             token = kwargs.pop("token", None)
             repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
```

src/diffusers/models/modeling_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -338,7 +338,7 @@ def save_pretrained(
 
         if push_to_hub:
             commit_message = kwargs.pop("commit_message", None)
-            private = kwargs.pop("private", False)
+            private = kwargs.pop("private", None)
             create_pr = kwargs.pop("create_pr", False)
             token = kwargs.pop("token", None)
             repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
```
