Skip to content

Commit c4cbe65

Browse files
committed
Fix some documentation in ./src/diffusers/models/embeddings.py as demonstration.
1 parent 99f6082 commit c4cbe65

File tree

1 file changed

+43
-3
lines changed

1 file changed

+43
-3
lines changed

src/diffusers/models/embeddings.py

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,12 +86,21 @@ def get_3d_sincos_pos_embed(
8686
temporal_interpolation_scale: float = 1.0,
8787
) -> np.ndarray:
8888
r"""
89+
Creates 3D sinusoidal positional embeddings. This is the 3D counterpart of `get_2d_sincos_pos_embed()`.
90+
8991
Args:
9092
embed_dim (`int`):
93+
The embedding dimension.
9194
spatial_size (`int` or `Tuple[int, int]`):
95+
The spatial size of the embed.
9296
temporal_size (`int`):
97+
The temporal size of the embed.
9398
spatial_interpolation_scale (`float`, defaults to 1.0):
99+
The spatial interpolation scale of the embed.
94100
temporal_interpolation_scale (`float`, defaults to 1.0):
101+
The temporal interpolation scale of the embed.
102+
Returns:
103+
np.ndarray: The 3D position embedding.
95104
"""
96105
if embed_dim % 4 != 0:
97106
raise ValueError("`embed_dim` must be divisible by 4")
@@ -129,8 +138,24 @@ def get_2d_sincos_pos_embed(
129138
embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16
130139
):
131140
"""
132-
grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
133-
[1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
141+
Creates 2D sinusoidal positional embeddings for a grid of positions.
142+
143+
Args:
144+
embed_dim: int
145+
The embedding dimension.
146+
grid_size: int
147+
The size of the grid height and width.
148+
cls_token: bool
149+
Whether or not to add a classification token.
150+
extra_tokens: int
151+
The number of extra tokens to add.
152+
interpolation_scale: float
153+
The scale of the interpolation.
154+
155+
Returns:
156+
pos_embed: np.ndarray
157+
Shape is [grid_size*grid_size, embed_dim] or
158+
[1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
134159
"""
135160
if isinstance(grid_size, int):
136161
grid_size = (grid_size, grid_size)
@@ -148,6 +173,15 @@ def get_2d_sincos_pos_embed(
148173

149174

150175
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
176+
"""
177+
This function generates 2D positional embeddings from a grid.
178+
179+
Args:
180+
embed_dim (`int`): output dimension for each position
181+
grid (`np.ndarray`): grid of positions
182+
Returns:
183+
`np.ndarray`: tensor in shape (grid_size*grid_size, embed_dim)
184+
"""
151185
if embed_dim % 2 != 0:
152186
raise ValueError("embed_dim must be divisible by 2")
153187

@@ -161,7 +195,13 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
161195

162196
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
163197
"""
164-
embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
198+
This function generates 1D positional embeddings from sin and cos values.
199+
200+
Args:
201+
embed_dim (`int`): output dimension for each position
202+
pos (`numpy.ndarray(dtype=float)`): positions to be encoded, in shape (M,)
203+
Returns:
204+
`numpy.ndarray(dtype=float)`: tensor in shape (M, D)
165205
"""
166206
if embed_dim % 2 != 0:
167207
raise ValueError("embed_dim must be divisible by 2")

0 commit comments

Comments
 (0)