@@ -86,12 +86,21 @@ def get_3d_sincos_pos_embed(
8686 temporal_interpolation_scale : float = 1.0 ,
8787) -> np .ndarray :
8888 r"""
89+ 3D version of `get_2d_sincos_pos_embed()`, this function returns a 3D sinusoidal position embed.
90+
8991 Args:
9092 embed_dim (`int`):
93+ The embedding dimension.
9194 spatial_size (`int` or `Tuple[int, int]`):
95+ The spatial size of the embed.
9296 temporal_size (`int`):
97+ The temporal size of the embed.
9398 spatial_interpolation_scale (`float`, defaults to 1.0):
99+ The spatial interpolation scale of the embed.
94100 temporal_interpolation_scale (`float`, defaults to 1.0):
101+ The temporal interpolation scale of the embed.
102+ Returns:
103+ np.ndarray: The 3D position embedding.
95104 """
96105 if embed_dim % 4 != 0 :
97106 raise ValueError ("`embed_dim` must be divisible by 4" )
@@ -129,8 +138,24 @@ def get_2d_sincos_pos_embed(
129138 embed_dim , grid_size , cls_token = False , extra_tokens = 0 , interpolation_scale = 1.0 , base_size = 16
130139):
131140 """
132- grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
133- [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
141+ Creates 2D sinusoidal positional embeddings over a grid of positions.
142+
143+ Args:
144+ embed_dim: int
145+ The embedding dimension.
146+ grid_size: int
147+ The size of the grid height and width.
148+ cls_token: bool
149+ Whether or not to add a classification token.
150+ extra_tokens: int
151+ The number of extra tokens to add.
152+ interpolation_scale: float
153+ The scale of the interpolation.
154+
155+ Returns:
156+ pos_embed: np.ndarray
157+ Shape is [grid_size*grid_size, embed_dim] or
158+ [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
134159 """
135160 if isinstance (grid_size , int ):
136161 grid_size = (grid_size , grid_size )
@@ -148,6 +173,15 @@ def get_2d_sincos_pos_embed(
148173
149174
150175def get_2d_sincos_pos_embed_from_grid (embed_dim , grid ):
176+ """
177+ This function generates 2D positional embeddings from a grid.
178+
179+ Args:
180+ embed_dim (`int`): output dimension for each position
181+ grid (`np.ndarray`): grid of positions
182+ Returns:
183+ `np.ndarray`: tensor in shape (grid_size*grid_size, embed_dim)
184+ """
151185 if embed_dim % 2 != 0 :
152186 raise ValueError ("embed_dim must be divisible by 2" )
153187
@@ -161,7 +195,13 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
161195
162196def get_1d_sincos_pos_embed_from_grid (embed_dim , pos ):
163197 """
164- embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
198+ This function generates 1D positional embeddings from sin and cos values.
199+
200+ Args:
201+ embed_dim (`int`): output dimension for each position
202+ pos (`numpy.ndarray(dtype=float)`): positions to be encoded, of size (M,)
203+ Returns:
204+ `numpy.ndarray(dtype=float)`: tensor in shape (M, D)
165205 """
166206 if embed_dim % 2 != 0 :
167207 raise ValueError ("embed_dim must be divisible by 2" )
0 commit comments