Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 27 additions & 12 deletions src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,19 +446,34 @@ def _prepare_rotary_positional_embeddings(
p = self.transformer.config.patch_size
p_t = self.transformer.config.patch_size_t or 1

base_size_width = self.transformer.config.sample_width // p
base_size_height = self.transformer.config.sample_height // p
base_num_frames = (num_frames + p_t - 1) // p_t
if p_t is None:
# CogVideoX 1.0
base_size_width = self.transformer.config.sample_width // p
base_size_height = self.transformer.config.sample_height // p

grid_crops_coords = get_resize_crop_region_for_grid(
(grid_height, grid_width), base_size_width, base_size_height
)
freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
embed_dim=self.transformer.config.attention_head_dim,
crops_coords=grid_crops_coords,
grid_size=(grid_height, grid_width),
temporal_size=base_num_frames,
)
grid_crops_coords = get_resize_crop_region_for_grid(
(grid_height, grid_width), base_size_width, base_size_height
)
freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
embed_dim=self.transformer.config.attention_head_dim,
crops_coords=grid_crops_coords,
grid_size=(grid_height, grid_width),
temporal_size=num_frames,
)
else:
# CogVideoX 1.5
base_size_width = self.transformer.config.sample_width // p
base_size_height = self.transformer.config.sample_height // p
base_num_frames = (num_frames + p_t - 1) // p_t

freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
embed_dim=self.transformer.config.attention_head_dim,
crops_coords=None,
grid_size=(grid_height, grid_width),
temporal_size=base_num_frames,
grid_type="slice",
max_size=(base_size_height, base_size_width),
)

freqs_cos = freqs_cos.to(device=device)
freqs_sin = freqs_sin.to(device=device)
Expand Down
39 changes: 27 additions & 12 deletions src/diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,19 +492,34 @@ def _prepare_rotary_positional_embeddings(
p = self.transformer.config.patch_size
p_t = self.transformer.config.patch_size_t or 1

base_size_width = self.transformer.config.sample_width // p
base_size_height = self.transformer.config.sample_height // p
base_num_frames = (num_frames + p_t - 1) // p_t
if p_t is None:
# CogVideoX 1.0
base_size_width = self.transformer.config.sample_width // p
base_size_height = self.transformer.config.sample_height // p

grid_crops_coords = get_resize_crop_region_for_grid(
(grid_height, grid_width), base_size_width, base_size_height
)
freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
embed_dim=self.transformer.config.attention_head_dim,
crops_coords=grid_crops_coords,
grid_size=(grid_height, grid_width),
temporal_size=base_num_frames,
)
grid_crops_coords = get_resize_crop_region_for_grid(
(grid_height, grid_width), base_size_width, base_size_height
)
freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
embed_dim=self.transformer.config.attention_head_dim,
crops_coords=grid_crops_coords,
grid_size=(grid_height, grid_width),
temporal_size=num_frames,
)
else:
# CogVideoX 1.5
base_size_width = self.transformer.config.sample_width // p
base_size_height = self.transformer.config.sample_height // p
base_num_frames = (num_frames + p_t - 1) // p_t

freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
embed_dim=self.transformer.config.attention_head_dim,
crops_coords=None,
grid_size=(grid_height, grid_width),
temporal_size=base_num_frames,
grid_type="slice",
max_size=(base_size_height, base_size_width),
)

freqs_cos = freqs_cos.to(device=device)
freqs_sin = freqs_sin.to(device=device)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,7 @@ def unfuse_qkv_projections(self) -> None:
self.transformer.unfuse_qkv_projections()
self.fusing_transformer = False

# Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._prepare_rotary_positional_embeddings
def _prepare_rotary_positional_embeddings(
self,
height: int,
Expand All @@ -539,10 +540,10 @@ def _prepare_rotary_positional_embeddings(
grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)

p = self.transformer.config.patch_size
p_t = self.transformer.config.patch_size_t
p_t = self.transformer.config.patch_size_t or 1

if p_t is None:
# CogVideoX 1.0 I2V
# CogVideoX 1.0
base_size_width = self.transformer.config.sample_width // p
base_size_height = self.transformer.config.sample_height // p

Expand All @@ -556,7 +557,7 @@ def _prepare_rotary_positional_embeddings(
temporal_size=num_frames,
)
else:
# CogVideoX 1.5 I2V
# CogVideoX 1.5
base_size_width = self.transformer.config.sample_width // p
base_size_height = self.transformer.config.sample_height // p
base_num_frames = (num_frames + p_t - 1) // p_t
Expand Down
39 changes: 27 additions & 12 deletions src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,19 +522,34 @@ def _prepare_rotary_positional_embeddings(
p = self.transformer.config.patch_size
p_t = self.transformer.config.patch_size_t or 1

base_size_width = self.transformer.config.sample_width // p
base_size_height = self.transformer.config.sample_height // p
base_num_frames = (num_frames + p_t - 1) // p_t
if p_t is None:
# CogVideoX 1.0
base_size_width = self.transformer.config.sample_width // p
base_size_height = self.transformer.config.sample_height // p

grid_crops_coords = get_resize_crop_region_for_grid(
(grid_height, grid_width), base_size_width, base_size_height
)
freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
embed_dim=self.transformer.config.attention_head_dim,
crops_coords=grid_crops_coords,
grid_size=(grid_height, grid_width),
temporal_size=base_num_frames,
)
grid_crops_coords = get_resize_crop_region_for_grid(
(grid_height, grid_width), base_size_width, base_size_height
)
freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
embed_dim=self.transformer.config.attention_head_dim,
crops_coords=grid_crops_coords,
grid_size=(grid_height, grid_width),
temporal_size=num_frames,
)
else:
# CogVideoX 1.5
base_size_width = self.transformer.config.sample_width // p
base_size_height = self.transformer.config.sample_height // p
base_num_frames = (num_frames + p_t - 1) // p_t

freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
embed_dim=self.transformer.config.attention_head_dim,
crops_coords=None,
grid_size=(grid_height, grid_width),
temporal_size=base_num_frames,
grid_type="slice",
max_size=(base_size_height, base_size_width),
)

freqs_cos = freqs_cos.to(device=device)
freqs_sin = freqs_sin.to(device=device)
Expand Down
Loading