
Commit 2ed4d01

Update hunyuan video t2v preprocess

1 parent 3b39366 · commit 2ed4d01

7 files changed (+69, -16 lines)
Lines changed: 4 additions & 2 deletions
@@ -2,14 +2,15 @@

 GPU_NUM=1 # 2,4,8
 MODEL_PATH="hunyuanvideo-community/HunyuanVideo"
-DATASET_PATH="/FastVideo/data/mini_i2v_dataset/crush-smol_raw"
-OUTPUT_DIR="/FastVideo/data/mini_i2v_dataset/crush-smol_processed_t2v_hunyuan/"
+DATASET_PATH="data/crush-smol"
+OUTPUT_DIR="data/crush-smol_processed_t2v_hunyuan/"

 torchrun --nproc_per_node=$GPU_NUM \
     -m fastvideo.pipelines.preprocess.v1_preprocessing_new \
     --model_path $MODEL_PATH \
     --mode preprocess \
     --workload_type t2v \
+    --preprocess.dataset_type merged \
     --preprocess.dataset_path $DATASET_PATH \
     --preprocess.dataset_output_dir $OUTPUT_DIR \
     --preprocess.preprocess_video_batch_size 2 \
@@ -21,3 +22,4 @@ torchrun --nproc_per_node=$GPU_NUM \
     --preprocess.samples_per_file 8 \
     --preprocess.flush_frequency 8 \
     --preprocess.video_length_tolerance_range 5
+
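Since DATASET_PATH and OUTPUT_DIR are now relative, the script presumably expects to be launched from the FastVideo repository root. A minimal pre-flight sketch under that assumption; only the two folder names come from the diff above, the check itself is illustrative:

# Pre-flight sketch, assuming the FastVideo repo root as the working directory.
# Only the folder names come from the diff above; the check itself is illustrative.
from pathlib import Path

dataset_path = Path("data/crush-smol")
output_dir = Path("data/crush-smol_processed_t2v_hunyuan")

assert dataset_path.is_dir(), f"raw dataset not found at {dataset_path.resolve()}"
output_dir.mkdir(parents=True, exist_ok=True)
print(f"preprocessing {dataset_path} -> {output_dir}")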

fastvideo/configs/models/encoders/clip.py

Lines changed: 8 additions & 1 deletion
@@ -74,7 +74,14 @@ class CLIPVisionArchConfig(ImageEncoderArchConfig):
 class CLIPTextConfig(TextEncoderConfig):
     arch_config: TextEncoderArchConfig = field(
         default_factory=CLIPTextArchConfig)
-
+    tokenizer_kwargs: dict = field(
+        default_factory=lambda: {
+            "padding": "max_length",
+            "truncation": True,
+            "max_length": 77,
+            "return_tensors": "pt"
+        }
+    )
     num_hidden_layers_override: int | None = None
     require_post_norm: bool | None = None
     prefix: str = "clip"
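These tokenizer_kwargs are standard Hugging Face tokenizer arguments; applied to a CLIP tokenizer they pad or truncate every prompt to exactly 77 tokens and return PyTorch tensors. A small sketch of the effect (the checkpoint name is an assumption, not taken from this commit):

# Illustration of what these tokenizer_kwargs do with a CLIP tokenizer;
# the checkpoint name is only an example.
from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
kwargs = {"padding": "max_length", "truncation": True,
          "max_length": 77, "return_tensors": "pt"}

batch = tokenizer("a toy car being crushed by a hydraulic press", **kwargs)
print(batch["input_ids"].shape)       # torch.Size([1, 77])
print(batch["attention_mask"].shape)  # torch.Size([1, 77])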

fastvideo/configs/models/encoders/llama.py

Lines changed: 7 additions & 1 deletion
@@ -60,5 +60,11 @@ class LlamaArchConfig(TextEncoderArchConfig):
 @dataclass
 class LlamaConfig(TextEncoderConfig):
     arch_config: TextEncoderArchConfig = field(default_factory=LlamaArchConfig)
-
+    tokenizer_kwargs: dict = field(
+        default_factory=lambda: {
+            "padding": "max_length",
+            "truncation": True,
+            "max_length": 256,
+            "return_tensors": "pt"
+        })
     prefix: str = "llama"
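The Llama text-encoder config mirrors the CLIP one above, with a 256-token budget. A hedged sketch; the checkpoint/subfolder and the pad-token fallback are assumptions, not taken from this commit:

# Same pattern as the CLIP example above, with the longer 256-token budget.
# Checkpoint/subfolder and pad-token handling are assumptions.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo",
                                          subfolder="tokenizer")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Llama tokenizers often lack a pad token

kwargs = {"padding": "max_length", "truncation": True,
          "max_length": 256, "return_tensors": "pt"}
batch = tokenizer("a toy car being crushed by a hydraulic press", **kwargs)
print(batch["input_ids"].shape)  # torch.Size([1, 256])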

fastvideo/layers/rotary_embedding.py

Lines changed: 2 additions & 2 deletions
@@ -138,14 +138,14 @@ def forward_native(
         cos, sin = cos_sin.chunk(2, dim=-1)

         query_shape = query.shape
-        query = query.view(num_tokens, -1, self.head_size)
+        query = query.reshape(num_tokens, -1, self.head_size)
         query_rot = query[..., :self.rotary_dim]
         query_pass = query[..., self.rotary_dim:]
         query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
         query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)

         key_shape = key.shape
-        key = key.view(num_tokens, -1, self.head_size)
+        key = key.reshape(num_tokens, -1, self.head_size)
         key_rot = key[..., :self.rotary_dim]
         key_pass = key[..., self.rotary_dim:]
         key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
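The switch from .view() to .reshape() matters because .view() never copies and therefore raises when the tensor's strides are incompatible with the requested shape (for example after a transpose), while .reshape() falls back to a copy in that case. A standalone PyTorch illustration with arbitrary shapes:

# Why reshape instead of view: merging dims of a non-contiguous tensor makes
# view() raise, while reshape() silently copies. Shapes are arbitrary.
import torch

batch, seq, num_heads, head_size = 3, 2, 4, 8
x = torch.randn(batch, seq, num_heads * head_size)
x_t = x.transpose(0, 1)                     # (2, 3, 32), non-contiguous
num_tokens = x_t.shape[0] * x_t.shape[1]

try:
    x_t.view(num_tokens, -1, head_size)     # merging the first two dims fails here
except RuntimeError as e:
    print("view failed:", e)

out = x_t.reshape(num_tokens, -1, head_size)  # copies when needed
print(out.shape)                              # torch.Size([6, 4, 8])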

fastvideo/models/vaes/hunyuanvae.py

Lines changed: 3 additions & 3 deletions
@@ -91,7 +91,7 @@ def forward(self,
         key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
         value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)

-        # Perform scaled dot-product attention
+        # Perform scaled dot-product attentionz
         hidden_states = F.scaled_dot_product_attention(query,
                                                        key,
                                                        value,
@@ -361,7 +361,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                                               hidden_states.device,
                                               batch_size=batch_size)
             hidden_states = attn(hidden_states,
-                                 attention_mask=attention_mask)
+                                 attention_mask=attention_mask.unsqueeze(1))
             hidden_states = hidden_states.unflatten(
                 1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)

@@ -385,7 +385,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                                               hidden_states.device,
                                               batch_size=batch_size)
             hidden_states = attn(hidden_states,
-                                 attention_mask=attention_mask)
+                                 attention_mask=attention_mask.unsqueeze(1))
             hidden_states = hidden_states.unflatten(
                 1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)
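The added unsqueeze(1) gives the mask a broadcastable head dimension: F.scaled_dot_product_attention broadcasts attn_mask against (batch, heads, query_len, key_len), so a mask built as (batch, query_len, key_len) needs the extra dim to line up. A standalone sketch with illustrative shapes:

# Why the mask needs a head dimension for scaled_dot_product_attention.
# Shapes are illustrative.
import torch
import torch.nn.functional as F

batch, heads, seq, dim = 2, 4, 16, 8
q = torch.randn(batch, heads, seq, dim)
k = torch.randn(batch, heads, seq, dim)
v = torch.randn(batch, heads, seq, dim)

mask = torch.ones(batch, seq, seq, dtype=torch.bool)  # per-batch mask, no head dim
out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask.unsqueeze(1))
print(out.shape)  # torch.Size([2, 4, 16, 8])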

fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py

Lines changed: 40 additions & 5 deletions
@@ -9,7 +9,13 @@

 class PreprocessPipelineI2V(ComposedPipelineBase):
     _required_config_modules = [
-        "image_encoder", "image_processor", "text_encoder", "tokenizer", "vae"
+        "image_encoder",
+        "image_processor",
+        "text_encoder",
+        "tokenizer",
+        "text_encoder_2",
+        "tokenizer_2",
+        "vae"
     ]

     def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
@@ -51,8 +57,13 @@ def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):


 class PreprocessPipelineT2V(ComposedPipelineBase):
-    _required_config_modules = ["text_encoder", "tokenizer", "vae"]
-
+    _required_config_modules = [
+        "text_encoder",
+        "tokenizer",
+        "text_encoder_2",
+        "tokenizer_2",
+        "vae"
+    ]
     def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
         assert fastvideo_args.preprocess_config is not None
         self.add_stage(stage_name="text_transform_stage",
@@ -61,10 +72,34 @@ def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
                            preprocess_config.training_cfg_rate,
                            seed=fastvideo_args.preprocess_config.seed,
                        ))
+        # llama_tokenizer_kwargs = {
+        #     "padding": "max_length",
+        #     "truncation": True,
+        #     "max_length": 256,
+        #     "return_tensors": "pt"
+        # }
+        # clip_tokenizer_kwargs = {
+        #     "padding": "max_length",
+        #     "truncation": True,
+        #     "max_length": 77,
+        #     "return_tensors": "pt"
+        # }
+        # if len(fastvideo_args.pipeline_config.text_encoder_configs) >= 2:
+        #     fastvideo_args.pipeline_config.text_encoder_configs[0].tokenizer_kwargs = llama_tokenizer_kwargs
+        #     fastvideo_args.pipeline_config.text_encoder_configs[1].tokenizer_kwargs = clip_tokenizer_kwargs
+        text_encoders = [
+            self.get_module("text_encoder"),
+            self.get_module("text_encoder_2")
+        ]
+        tokenizers = [
+            self.get_module("tokenizer"),
+            self.get_module("tokenizer_2")
+        ]
+
         self.add_stage(stage_name="prompt_encoding_stage",
                        stage=TextEncodingStage(
-                           text_encoders=[self.get_module("text_encoder")],
-                           tokenizers=[self.get_module("tokenizer")],
+                           text_encoders=text_encoders,
+                           tokenizers=tokenizers,
                        ))
         self.add_stage(
             stage_name="video_transform_stage",
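With text_encoder_2 and tokenizer_2 registered, the prompt-encoding stage now receives both of HunyuanVideo's text encoders (Llama and CLIP). A rough illustration of the two-encoder pattern; this is not FastVideo's actual TextEncodingStage, and the function name and signature are illustrative:

# Rough illustration of encoding one prompt with two encoder/tokenizer pairs.
# This is NOT FastVideo's TextEncodingStage; names and signature are illustrative.
import torch

def encode_prompt(prompt, text_encoders, tokenizers, per_encoder_kwargs):
    outputs = []
    for encoder, tokenizer, kwargs in zip(text_encoders, tokenizers, per_encoder_kwargs):
        tokens = tokenizer(prompt, **kwargs)  # e.g. max_length=256 (Llama), 77 (CLIP)
        with torch.no_grad():
            out = encoder(input_ids=tokens["input_ids"],
                          attention_mask=tokens.get("attention_mask"))
        outputs.append(out.last_hidden_state)
    return outputs  # one hidden-state tensor per text encoder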

scripts/dataset_preparation/prepare_json_file.py

Lines changed: 5 additions & 2 deletions
@@ -23,8 +23,10 @@ def get_video_info(video_path):
     fps = info.get("video_fps", 0)
     duration = num_frames / fps if fps > 0 else 0

-    # Extract name
-    _, _, videos_dir, video_name = str(video_path).split("/")
+    from pathlib import Path
+    video_path = Path(video_path)
+    videos_dir = video_path.parent.name
+    video_name = video_path.name

     return {
         "path": str(video_name),
@@ -100,6 +102,7 @@ def prepare_dataset_json(folder_path,

     # Save to JSON file
     output_file = folder_path / output_name
+    print(folder_path,output_file,output_name)
     with open(output_file, 'w') as f:
         json.dump(dataset_info, f, indent=2)
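The pathlib rewrite removes the old assumption that a video path has exactly four slash-separated components; parent.name and name extract the same fields at any depth. A quick standalone check (the example paths are illustrative):

# The old split("/") unpacking required exactly four path components;
# pathlib extracts the same fields at any depth. Example paths are illustrative.
from pathlib import Path

for p in ["data/crush-smol/videos/clip_0001.mp4",
          "/abs/prefix/data/crush-smol/videos/clip_0001.mp4"]:
    video_path = Path(p)
    print(video_path.parent.name, video_path.name)  # -> videos clip_0001.mp4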
