
Commit a0f1de7

Merge branch 'main' into layerwise-upcasting-hook
2 parents 9372647 + aa79d7d

28 files changed: +443 −90 lines

docs/source/en/api/pipelines/flux.md
Lines changed: 1 addition & 1 deletion

@@ -367,7 +367,7 @@ transformer_8bit = FluxTransformer2DModel.from_pretrained(
 
 pipeline = FluxPipeline.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
-    text_encoder=text_encoder_8bit,
+    text_encoder_2=text_encoder_8bit,
     transformer=transformer_8bit,
     torch_dtype=torch.float16,
     device_map="balanced",
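
This hunk corrects which component receives the quantized encoder: the 8-bit model loaded earlier in the doc example is the T5 encoder, which FluxPipeline exposes as text_encoder_2 (the CLIP encoder stays on text_encoder). Below is a minimal sketch of how the surrounding example presumably reads after the fix; the loading of text_encoder_8bit is not part of this hunk and is reconstructed here as an assumption:

import torch
from transformers import T5EncoderModel, BitsAndBytesConfig as TransformersBitsAndBytesConfig
from diffusers import FluxPipeline, FluxTransformer2DModel, BitsAndBytesConfig as DiffusersBitsAndBytesConfig

# 8-bit T5 encoder (this is the model that must be passed as `text_encoder_2`)
quant_config = TransformersBitsAndBytesConfig(load_in_8bit=True)
text_encoder_8bit = T5EncoderModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="text_encoder_2",
    quantization_config=quant_config,
    torch_dtype=torch.float16,
)

# 8-bit transformer, as in the hunk's leading context
quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
transformer_8bit = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.float16,
)

pipeline = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    text_encoder_2=text_encoder_8bit,
    transformer=transformer_8bit,
    torch_dtype=torch.float16,
    device_map="balanced",
)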

docs/source/en/api/pipelines/hunyuan_video.md
Lines changed: 4 additions & 4 deletions

@@ -16,7 +16,7 @@
 
 [HunyuanVideo](https://www.arxiv.org/abs/2412.03603) by Tencent.
 
-*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at [this https URL](https://github.com/Tencent/HunyuanVideo).*
+*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at [this https URL](https://github.com/tencent/HunyuanVideo).*
 
 <Tip>
 
@@ -45,14 +45,14 @@ from diffusers.utils import export_to_video
 
 quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
 transformer_8bit = HunyuanVideoTransformer3DModel.from_pretrained(
-    "tencent/HunyuanVideo",
+    "hunyuanvideo-community/HunyuanVideo",
     subfolder="transformer",
     quantization_config=quant_config,
-    torch_dtype=torch.float16,
+    torch_dtype=torch.bfloat16,
 )
 
 pipeline = HunyuanVideoPipeline.from_pretrained(
-    "tencent/HunyuanVideo",
+    "hunyuanvideo-community/HunyuanVideo",
     transformer=transformer_8bit,
     torch_dtype=torch.float16,
     device_map="balanced",
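
With the second hunk applied, the doc example points at the community mirror and loads the transformer in bfloat16. A hedged end-to-end sketch of roughly how the updated snippet would be used follows; the prompt, frame count, step count, and fps are illustrative assumptions, not part of the diff:

import torch
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
transformer_8bit = HunyuanVideoTransformer3DModel.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)

pipeline = HunyuanVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    transformer=transformer_8bit,
    torch_dtype=torch.float16,
    device_map="balanced",
)

# Illustrative generation call; argument values are assumptions, not taken from the diff.
video = pipeline(prompt="A cat walks on the grass, realistic style.", num_frames=61, num_inference_steps=30).frames[0]
export_to_video(video, "output.mp4", fps=15)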

docs/source/en/using-diffusers/text-img2vid.md
Lines changed: 2 additions & 2 deletions

@@ -78,10 +78,10 @@ from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
 from diffusers.utils import export_to_video
 
 transformer = HunyuanVideoTransformer3DModel.from_pretrained(
-    "tencent/HunyuanVideo", subfolder="transformer", torch_dtype=torch.bfloat16
+    "hunyuanvideo-community/HunyuanVideo", subfolder="transformer", torch_dtype=torch.bfloat16
 )
 pipe = HunyuanVideoPipeline.from_pretrained(
-    "tencent/HunyuanVideo", transformer=transformer, torch_dtype=torch.float16
+    "hunyuanvideo-community/HunyuanVideo", transformer=transformer, torch_dtype=torch.float16
 )
 
 # reduce memory requirements
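
The trailing context ("# reduce memory requirements") refers to the memory-saving calls that typically follow in that guide but are not shown in this hunk. A short hedged sketch of the kind of calls meant, assuming the pipe object from the snippet above; the exact lines in the guide may differ:

# Offload idle components to the CPU and tile the VAE so decoding long videos
# does not need the whole latent batch in GPU memory at once.
pipe.vae.enable_tiling()
pipe.enable_model_cpu_offload()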

examples/research_projects/sd3_lora_colab/train_dreambooth_lora_sd3_miniature.py
Lines changed: 1 addition & 1 deletion

@@ -765,7 +765,7 @@ def load_model_hook(models, input_dir):
         lora_state_dict = StableDiffusion3Pipeline.lora_state_dict(input_dir)
 
         transformer_state_dict = {
-            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")
+            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.")
         }
         transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict)
         incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default")
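
The effect of this one-predicate fix is easiest to see on a toy state dict: SD3 LoRA keys are saved under a "transformer." prefix, so the old "unet." filter silently produced an empty transformer state dict. The key names below are hypothetical, for illustration only:

# Toy illustration of the fixed key filter (key names are made up).
lora_state_dict = {
    "transformer.transformer_blocks.0.attn.to_q.lora_A.weight": "A0",
    "text_encoder.layers.0.q_proj.lora_A.weight": "B0",
}

# Old predicate: k.startswith("unet.")        -> matches nothing, the dict ends up empty.
# New predicate: k.startswith("transformer.") -> keeps and strips the transformer keys.
transformer_state_dict = {
    k.replace("transformer.", ""): v for k, v in lora_state_dict.items() if k.startswith("transformer.")
}
print(transformer_state_dict)
# {'transformer_blocks.0.attn.to_q.lora_A.weight': 'A0'}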

setup.py
Lines changed: 2 additions & 0 deletions

@@ -135,6 +135,7 @@
     "transformers>=4.41.2",
     "urllib3<=2.0.0",
     "black",
+    "phonemizer",
 ]
 
 # this is a lookup table with items like:

@@ -227,6 +228,7 @@ def run(self):
     "scipy",
     "torchvision",
     "transformers",
+    "phonemizer",
 )
 extras["torch"] = deps_list("torch", "accelerate")

src/diffusers/dependency_versions_table.py
Lines changed: 1 addition & 0 deletions

@@ -43,4 +43,5 @@
     "transformers": "transformers>=4.41.2",
     "urllib3": "urllib3<=2.0.0",
     "black": "black",
+    "phonemizer": "phonemizer",
 }
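
Together with the setup.py hunks above, this registers the new phonemizer test dependency in both places the project tracks pins: the _deps list in setup.py and the generated lookup table. A small hedged sketch of how the table can be read; the dict name deps and the importability of diffusers.dependency_versions_table are assumptions about the package layout, not shown in the diff:

# Hedged sketch: look up the pin for the newly added dependency from the generated table.
from diffusers.dependency_versions_table import deps  # assumes the table exposes a `deps` dict

print(deps["phonemizer"])  # expected: "phonemizer" (unpinned, per the hunk above)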

src/diffusers/loaders/peft.py
Lines changed: 11 additions & 9 deletions

@@ -300,15 +300,17 @@ def load_lora_adapter(self, pretrained_model_name_or_path_or_dict, prefix="trans
             try:
                 inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs)
                 incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs)
-            except RuntimeError as e:
-                for module in self.modules():
-                    if isinstance(module, BaseTunerLayer):
-                        active_adapters = module.active_adapters
-                        for active_adapter in active_adapters:
-                            if adapter_name in active_adapter:
-                                module.delete_adapter(adapter_name)
-
-                self.peft_config.pop(adapter_name)
+            except Exception as e:
+                # In case `inject_adapter_in_model()` was unsuccessful even before injecting the `peft_config`.
+                if hasattr(self, "peft_config"):
+                    for module in self.modules():
+                        if isinstance(module, BaseTunerLayer):
+                            active_adapters = module.active_adapters
+                            for active_adapter in active_adapters:
+                                if adapter_name in active_adapter:
+                                    module.delete_adapter(adapter_name)
+
+                    self.peft_config.pop(adapter_name)
                 logger.error(f"Loading {adapter_name} was unsucessful with the following error: \n{e}")
                 raise
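
The point of widening the except clause and adding the hasattr guard is that inject_adapter_in_model() can fail before PEFT ever attaches a peft_config to the model; an unguarded self.peft_config.pop(...) would then raise AttributeError and mask the real error. A toy reproduction of that failure mode, independent of diffusers and PEFT:

class Model:
    """Stand-in for a model; deliberately has no `peft_config` attribute yet."""

def load_adapter(model, adapter_name):
    try:
        # Simulate injection failing before any PEFT bookkeeping was attached.
        raise ValueError("injection failed")
    except Exception as e:
        # Guarded rollback: only undo PEFT state if it was actually created.
        if hasattr(model, "peft_config"):
            model.peft_config.pop(adapter_name, None)
        print(f"Loading {adapter_name} was unsuccessful with the following error: \n{e}")
        raise

try:
    load_adapter(Model(), "default")
except ValueError:
    pass  # the original error surfaces instead of an AttributeError from the cleanup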

src/diffusers/loaders/single_file_utils.py
Lines changed: 8 additions & 3 deletions

@@ -186,6 +186,7 @@
     "inpainting": 512,
     "inpainting_v2": 512,
     "controlnet": 512,
+    "instruct-pix2pix": 512,
     "v2": 768,
     "v1": 512,
 }

@@ -605,10 +606,14 @@ def infer_diffusers_model_type(checkpoint):
         if any(
             g in checkpoint for g in ["guidance_in.in_layer.bias", "model.diffusion_model.guidance_in.in_layer.bias"]
         ):
-            if checkpoint["img_in.weight"].shape[1] == 384:
-                model_type = "flux-fill"
+            if "model.diffusion_model.img_in.weight" in checkpoint:
+                key = "model.diffusion_model.img_in.weight"
+            else:
+                key = "img_in.weight"
 
-            elif checkpoint["img_in.weight"].shape[1] == 128:
+            if checkpoint[key].shape[1] == 384:
+                model_type = "flux-fill"
+            elif checkpoint[key].shape[1] == 128:
                 model_type = "flux-depth"
             else:
                 model_type = "flux-dev"
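
The second hunk exists because single-file Flux checkpoints can store weights either under bare keys or under a "model.diffusion_model." prefix, so the width check on img_in.weight must first resolve which key is present. A standalone sketch of that detection logic on a dummy checkpoint; the tensor shape is illustrative, only the second dimension matters for the check:

import torch

# Dummy checkpoint using the prefixed key layout; shape[1] == 384 marks a Flux Fill checkpoint.
checkpoint = {"model.diffusion_model.img_in.weight": torch.zeros(64, 384)}

key = (
    "model.diffusion_model.img_in.weight"
    if "model.diffusion_model.img_in.weight" in checkpoint
    else "img_in.weight"
)

if checkpoint[key].shape[1] == 384:
    model_type = "flux-fill"
elif checkpoint[key].shape[1] == 128:
    model_type = "flux-depth"
else:
    model_type = "flux-dev"

print(model_type)  # flux-fill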

src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
Lines changed: 96 additions & 41 deletions

@@ -1010,10 +1010,12 @@ def __init__(
         # The minimal tile height and width for spatial tiling to be used
         self.tile_sample_min_height = 512
         self.tile_sample_min_width = 512
+        self.tile_sample_min_num_frames = 16
 
         # The minimal distance between two spatial tiles
         self.tile_sample_stride_height = 448
         self.tile_sample_stride_width = 448
+        self.tile_sample_stride_num_frames = 8
 
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (LTXVideoEncoder3d, LTXVideoDecoder3d)):

@@ -1023,8 +1025,10 @@ def enable_tiling(
         self,
         tile_sample_min_height: Optional[int] = None,
         tile_sample_min_width: Optional[int] = None,
+        tile_sample_min_num_frames: Optional[int] = None,
         tile_sample_stride_height: Optional[float] = None,
         tile_sample_stride_width: Optional[float] = None,
+        tile_sample_stride_num_frames: Optional[float] = None,
     ) -> None:
         r"""
         Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to

@@ -1046,8 +1050,10 @@ def enable_tiling(
         self.use_tiling = True
         self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
         self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
+        self.tile_sample_min_num_frames = tile_sample_min_num_frames or self.tile_sample_min_num_frames
         self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
         self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
+        self.tile_sample_stride_num_frames = tile_sample_stride_num_frames or self.tile_sample_stride_num_frames
 
     def disable_tiling(self) -> None:
         r"""

@@ -1073,18 +1079,13 @@ def disable_slicing(self) -> None:
     def _encode(self, x: torch.Tensor) -> torch.Tensor:
         batch_size, num_channels, num_frames, height, width = x.shape
 
+        if self.use_framewise_decoding and num_frames > self.tile_sample_min_num_frames:
+            return self._temporal_tiled_encode(x)
+
         if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
             return self.tiled_encode(x)
 
-        if self.use_framewise_encoding:
-            # TODO(aryan): requires investigation
-            raise NotImplementedError(
-                "Frame-wise encoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
-                "quality issues caused by splitting inference across frame dimension. If you believe this "
-                "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
-            )
-        else:
-            enc = self.encoder(x)
+        enc = self.encoder(x)
 
         return enc

@@ -1121,19 +1122,15 @@ def _decode(
         batch_size, num_channels, num_frames, height, width = z.shape
         tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
         tile_latent_min_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+        tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
+
+        if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames:
+            return self._temporal_tiled_decode(z, temb, return_dict=return_dict)
 
         if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
             return self.tiled_decode(z, temb, return_dict=return_dict)
 
-        if self.use_framewise_decoding:
-            # TODO(aryan): requires investigation
-            raise NotImplementedError(
-                "Frame-wise decoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
-                "quality issues caused by splitting inference across frame dimension. If you believe this "
-                "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
-            )
-        else:
-            dec = self.decoder(z, temb)
+        dec = self.decoder(z, temb)
 
         if not return_dict:
             return (dec,)

@@ -1189,6 +1186,14 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.
         )
         return b
 
+    def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (
+                x / blend_extent
+            )
+        return b
+
     def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
         r"""Encode a batch of images using a tiled encoder.

@@ -1217,17 +1222,9 @@ def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
         for i in range(0, height, self.tile_sample_stride_height):
             row = []
             for j in range(0, width, self.tile_sample_stride_width):
-                if self.use_framewise_encoding:
-                    # TODO(aryan): requires investigation
-                    raise NotImplementedError(
-                        "Frame-wise encoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
-                        "quality issues caused by splitting inference across frame dimension. If you believe this "
-                        "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
-                    )
-                else:
-                    time = self.encoder(
-                        x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
-                    )
+                time = self.encoder(
+                    x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
+                )
 
                 row.append(time)
             rows.append(row)

@@ -1283,17 +1280,7 @@ def tiled_decode(
         for i in range(0, height, tile_latent_stride_height):
             row = []
             for j in range(0, width, tile_latent_stride_width):
-                if self.use_framewise_decoding:
-                    # TODO(aryan): requires investigation
-                    raise NotImplementedError(
-                        "Frame-wise decoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
-                        "quality issues caused by splitting inference across frame dimension. If you believe this "
-                        "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
-                    )
-                else:
-                    time = self.decoder(
-                        z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb
-                    )
+                time = self.decoder(z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb)
 
                 row.append(time)
             rows.append(row)

@@ -1318,6 +1305,74 @@ def tiled_decode(
 
         return DecoderOutput(sample=dec)
 
+    def _temporal_tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
+        batch_size, num_channels, num_frames, height, width = x.shape
+        latent_num_frames = (num_frames - 1) // self.temporal_compression_ratio + 1
+
+        tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
+        tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio
+        blend_num_frames = tile_latent_min_num_frames - tile_latent_stride_num_frames
+
+        row = []
+        for i in range(0, num_frames, self.tile_sample_stride_num_frames):
+            tile = x[:, :, i : i + self.tile_sample_min_num_frames + 1, :, :]
+            if self.use_tiling and (height > self.tile_sample_min_height or width > self.tile_sample_min_width):
+                tile = self.tiled_encode(tile)
+            else:
+                tile = self.encoder(tile)
+            if i > 0:
+                tile = tile[:, :, 1:, :, :]
+            row.append(tile)
+
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_num_frames)
+                result_row.append(tile[:, :, :tile_latent_stride_num_frames, :, :])
+            else:
+                result_row.append(tile[:, :, : tile_latent_stride_num_frames + 1, :, :])
+
+        enc = torch.cat(result_row, dim=2)[:, :, :latent_num_frames]
+        return enc
+
+    def _temporal_tiled_decode(
+        self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True
+    ) -> Union[DecoderOutput, torch.Tensor]:
+        batch_size, num_channels, num_frames, height, width = z.shape
+        num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
+
+        tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+        tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+        tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
+        tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio
+        blend_num_frames = self.tile_sample_min_num_frames - self.tile_sample_stride_num_frames
+
+        row = []
+        for i in range(0, num_frames, tile_latent_stride_num_frames):
+            tile = z[:, :, i : i + tile_latent_min_num_frames + 1, :, :]
+            if self.use_tiling and (tile.shape[-1] > tile_latent_min_width or tile.shape[-2] > tile_latent_min_height):
+                decoded = self.tiled_decode(tile, temb, return_dict=True).sample
+            else:
+                decoded = self.decoder(tile, temb)
+            if i > 0:
+                decoded = decoded[:, :, :-1, :, :]
+            row.append(decoded)
+
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_num_frames)
+                tile = tile[:, :, : self.tile_sample_stride_num_frames, :, :]
+                result_row.append(tile)
+            else:
+                result_row.append(tile[:, :, : self.tile_sample_stride_num_frames + 1, :, :])
+
+        dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames]
+
+        if not return_dict:
+            return (dec,)
+        return DecoderOutput(sample=dec)
+
     def forward(
         self,
         sample: torch.Tensor,

@@ -1334,5 +1389,5 @@ def forward(
         z = posterior.mode()
         dec = self.decode(z, temb)
         if not return_dict:
-            return (dec,)
+            return (dec.sample,)
         return dec
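
The core of the new temporal tiling is blend_t: adjacent frame tiles overlap, and the overlapping frames are linearly cross-faded along the frame axis (dim 2) before the tiles are concatenated. A self-contained sketch of that blending on tiny tensors; the shapes are arbitrary and the function body mirrors the method added in the diff:

import torch

def blend_t(a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
    # Cross-fade the first `blend_extent` frames of `b` with the last `blend_extent` frames of `a`.
    blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
    for x in range(blend_extent):
        b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (
            x / blend_extent
        )
    return b

a = torch.zeros(1, 1, 8, 2, 2)  # previous tile: all zeros
b = torch.ones(1, 1, 8, 2, 2)   # next tile: all ones
out = blend_t(a, b.clone(), blend_extent=4)
print(out[0, 0, :4, 0, 0])      # tensor([0.0000, 0.2500, 0.5000, 0.7500]); remaining frames stay 1.0

On the user side, the diff exposes the frame dimension through enable_tiling(tile_sample_min_num_frames=..., tile_sample_stride_num_frames=...), with defaults of 16 and 8 set in __init__.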
