From 710254affc02efa790fd9b881f7a0be5d5da8e77 Mon Sep 17 00:00:00 2001
From: ozbayb <17261091+ozbayb@users.noreply.github.com>
Date: Fri, 26 Sep 2025 07:53:08 -0600
Subject: [PATCH 1/4] Allow HuMo to work with embedded image for I2V

---
 comfy/ldm/wan/model.py    | 10 ++++++++--
 comfy/model_base.py       |  2 +-
 comfy/supported_models.py |  2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py
index 54616e6eb6e5..dc0196974884 100644
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@@ -1510,7 +1510,7 @@ def __init__(self,
                  operations=None,
                  ):
 
-        super().__init__(model_type='t2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, wan_attn_block_class=WanAttentionBlockAudio, image_model=image_model, device=device, dtype=dtype, operations=operations)
+        super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=36, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, wan_attn_block_class=WanAttentionBlockAudio, image_model=image_model, device=device, dtype=dtype, operations=operations)
 
         self.audio_proj = AudioProjModel(seq_len=8, blocks=5, channels=1280, intermediate_dim=512, output_dim=1536, context_tokens=audio_token_num, dtype=dtype, device=device, operations=operations)
 
@@ -1539,6 +1539,12 @@ def forward_orig(
         e0 = self.time_projection(e).unflatten(2, (6, self.dim))
 
         if reference_latent is not None:
+            if reference_latent.shape[1] < 36:
+                padding_needed = 36 - reference_latent.shape[1]
+                padding = torch.zeros(reference_latent.shape[0], padding_needed, *reference_latent.shape[2:],
+                                      device=reference_latent.device, dtype=reference_latent.dtype)
+                reference_latent = torch.cat([padding, reference_latent], dim=1) # pad at beginning like c_concat
+
             ref = self.patch_embedding(reference_latent.float()).to(x.dtype)
             ref = ref.flatten(2).transpose(1, 2)
             freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], t_start=time, device=x.device, dtype=x.dtype)
@@ -1548,7 +1554,7 @@ def forward_orig(
 
         # context
         context = self.text_embedding(context)
-        context_img_len = None
+        context_img_len = 0
 
         if audio_embed is not None:
             if reference_latent is not None:
diff --git a/comfy/model_base.py b/comfy/model_base.py
index b0b9cde7d087..dc4cd35dc1e7 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1227,7 +1227,7 @@ def extra_conds(self, **kwargs):
         if audio_embed is not None:
             out['audio_embed'] = comfy.conds.CONDRegular(audio_embed)
 
-        if "c_concat" not in out: # 1.7B model
+        if "c_concat" not in out or "concat_latent_image" in kwargs: # 1.7B model OR I2V mode
             reference_latents = kwargs.get("reference_latents", None)
             if reference_latents is not None:
                 out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1]))
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 4064bdae1052..399ea8b582b4 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1080,7 +1080,7 @@ class WAN21_HuMo(WAN21_T2V):
     }
 
     def get_model(self, state_dict, prefix="", device=None):
prefix="", device=None): - out = model_base.WAN21_HuMo(self, image_to_video=False, device=device) + out = model_base.WAN21_HuMo(self, image_to_video=True, device=device) return out class WAN22_S2V(WAN21_T2V): From 9aa11f6f1a2dc232df5d7e4ba99a6207c17bf592 Mon Sep 17 00:00:00 2001 From: ozbayb <17261091+ozbayb@users.noreply.github.com> Date: Fri, 26 Sep 2025 12:15:19 -0600 Subject: [PATCH 2/4] Fix proper handling of difference between 1.7B and 14B HuMo models --- comfy/ldm/wan/model.py | 8 +------- comfy/model_base.py | 31 ++++++++++++++++--------------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index dc0196974884..66017ed76a04 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -1510,7 +1510,7 @@ def __init__(self, operations=None, ): - super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=36, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, wan_attn_block_class=WanAttentionBlockAudio, image_model=image_model, device=device, dtype=dtype, operations=operations) + super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, wan_attn_block_class=WanAttentionBlockAudio, image_model=image_model, device=device, dtype=dtype, operations=operations) self.audio_proj = AudioProjModel(seq_len=8, blocks=5, channels=1280, intermediate_dim=512, output_dim=1536, context_tokens=audio_token_num, dtype=dtype, device=device, operations=operations) @@ -1539,12 +1539,6 @@ def forward_orig( e0 = self.time_projection(e).unflatten(2, (6, self.dim)) if reference_latent is not None: - if reference_latent.shape[1] < 36: - padding_needed = 36 - reference_latent.shape[1] - padding = torch.zeros(reference_latent.shape[0], padding_needed, *reference_latent.shape[2:], - device=reference_latent.device, dtype=reference_latent.dtype) - reference_latent = torch.cat([padding, reference_latent], dim=1) # pad at beginning like c_concat - ref = self.patch_embedding(reference_latent.float()).to(x.dtype) ref = ref.flatten(2).transpose(1, 2) freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], t_start=time, device=x.device, dtype=x.dtype) diff --git a/comfy/model_base.py b/comfy/model_base.py index dc4cd35dc1e7..40b7b375723f 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1227,22 +1227,23 @@ def extra_conds(self, **kwargs): if audio_embed is not None: out['audio_embed'] = comfy.conds.CONDRegular(audio_embed) - if "c_concat" not in out or "concat_latent_image" in kwargs: # 1.7B model OR I2V mode - reference_latents = kwargs.get("reference_latents", None) - if reference_latents is not None: - out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])) + reference_latents = kwargs.get("reference_latents", None) + + if "c_concat" not in out and reference_latents is not None and reference_latents[0].shape[1] == 16: # 1.7B model + out['reference_latent'] = 
         else:
-            noise_shape = list(noise.shape)
-            noise_shape[1] += 4
-            concat_latent = torch.zeros(noise_shape, device=noise.device, dtype=noise.dtype)
-            zero_vae_values_first = torch.tensor([0.8660, -0.4326, -0.0017, -0.4884, -0.5283, 0.9207, -0.9896, 0.4433, -0.5543, -0.0113, 0.5753, -0.6000, -0.8346, -0.3497, -0.1926, -0.6938]).view(1, 16, 1, 1, 1)
-            zero_vae_values_second = torch.tensor([1.0869, -1.2370, 0.0206, -0.4357, -0.6411, 2.0307, -1.5972, 1.2659, -0.8595, -0.4654, 0.9638, -1.6330, -1.4310, -0.1098, -0.3856, -1.4583]).view(1, 16, 1, 1, 1)
-            zero_vae_values = torch.tensor([0.8642, -1.8583, 0.1577, 0.1350, -0.3641, 2.5863, -1.9670, 1.6065, -1.0475, -0.8678, 1.1734, -1.8138, -1.5933, -0.7721, -0.3289, -1.3745]).view(1, 16, 1, 1, 1)
-            concat_latent[:, 4:] = zero_vae_values
-            concat_latent[:, 4:, :1] = zero_vae_values_first
-            concat_latent[:, 4:, 1:2] = zero_vae_values_second
-            out['c_concat'] = comfy.conds.CONDNoiseShape(concat_latent)
-            reference_latents = kwargs.get("reference_latents", None)
+            concat_latent_image = kwargs.get("concat_latent_image", None)
+            if concat_latent_image is None:
+                noise_shape = list(noise.shape)
+                noise_shape[1] += 4
+                concat_latent = torch.zeros(noise_shape, device=noise.device, dtype=noise.dtype)
+                zero_vae_values_first = torch.tensor([0.8660, -0.4326, -0.0017, -0.4884, -0.5283, 0.9207, -0.9896, 0.4433, -0.5543, -0.0113, 0.5753, -0.6000, -0.8346, -0.3497, -0.1926, -0.6938]).view(1, 16, 1, 1, 1)
+                zero_vae_values_second = torch.tensor([1.0869, -1.2370, 0.0206, -0.4357, -0.6411, 2.0307, -1.5972, 1.2659, -0.8595, -0.4654, 0.9638, -1.6330, -1.4310, -0.1098, -0.3856, -1.4583]).view(1, 16, 1, 1, 1)
+                zero_vae_values = torch.tensor([0.8642, -1.8583, 0.1577, 0.1350, -0.3641, 2.5863, -1.9670, 1.6065, -1.0475, -0.8678, 1.1734, -1.8138, -1.5933, -0.7721, -0.3289, -1.3745]).view(1, 16, 1, 1, 1)
+                concat_latent[:, 4:] = zero_vae_values
+                concat_latent[:, 4:, :1] = zero_vae_values_first
+                concat_latent[:, 4:, 1:2] = zero_vae_values_second
+                out['c_concat'] = comfy.conds.CONDNoiseShape(concat_latent)
             if reference_latents is not None:
                 ref_latent = self.process_latent_in(reference_latents[-1])
                 ref_latent_shape = list(ref_latent.shape)

From 3c631aad1a3e1d470aa3f95aab1c25b8e9f17420 Mon Sep 17 00:00:00 2001
From: ozbayb <17261091+ozbayb@users.noreply.github.com>
Date: Sat, 27 Sep 2025 00:19:26 -0600
Subject: [PATCH 3/4] Revert unneeded changes

---
 comfy/ldm/wan/model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py
index 66017ed76a04..54616e6eb6e5 100644
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@@ -1510,7 +1510,7 @@ def __init__(self,
                  operations=None,
                  ):
 
-        super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, wan_attn_block_class=WanAttentionBlockAudio, image_model=image_model, device=device, dtype=dtype, operations=operations)
+        super().__init__(model_type='t2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, wan_attn_block_class=WanAttentionBlockAudio, image_model=image_model, device=device, dtype=dtype, operations=operations)
 
         self.audio_proj = AudioProjModel(seq_len=8, blocks=5, channels=1280, intermediate_dim=512, output_dim=1536, context_tokens=audio_token_num, dtype=dtype, device=device, operations=operations)
 
@@ -1548,7 +1548,7 @@ def forward_orig(
 
         # context
         context = self.text_embedding(context)
-        context_img_len = 0
+        context_img_len = None
 
         if audio_embed is not None:
             if reference_latent is not None:

From 701de1918f286cd52877a38365ed01ba1ca3768f Mon Sep 17 00:00:00 2001
From: ozbayb <17261091+ozbayb@users.noreply.github.com>
Date: Sat, 27 Sep 2025 17:07:49 -0600
Subject: [PATCH 4/4] Revert another unneeded change

---
 comfy/supported_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 399ea8b582b4..4064bdae1052 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1080,7 +1080,7 @@ class WAN21_HuMo(WAN21_T2V):
     }
 
     def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.WAN21_HuMo(self, image_to_video=True, device=device)
+        out = model_base.WAN21_HuMo(self, image_to_video=False, device=device)
         return out
 
 class WAN22_S2V(WAN21_T2V):
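
Note on the padding logic in this series: PATCH 1 front-pads 16-channel reference latents with zeros inside forward_orig so they match the 36 input channels of the I2V-style patch embedding (padding placed first, like c_concat), and PATCH 2 relocates that handling into model_base.py, keying the 1.7B/14B split on the latent's channel count. Below is a minimal standalone sketch of that padding step; the helper name and the example latent shape are illustrative assumptions, not ComfyUI API.

import torch

def pad_reference_latent(ref: torch.Tensor, in_dim: int = 36) -> torch.Tensor:
    """Zero-pad a [B, C, T, H, W] latent at the front of the channel axis,
    mirroring the c_concat layout (padding channels first, latent last)."""
    missing = in_dim - ref.shape[1]
    if missing <= 0:
        return ref  # already matches the model's input width
    pad = torch.zeros(ref.shape[0], missing, *ref.shape[2:],
                      device=ref.device, dtype=ref.dtype)
    return torch.cat([pad, ref], dim=1)

ref = torch.randn(1, 16, 1, 60, 104)  # a 16-channel reference latent (1.7B-style), assumed shape
out = pad_reference_latent(ref)       # widened for a 36-channel I2V-style patch embedding
print(out.shape)                      # torch.Size([1, 36, 1, 60, 104])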