Skip to content

Commit f540890

Browse files
committed
Merge branch 'master' into assets-redo
2 parents 5f7e091 + e4c61d7 commit f540890

31 files changed

+585
-129
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ ComfyUI follows a weekly release cycle targeting Monday but this regularly chang
119119

120120
1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
121121
- Releases a new stable version (e.g., v0.7.0) roughly every week.
122+
- Starting from v0.4.0, patch versions will be used for fixes backported onto the current stable release.
123+
- Minor versions will be used for releases off the master branch.
124+
- Patch versions may still be used for releases on the master branch in cases where a backport would not make sense.
122125
- Commits outside of the stable release tags may be very unstable and break many custom nodes.
123126
- Serves as the foundation for the desktop release
124127

comfy/context_windows.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ def get_resized_cond(self, cond_in: list[dict], x_in: torch.Tensor, window: Inde
143143
# if multiple conds, split based on primary region
144144
if self.split_conds_to_windows and len(cond_in) > 1:
145145
region = window.get_region_index(len(cond_in))
146-
logging.info(f"Splitting conds to windows; using region {region} for window {window[0]}-{window[-1]} with center ratio {window.center_ratio:.3f}")
146+
logging.info(f"Splitting conds to windows; using region {region} for window {window.index_list[0]}-{window.index_list[-1]} with center ratio {window.center_ratio:.3f}")
147147
cond_in = [cond_in[region]]
148148
# cond object is a list containing a dict - outer list is irrelevant, so just loop through it
149149
for actual_cond in cond_in:

comfy/ldm/lumina/model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -625,7 +625,7 @@ def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, trans
625625
if pooled is not None:
626626
pooled = self.clip_text_pooled_proj(pooled)
627627
else:
628-
pooled = torch.zeros((1, self.clip_text_dim), device=x.device, dtype=x.dtype)
628+
pooled = torch.zeros((x.shape[0], self.clip_text_dim), device=x.device, dtype=x.dtype)
629629

630630
adaln_input = self.time_text_embed(torch.cat((t, pooled), dim=-1))
631631

comfy/ldm/qwen_image/model.py

Lines changed: 39 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def apply_rotary_emb(x, freqs_cis):
6161

6262

6363
class QwenTimestepProjEmbeddings(nn.Module):
64-
def __init__(self, embedding_dim, pooled_projection_dim, dtype=None, device=None, operations=None):
64+
def __init__(self, embedding_dim, pooled_projection_dim, use_additional_t_cond=False, dtype=None, device=None, operations=None):
6565
super().__init__()
6666
self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000)
6767
self.timestep_embedder = TimestepEmbedding(
@@ -72,9 +72,19 @@ def __init__(self, embedding_dim, pooled_projection_dim, dtype=None, device=None
7272
operations=operations
7373
)
7474

75-
def forward(self, timestep, hidden_states):
75+
self.use_additional_t_cond = use_additional_t_cond
76+
if self.use_additional_t_cond:
77+
self.addition_t_embedding = operations.Embedding(2, embedding_dim, device=device, dtype=dtype)
78+
79+
def forward(self, timestep, hidden_states, addition_t_cond=None):
7680
timesteps_proj = self.time_proj(timestep)
7781
timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype))
82+
83+
if self.use_additional_t_cond:
84+
if addition_t_cond is None:
85+
addition_t_cond = torch.zeros((timesteps_emb.shape[0]), device=timesteps_emb.device, dtype=torch.long)
86+
timesteps_emb += self.addition_t_embedding(addition_t_cond, out_dtype=timesteps_emb.dtype)
87+
7888
return timesteps_emb
7989

8090

@@ -320,11 +330,11 @@ def __init__(
320330
num_attention_heads: int = 24,
321331
joint_attention_dim: int = 3584,
322332
pooled_projection_dim: int = 768,
323-
guidance_embeds: bool = False,
324333
axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
325334
default_ref_method="index",
326335
image_model=None,
327336
final_layer=True,
337+
use_additional_t_cond=False,
328338
dtype=None,
329339
device=None,
330340
operations=None,
@@ -342,6 +352,7 @@ def __init__(
342352
self.time_text_embed = QwenTimestepProjEmbeddings(
343353
embedding_dim=self.inner_dim,
344354
pooled_projection_dim=pooled_projection_dim,
355+
use_additional_t_cond=use_additional_t_cond,
345356
dtype=dtype,
346357
device=device,
347358
operations=operations
@@ -375,36 +386,42 @@ def process_img(self, x, index=0, h_offset=0, w_offset=0):
375386
patch_size = self.patch_size
376387
hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (1, self.patch_size, self.patch_size))
377388
orig_shape = hidden_states.shape
378-
hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2)
379-
hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5)
380-
hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4)
389+
hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-3], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2)
390+
hidden_states = hidden_states.permute(0, 2, 3, 5, 1, 4, 6)
391+
hidden_states = hidden_states.reshape(orig_shape[0], orig_shape[-3] * (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4)
392+
t_len = t
381393
h_len = ((h + (patch_size // 2)) // patch_size)
382394
w_len = ((w + (patch_size // 2)) // patch_size)
383395

384396
h_offset = ((h_offset + (patch_size // 2)) // patch_size)
385397
w_offset = ((w_offset + (patch_size // 2)) // patch_size)
386398

387-
img_ids = torch.zeros((h_len, w_len, 3), device=x.device)
388-
img_ids[:, :, 0] = img_ids[:, :, 1] + index
389-
img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - (h_len // 2)
390-
img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - (w_len // 2)
391-
return hidden_states, repeat(img_ids, "h w c -> b (h w) c", b=bs), orig_shape
399+
img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device)
400+
401+
if t_len > 1:
402+
img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).unsqueeze(1).unsqueeze(1)
403+
else:
404+
img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + index
405+
406+
img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1).unsqueeze(0) - (h_len // 2)
407+
img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0).unsqueeze(0) - (w_len // 2)
408+
return hidden_states, repeat(img_ids, "t h w c -> b (t h w) c", b=bs), orig_shape
392409

393-
def forward(self, x, timestep, context, attention_mask=None, guidance=None, ref_latents=None, transformer_options={}, **kwargs):
410+
def forward(self, x, timestep, context, attention_mask=None, ref_latents=None, additional_t_cond=None, transformer_options={}, **kwargs):
394411
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
395412
self._forward,
396413
self,
397414
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
398-
).execute(x, timestep, context, attention_mask, guidance, ref_latents, transformer_options, **kwargs)
415+
).execute(x, timestep, context, attention_mask, ref_latents, additional_t_cond, transformer_options, **kwargs)
399416

400417
def _forward(
401418
self,
402419
x,
403420
timesteps,
404421
context,
405422
attention_mask=None,
406-
guidance: torch.Tensor = None,
407423
ref_latents=None,
424+
additional_t_cond=None,
408425
transformer_options={},
409426
control=None,
410427
**kwargs
@@ -423,12 +440,17 @@ def _forward(
423440
index = 0
424441
ref_method = kwargs.get("ref_latents_method", self.default_ref_method)
425442
index_ref_method = (ref_method == "index") or (ref_method == "index_timestep_zero")
443+
negative_ref_method = ref_method == "negative_index"
426444
timestep_zero = ref_method == "index_timestep_zero"
427445
for ref in ref_latents:
428446
if index_ref_method:
429447
index += 1
430448
h_offset = 0
431449
w_offset = 0
450+
elif negative_ref_method:
451+
index -= 1
452+
h_offset = 0
453+
w_offset = 0
432454
else:
433455
index = 1
434456
h_offset = 0
@@ -458,14 +480,7 @@ def _forward(
458480
encoder_hidden_states = self.txt_norm(encoder_hidden_states)
459481
encoder_hidden_states = self.txt_in(encoder_hidden_states)
460482

461-
if guidance is not None:
462-
guidance = guidance * 1000
463-
464-
temb = (
465-
self.time_text_embed(timestep, hidden_states)
466-
if guidance is None
467-
else self.time_text_embed(timestep, guidance, hidden_states)
468-
)
483+
temb = self.time_text_embed(timestep, hidden_states, additional_t_cond)
469484

470485
patches_replace = transformer_options.get("patches_replace", {})
471486
patches = transformer_options.get("patches", {})
@@ -513,6 +528,6 @@ def block_wrap(args):
513528
hidden_states = self.norm_out(hidden_states, temb)
514529
hidden_states = self.proj_out(hidden_states)
515530

516-
hidden_states = hidden_states[:, :num_embeds].view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2)
517-
hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5)
531+
hidden_states = hidden_states[:, :num_embeds].view(orig_shape[0], orig_shape[-3], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2)
532+
hidden_states = hidden_states.permute(0, 4, 1, 2, 5, 3, 6)
518533
return hidden_states.reshape(orig_shape)[:, :, :, :x.shape[-2], :x.shape[-1]]

comfy/model_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1110,7 +1110,7 @@ def extra_conds(self, **kwargs):
11101110
if 'num_tokens' not in out:
11111111
out['num_tokens'] = comfy.conds.CONDConstant(cross_attn.shape[1])
11121112

1113-
clip_text_pooled = kwargs["pooled_output"] # Newbie
1113+
clip_text_pooled = kwargs.get("pooled_output", None) # NewBie
11141114
if clip_text_pooled is not None:
11151115
out['clip_text_pooled'] = comfy.conds.CONDRegular(clip_text_pooled)
11161116

comfy/model_detection.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,8 +430,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
430430
dit_config["rope_theta"] = 10000.0
431431
dit_config["ffn_dim_multiplier"] = 4.0
432432
ctd_weight = state_dict.get('{}clip_text_pooled_proj.0.weight'.format(key_prefix), None)
433-
if ctd_weight is not None:
433+
if ctd_weight is not None: # NewBie
434434
dit_config["clip_text_dim"] = ctd_weight.shape[0]
435+
# NewBie also sets axes_lens = [1024, 512, 512] but it's not used in ComfyUI
435436
elif dit_config["dim"] == 3840: # Z image
436437
dit_config["n_heads"] = 30
437438
dit_config["n_kv_heads"] = 30
@@ -620,6 +621,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
620621
dit_config["num_layers"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.')
621622
if "{}__index_timestep_zero__".format(key_prefix) in state_dict_keys: # 2511
622623
dit_config["default_ref_method"] = "index_timestep_zero"
624+
if "{}time_text_embed.addition_t_embedding.weight".format(key_prefix) in state_dict_keys: # Layered
625+
dit_config["use_additional_t_cond"] = True
626+
dit_config["default_ref_method"] = "negative_index"
623627
return dit_config
624628

625629
if '{}visual_transformer_blocks.0.cross_attention.key_norm.weight'.format(key_prefix) in state_dict_keys: # Kandinsky 5

comfy/model_management.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import platform
2727
import weakref
2828
import gc
29+
import os
2930

3031
class VRAMState(Enum):
3132
DISABLED = 0 #No vram present: no need to move models to vram
@@ -333,13 +334,15 @@ def amd_min_version(device=None, min_rdna_version=0):
333334
SUPPORT_FP8_OPS = args.supports_fp8_compute
334335

335336
AMD_RDNA2_AND_OLDER_ARCH = ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]
337+
AMD_ENABLE_MIOPEN_ENV = 'COMFYUI_ENABLE_MIOPEN'
336338

337339
try:
338340
if is_amd():
339341
arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
340342
if not (any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH)):
341-
torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD
342-
logging.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.")
343+
if os.getenv(AMD_ENABLE_MIOPEN_ENV) != '1':
344+
torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD
345+
logging.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.")
343346

344347
try:
345348
rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2]))

comfy/samplers.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -984,9 +984,6 @@ def outer_sample(self, noise, latent_image, sampler, sigmas, denoise_mask=None,
984984
self.inner_model, self.conds, self.loaded_models = comfy.sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options)
985985
device = self.model_patcher.load_device
986986

987-
if denoise_mask is not None:
988-
denoise_mask = comfy.sampler_helpers.prepare_mask(denoise_mask, noise.shape, device)
989-
990987
noise = noise.to(device)
991988
latent_image = latent_image.to(device)
992989
sigmas = sigmas.to(device)
@@ -1013,6 +1010,24 @@ def sample(self, noise, latent_image, sampler, sigmas, denoise_mask=None, callba
10131010
else:
10141011
latent_shapes = [latent_image.shape]
10151012

1013+
if denoise_mask is not None:
1014+
if denoise_mask.is_nested:
1015+
denoise_masks = denoise_mask.unbind()
1016+
denoise_masks = denoise_masks[:len(latent_shapes)]
1017+
else:
1018+
denoise_masks = [denoise_mask]
1019+
1020+
for i in range(len(denoise_masks), len(latent_shapes)):
1021+
denoise_masks.append(torch.ones(latent_shapes[i]))
1022+
1023+
for i in range(len(denoise_masks)):
1024+
denoise_masks[i] = comfy.sampler_helpers.prepare_mask(denoise_masks[i], latent_shapes[i], self.model_patcher.load_device)
1025+
1026+
if len(denoise_masks) > 1:
1027+
denoise_mask, _ = comfy.utils.pack_latents(denoise_masks)
1028+
else:
1029+
denoise_mask = denoise_masks[0]
1030+
10161031
self.conds = {}
10171032
for k in self.original_conds:
10181033
self.conds[k] = list(map(lambda a: a.copy(), self.original_conds[k]))

comfy/sd.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@
5555
import comfy.text_encoders.z_image
5656
import comfy.text_encoders.ovis
5757
import comfy.text_encoders.kandinsky5
58+
import comfy.text_encoders.jina_clip_2
59+
import comfy.text_encoders.newbie
5860

5961
import comfy.model_patcher
6062
import comfy.lora
@@ -1008,6 +1010,7 @@ class CLIPType(Enum):
10081010
OVIS = 21
10091011
KANDINSKY5 = 22
10101012
KANDINSKY5_IMAGE = 23
1013+
NEWBIE = 24
10111014

10121015

10131016
def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
@@ -1038,6 +1041,7 @@ class TEModel(Enum):
10381041
MISTRAL3_24B_PRUNED_FLUX2 = 15
10391042
QWEN3_4B = 16
10401043
QWEN3_2B = 17
1044+
JINA_CLIP_2 = 18
10411045

10421046

10431047
def detect_te_model(sd):
@@ -1047,6 +1051,8 @@ def detect_te_model(sd):
10471051
return TEModel.CLIP_H
10481052
if "text_model.encoder.layers.0.mlp.fc1.weight" in sd:
10491053
return TEModel.CLIP_L
1054+
if "model.encoder.layers.0.mixer.Wqkv.weight" in sd:
1055+
return TEModel.JINA_CLIP_2
10501056
if "encoder.block.23.layer.1.DenseReluDense.wi_1.weight" in sd:
10511057
weight = sd["encoder.block.23.layer.1.DenseReluDense.wi_1.weight"]
10521058
if weight.shape[-1] == 4096:
@@ -1207,6 +1213,9 @@ class EmptyClass:
12071213
elif te_model == TEModel.QWEN3_2B:
12081214
clip_target.clip = comfy.text_encoders.ovis.te(**llama_detect(clip_data))
12091215
clip_target.tokenizer = comfy.text_encoders.ovis.OvisTokenizer
1216+
elif te_model == TEModel.JINA_CLIP_2:
1217+
clip_target.clip = comfy.text_encoders.jina_clip_2.JinaClip2TextModelWrapper
1218+
clip_target.tokenizer = comfy.text_encoders.jina_clip_2.JinaClip2TokenizerWrapper
12101219
else:
12111220
# clip_l
12121221
if clip_type == CLIPType.SD3:
@@ -1262,6 +1271,17 @@ class EmptyClass:
12621271
elif clip_type == CLIPType.KANDINSKY5_IMAGE:
12631272
clip_target.clip = comfy.text_encoders.kandinsky5.te(**llama_detect(clip_data))
12641273
clip_target.tokenizer = comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage
1274+
elif clip_type == CLIPType.NEWBIE:
1275+
clip_target.clip = comfy.text_encoders.newbie.te(**llama_detect(clip_data))
1276+
clip_target.tokenizer = comfy.text_encoders.newbie.NewBieTokenizer
1277+
if "model.layers.0.self_attn.q_norm.weight" in clip_data[0]:
1278+
clip_data_gemma = clip_data[0]
1279+
clip_data_jina = clip_data[1]
1280+
else:
1281+
clip_data_gemma = clip_data[1]
1282+
clip_data_jina = clip_data[0]
1283+
tokenizer_data["gemma_spiece_model"] = clip_data_gemma.get("spiece_model", None)
1284+
tokenizer_data["jina_spiece_model"] = clip_data_jina.get("spiece_model", None)
12651285
else:
12661286
clip_target.clip = sdxl_clip.SDXLClipModel
12671287
clip_target.tokenizer = sdxl_clip.SDXLTokenizer

comfy/sd1_clip.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No
466466
return embed_out
467467

468468
class SDTokenizer:
469-
def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, pad_left=False, tokenizer_data={}, tokenizer_args={}):
469+
def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, pad_left=False, disable_weights=False, tokenizer_data={}, tokenizer_args={}):
470470
if tokenizer_path is None:
471471
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
472472
self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args)
@@ -513,6 +513,8 @@ def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedd
513513
self.embedding_size = embedding_size
514514
self.embedding_key = embedding_key
515515

516+
self.disable_weights = disable_weights
517+
516518
def _try_get_embedding(self, embedding_name:str):
517519
'''
518520
Takes a potential embedding name and tries to retrieve it.
@@ -547,7 +549,7 @@ def tokenize_with_weights(self, text:str, return_word_ids=False, tokenizer_optio
547549
min_padding = tokenizer_options.get("{}_min_padding".format(self.embedding_key), self.min_padding)
548550

549551
text = escape_important(text)
550-
if kwargs.get("disable_weights", False):
552+
if kwargs.get("disable_weights", self.disable_weights):
551553
parsed_weights = [(text, 1.0)]
552554
else:
553555
parsed_weights = token_weights(text, 1.0)

0 commit comments

Comments
 (0)