
Commit e2bc630

Merge branch 'master' into v3-improvements
2 parents: f0e0a19 + 7ac7d69

21 files changed: +1171 −295 lines

README.md
Lines changed: 26 additions & 0 deletions

@@ -320,6 +320,32 @@ For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step
 1. Install the Iluvatar Corex Toolkit by adhering to the platform-specific instructions on the [Installation](https://support.iluvatar.com/#/DocumentCentre?id=1&nameCenter=2&productId=520117912052801536)
 2. Launch ComfyUI by running `python main.py`

+
+## [ComfyUI-Manager](https://github.com/Comfy-Org/ComfyUI-Manager/tree/manager-v4)
+
+**ComfyUI-Manager** is an extension that allows you to easily install, update, and manage custom nodes for ComfyUI.
+
+### Setup
+
+1. Install the manager dependencies:
+```bash
+pip install -r manager_requirements.txt
+```
+
+2. Enable the manager with the `--enable-manager` flag when running ComfyUI:
+```bash
+python main.py --enable-manager
+```
+
+### Command Line Options
+
+| Flag | Description |
+|------|-------------|
+| `--enable-manager` | Enable ComfyUI-Manager |
+| `--enable-manager-legacy-ui` | Use the legacy manager UI instead of the new UI (requires `--enable-manager`) |
+| `--disable-manager-ui` | Disable the manager UI and endpoints while keeping background features like security checks and scheduled installation completion (requires `--enable-manager`) |
+
+
 # Running

 ```python main.py```
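
These flags compose: a headless server deployment, for example, might want the manager's background features without its UI. A minimal sketch using only the flags documented in the table above:

```bash
# Assumed invocation: manager enabled, UI and endpoints disabled, background
# security checks and scheduled installs still active (per the flag table).
python main.py --enable-manager --disable-manager-ui
```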

comfy/ldm/kandinsky5/model.py
Lines changed: 407 additions & 0 deletions
(Large diffs are not rendered by default.)

comfy/lora.py
Lines changed: 7 additions & 0 deletions

@@ -322,6 +322,13 @@ def model_lora_keys_unet(model, key_map={}):
             key_map["diffusion_model.{}".format(key_lora)] = to
             key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = to

+    if isinstance(model, comfy.model_base.Kandinsky5):
+        for k in sdk:
+            if k.startswith("diffusion_model.") and k.endswith(".weight"):
+                key_lora = k[len("diffusion_model."):-len(".weight")]
+                key_map["{}".format(key_lora)] = k
+                key_map["transformer.{}".format(key_lora)] = k
+
     return key_map

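
The new branch registers two aliases per weight, so LoRA files using either naming convention resolve to the same state-dict key. A quick illustration with one hypothetical Kandinsky 5 key (the block name is made up; real keys come from the loaded model's state dict):

```python
# Hypothetical key, for illustration only.
k = "diffusion_model.visual_transformer_blocks.0.self_attn.to_q.weight"
key_lora = k[len("diffusion_model."):-len(".weight")]
# key_lora == "visual_transformer_blocks.0.self_attn.to_q"
# key_map then maps both spellings to the full key k:
#   "visual_transformer_blocks.0.self_attn.to_q"             -> k
#   "transformer.visual_transformer_blocks.0.self_attn.to_q" -> k
```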

comfy/model_base.py
Lines changed: 47 additions & 0 deletions

@@ -47,6 +47,7 @@
 import comfy.ldm.ace.model
 import comfy.ldm.omnigen.omnigen2
 import comfy.ldm.qwen_image.model
+import comfy.ldm.kandinsky5.model

 import comfy.model_management
 import comfy.patcher_extension

@@ -1630,3 +1631,49 @@ def extra_conds(self, **kwargs):
         out = super().extra_conds(**kwargs)
         out['disable_time_r'] = comfy.conds.CONDConstant(False)
         return out
+
+class Kandinsky5(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.kandinsky5.model.Kandinsky5)
+
+    def encode_adm(self, **kwargs):
+        return kwargs["pooled_output"]
+
+    def concat_cond(self, **kwargs):
+        noise = kwargs.get("noise", None)
+        device = kwargs["device"]
+        image = torch.zeros_like(noise)
+
+        mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
+        if mask is None:
+            mask = torch.zeros_like(noise)[:, :1]
+        else:
+            mask = 1.0 - mask
+            mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
+            if mask.shape[-3] < noise.shape[-3]:
+                mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0)
+            mask = utils.resize_to_batch_size(mask, noise.shape[0])
+
+        return torch.cat((image, mask), dim=1)
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+
+        time_dim_replace = kwargs.get("time_dim_replace", None)
+        if time_dim_replace is not None:
+            out['time_dim_replace'] = comfy.conds.CONDRegular(self.process_latent_in(time_dim_replace))
+
+        return out
+
+class Kandinsky5Image(Kandinsky5):
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device)
+
+    def concat_cond(self, **kwargs):
+        return None
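
In `concat_cond`, the model concatenates a zeroed image latent with an inverted, spatially resized inpaint mask; when the mask covers fewer latent frames than the noise, it is zero-padded along the time axis. `torch.nn.functional.pad` reads its pad tuple in pairs from the last dimension backwards (W, H, then T), so `(0, 0, 0, 0, 0, n)` appends `n` zero frames. A toy sketch with made-up shapes:

```python
import torch
import torch.nn.functional as F

# Made-up latent shapes, just to illustrate the temporal padding step above.
noise = torch.zeros(1, 16, 8, 32, 32)  # [B, C, T, H, W] video latent
mask = torch.ones(1, 1, 5, 32, 32)     # mask known for the first 5 frames only

pad_t = noise.shape[-3] - mask.shape[-3]             # 3 frames short
mask = F.pad(mask, (0, 0, 0, 0, 0, pad_t), value=0)  # pad T (dim -3) at the end
print(mask.shape)  # torch.Size([1, 1, 8, 32, 32])
```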

comfy/model_detection.py
Lines changed: 18 additions & 0 deletions

@@ -611,6 +611,24 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         dit_config["num_layers"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.')
         return dit_config

+    if '{}visual_transformer_blocks.0.cross_attention.key_norm.weight'.format(key_prefix) in state_dict_keys: # Kandinsky 5
+        dit_config = {}
+        model_dim = state_dict['{}visual_embeddings.in_layer.bias'.format(key_prefix)].shape[0]
+        dit_config["model_dim"] = model_dim
+        if model_dim in [4096, 2560]: # pro video and lite image
+            dit_config["axes_dims"] = (32, 48, 48)
+            if model_dim == 2560: # lite image
+                dit_config["rope_scale_factor"] = (1.0, 1.0, 1.0)
+        elif model_dim == 1792: # lite video
+            dit_config["axes_dims"] = (16, 24, 24)
+        dit_config["time_dim"] = state_dict['{}time_embeddings.in_layer.bias'.format(key_prefix)].shape[0]
+        dit_config["image_model"] = "kandinsky5"
+        dit_config["ff_dim"] = state_dict['{}visual_transformer_blocks.0.feed_forward.in_layer.weight'.format(key_prefix)].shape[0]
+        dit_config["visual_embed_dim"] = state_dict['{}visual_embeddings.in_layer.weight'.format(key_prefix)].shape[1]
+        dit_config["num_text_blocks"] = count_blocks(state_dict_keys, '{}text_transformer_blocks.'.format(key_prefix) + '{}.')
+        dit_config["num_visual_blocks"] = count_blocks(state_dict_keys, '{}visual_transformer_blocks.'.format(key_prefix) + '{}.')
+        return dit_config
+
     if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
         return None
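
Detection is purely shape-based: the hidden size of `visual_embeddings.in_layer` selects the variant. Condensed from the branch above (sizes and the pro/lite labels come straight from the diff's inline comments):

```python
# Condensed restatement of the detection branch above.
def kandinsky5_axes_dims(model_dim: int) -> tuple:
    if model_dim in (4096, 2560):  # pro video (4096) or lite image (2560)
        return (32, 48, 48)
    if model_dim == 1792:          # lite video
        return (16, 24, 24)
    raise ValueError(f"unrecognized Kandinsky 5 model_dim: {model_dim}")
```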

comfy/sd.py
Lines changed: 11 additions & 0 deletions

@@ -54,6 +54,7 @@
 import comfy.text_encoders.hunyuan_image
 import comfy.text_encoders.z_image
 import comfy.text_encoders.ovis
+import comfy.text_encoders.kandinsky5

 import comfy.model_patcher
 import comfy.lora

@@ -766,6 +767,8 @@ def decode(self, samples_in, vae_options={}):
         self.throw_exception_if_invalid()
         pixel_samples = None
         do_tile = False
+        if self.latent_dim == 2 and samples_in.ndim == 5:
+            samples_in = samples_in[:, :, 0]
         try:
             memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
             model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)

@@ -983,6 +986,8 @@ class CLIPType(Enum):
     HUNYUAN_IMAGE = 19
     HUNYUAN_VIDEO_15 = 20
     OVIS = 21
+    KANDINSKY5 = 22
+    KANDINSKY5_IMAGE = 23


 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):

@@ -1231,6 +1236,12 @@ class EmptyClass:
         elif clip_type == CLIPType.HUNYUAN_VIDEO_15:
             clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data))
             clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer
+        elif clip_type == CLIPType.KANDINSKY5:
+            clip_target.clip = comfy.text_encoders.kandinsky5.te(**llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.kandinsky5.Kandinsky5Tokenizer
+        elif clip_type == CLIPType.KANDINSKY5_IMAGE:
+            clip_target.clip = comfy.text_encoders.kandinsky5.te(**llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage
         else:
             clip_target.clip = sdxl_clip.SDXLClipModel
             clip_target.tokenizer = sdxl_clip.SDXLTokenizer
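
With the two enum values wired into `load_clip`, loading the Kandinsky 5 text encoders might look like the sketch below. The checkpoint filenames are hypothetical; the dual-checkpoint setup (Qwen2.5-VL 7B plus CLIP-L) follows the tokenizer pairing in comfy/text_encoders/kandinsky5.py:

```python
import comfy.sd

# Hypothetical paths; dtype/fp8 detection happens inside load_clip.
clip = comfy.sd.load_clip(
    ckpt_paths=[
        "models/text_encoders/qwen_2.5_vl_7b.safetensors",
        "models/text_encoders/clip_l.safetensors",
    ],
    clip_type=comfy.sd.CLIPType.KANDINSKY5,  # KANDINSKY5_IMAGE for the image model
)
```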

comfy/supported_models.py
Lines changed: 55 additions & 1 deletion

@@ -21,6 +21,7 @@
 import comfy.text_encoders.omnigen2
 import comfy.text_encoders.qwen_image
 import comfy.text_encoders.hunyuan_image
+import comfy.text_encoders.kandinsky5
 import comfy.text_encoders.z_image

 from . import supported_models_base

@@ -1474,7 +1475,60 @@ def clip_target(self, state_dict={}):
         hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
         return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))

-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2]

+class Kandinsky5(supported_models_base.BASE):
+    unet_config = {
+        "image_model": "kandinsky5",
+    }
+
+    sampling_settings = {
+        "shift": 10.0,
+    }
+
+    unet_extra_config = {}
+    latent_format = latent_formats.HunyuanVideo
+
+    memory_usage_factor = 1.1 #TODO
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.Kandinsky5(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        pref = self.text_encoder_key_prefix[0]
+        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5Tokenizer, comfy.text_encoders.kandinsky5.te(**hunyuan_detect))
+
+
+class Kandinsky5Image(Kandinsky5):
+    unet_config = {
+        "image_model": "kandinsky5",
+        "model_dim": 2560,
+        "visual_embed_dim": 64,
+    }
+
+    sampling_settings = {
+        "shift": 3.0,
+    }
+
+    latent_format = latent_formats.Flux
+    memory_usage_factor = 1.1 #TODO
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.Kandinsky5Image(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        pref = self.text_encoder_key_prefix[0]
+        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage, comfy.text_encoders.kandinsky5.te(**hunyuan_detect))
+
+
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5]

 models += [SVD_img2vid]
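
Note the ordering in the new models list: Kandinsky5Image comes before Kandinsky5. Configs are matched first-hit against the detected unet_config, and the image variant's dict is a strict superset of the video variant's, so it must be checked first. A rough sketch of that first-match rule (the real matcher lives in comfy/model_detection.py and does more):

```python
# Rough first-match sketch; assumes matching means "every pinned key agrees".
def first_match(detected: dict, candidates: list):
    for cfg_class in candidates:
        if all(detected.get(k) == v for k, v in cfg_class.unet_config.items()):
            return cfg_class
    return None

# With only {"image_model": "kandinsky5"} pinned, Kandinsky5 would also match
# image checkpoints, which is why Kandinsky5Image is listed ahead of it.
```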

comfy/supported_models_base.py
Lines changed: 5 additions & 0 deletions

@@ -17,6 +17,7 @@
 """

 import torch
+import logging
 from . import model_base
 from . import utils
 from . import latent_formats

@@ -117,3 +118,7 @@ def process_vae_state_dict_for_saving(self, state_dict):
     def set_inference_dtype(self, dtype, manual_cast_dtype):
         self.unet_config['dtype'] = dtype
         self.manual_cast_dtype = manual_cast_dtype
+
+    def __getattr__(self, name):
+        logging.warning("\nWARNING, you accessed {} from the model config object which doesn't exist. Please fix your code.\n".format(name))
+        return None
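
Since `__getattr__` is only consulted when normal attribute lookup fails, this fallback turns what used to be an AttributeError into a logged warning plus None, presumably keeping older custom nodes running while flagging the bad access. A behavior sketch with a hypothetical subclass (the constructor argument is an assumption about BASE):

```python
from comfy import supported_models_base

class DummyConfig(supported_models_base.BASE):  # hypothetical subclass
    unet_config = {}

config = DummyConfig({})        # assumed: BASE takes the detected unet_config dict
value = config.no_such_field    # logs the warning instead of raising AttributeError
assert value is None
```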

comfy/text_encoders/kandinsky5.py
Lines changed: 68 additions & 0 deletions

@@ -0,0 +1,68 @@
+from comfy import sd1_clip
+from .qwen_image import QwenImageTokenizer, QwenImageTEModel
+from .llama import Qwen25_7BVLI
+
+
+class Kandinsky5Tokenizer(QwenImageTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+        self.llama_template = "<|im_start|>system\nYou are a prompt engineer. Describe the video in detail.\nDescribe how the camera moves or shakes, describe the zoom and view angle, whether it follows the objects.\nDescribe the location of the video, main characters or objects and their action.\nDescribe the dynamism of the video and presented actions.\nName the visual style of the video: whether it is a professional footage, user generated content, some kind of animation, video game or screen content.\nDescribe the visual effects, postprocessing and transitions if they are presented in the video.\nPay attention to the order of key actions shown in the scene.<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
+        self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+
+    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
+        out = super().tokenize_with_weights(text, return_word_ids, **kwargs)
+        out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids, **kwargs)
+
+        return out
+
+
+class Kandinsky5TokenizerImage(Kandinsky5Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+        self.llama_template = "<|im_start|>system\nYou are a prompt engineer. Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
+
+
+class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", layer="hidden", layer_idx=-1, dtype=None, attention_mask=True, model_options={}):
+        llama_scaled_fp8 = model_options.get("qwen_scaled_fp8", None)
+        if llama_scaled_fp8 is not None:
+            model_options = model_options.copy()
+            model_options["scaled_fp8"] = llama_scaled_fp8
+        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
+
+
+class Kandinsky5TEModel(QwenImageTEModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super(QwenImageTEModel, self).__init__(device=device, dtype=dtype, name="qwen25_7b", clip_model=Qwen25_7BVLIModel, model_options=model_options)
+        self.clip_l = sd1_clip.SDClipModel(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
+
+    def encode_token_weights(self, token_weight_pairs):
+        cond, p, extra = super().encode_token_weights(token_weight_pairs, template_end=-1)
+        l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs["l"])
+
+        return cond, l_pooled, extra
+
+    def set_clip_options(self, options):
+        super().set_clip_options(options)
+        self.clip_l.set_clip_options(options)
+
+    def reset_clip_options(self):
+        super().reset_clip_options()
+        self.clip_l.reset_clip_options()
+
+    def load_sd(self, sd):
+        if "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
+            return self.clip_l.load_sd(sd)
+        else:
+            return super().load_sd(sd)
+
+def te(dtype_llama=None, llama_scaled_fp8=None):
+    class Kandinsky5TEModel_(Kandinsky5TEModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
+                model_options = model_options.copy()
+                model_options["qwen_scaled_fp8"] = llama_scaled_fp8
+            if dtype_llama is not None:
+                dtype = dtype_llama
+            super().__init__(device=device, dtype=dtype, model_options=model_options)
+    return Kandinsky5TEModel_
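
The `te()` factory bakes the detected dtype and scaled-fp8 settings into a class, mirroring the other text encoders; comfy/sd.py normally calls it with values from `llama_detect()`. A minimal sketch of what it returns:

```python
# Minimal sketch: build the TE class with no detected overrides and instantiate it.
TEClass = te(dtype_llama=None, llama_scaled_fp8=None)
te_model = TEClass(device="cpu")

# Per encode_token_weights above, the model returns the Qwen2.5-VL sequence as
# cond, the CLIP-L pooled vector as the pooled output, plus any extra outputs:
# cond, pooled, extra = te_model.encode_token_weights(token_weight_pairs)
```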

comfy_api/latest/_io.py
Lines changed: 2 additions & 0 deletions

@@ -568,6 +568,8 @@ class PooledDict(TypedDict):
     '''Used by WAN Camera.'''
     time_dim_concat: NotRequired[torch.Tensor]
     '''Used by WAN Phantom Subject.'''
+    time_dim_replace: NotRequired[torch.Tensor]
+    '''Used by Kandinsky5 I2V.'''

 CondList = list[tuple[torch.Tensor, PooledDict]]
 Type = CondList
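
Because PooledDict is a TypedDict and the field is NotRequired, existing conds are unaffected; a Kandinsky 5 I2V node would simply attach the extra key. Hypothetical usage (both tensor names are placeholders):

```python
# Hypothetical: reference_latent would be the encoded start frame for I2V.
pooled: PooledDict = {
    "pooled_output": pooled_output,
    "time_dim_replace": reference_latent,
}
```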
