 import comfy.text_encoders.omnigen2
 import comfy.text_encoders.qwen_image
 import comfy.text_encoders.hunyuan_image
+import comfy.text_encoders.kandinsky5
 import comfy.text_encoders.z_image

 from . import supported_models_base
@@ -1474,7 +1475,60 @@ def clip_target(self, state_dict={}):
         hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
         return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))

-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2]

+class Kandinsky5(supported_models_base.BASE):
+    unet_config = {
+        "image_model": "kandinsky5",
+    }
+
+    sampling_settings = {
+        "shift": 10.0,
+    }
+
+    unet_extra_config = {}
+    latent_format = latent_formats.HunyuanVideo
+
+    memory_usage_factor = 1.1 #TODO
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.Kandinsky5(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        pref = self.text_encoder_key_prefix[0]
+        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5Tokenizer, comfy.text_encoders.kandinsky5.te(**hunyuan_detect))
+
+
+class Kandinsky5Image(Kandinsky5):
+    unet_config = {
+        "image_model": "kandinsky5",
+        "model_dim": 2560,
+        "visual_embed_dim": 64,
+    }
+
+    sampling_settings = {
+        "shift": 3.0,
+    }
+
+    latent_format = latent_formats.Flux
+    memory_usage_factor = 1.1 #TODO
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.Kandinsky5Image(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        pref = self.text_encoder_key_prefix[0]
+        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage, comfy.text_encoders.kandinsky5.te(**hunyuan_detect))
+
+
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5]

 models += [SVD_img2vid]
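
Note on the ordering in models: Kandinsky5Image is inserted ahead of Kandinsky5. If detection checks candidate classes in list order and accepts the first one whose unet_config keys are all present with the same values in the detected UNet config (my reading of the matching in comfy.model_detection / supported_models_base, not quoted from it), then the base class that only requires "image_model": "kandinsky5" would also match image checkpoints, so the more specific image config has to come first. A minimal, self-contained sketch of that subset-matching idea, with illustrative names only:

def matches(required, detected):
    # A candidate matches when every key it requires is present in the
    # detected config with the same value (a subset check).
    return all(detected.get(k) == v for k, v in required.items())

KANDINSKY5 = {"image_model": "kandinsky5"}
KANDINSKY5_IMAGE = {"image_model": "kandinsky5", "model_dim": 2560, "visual_embed_dim": 64}

detected = {"image_model": "kandinsky5", "model_dim": 2560, "visual_embed_dim": 64}

# Walk candidates in list order and take the first match, mirroring the
# Kandinsky5Image -> Kandinsky5 ordering in the models list above.
for name, required in (("Kandinsky5Image", KANDINSKY5_IMAGE), ("Kandinsky5", KANDINSKY5)):
    if matches(required, detected):
        print(name)  # prints "Kandinsky5Image"; the reversed order would print "Kandinsky5"
        break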