Skip to content

Commit c9b79e6

Browse files
committed
Merge branch 'concedo_experimental' into esocrok
2 parents 0cba282 + 539db70 commit c9b79e6

File tree

3 files changed

+34
-27
lines changed

3 files changed

+34
-27
lines changed

koboldcpp.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@
6969
extra_images_max = 4
7070

7171
# global vars
72-
KcppVersion = "1.99.4"
72+
KcppVersion = "1.100"
7373
showdebug = True
7474
kcpp_instance = None #global running instance
7575
global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False, "restart_model": "", "currentConfig": None, "modelOverride": None, "currentModel": None}
@@ -1731,7 +1731,7 @@ def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl
17311731
inputs.flash_attention = args.sdflashattention
17321732
inputs.offload_cpu = args.sdoffloadcpu
17331733
inputs.vae_cpu = args.sdvaecpu
1734-
inputs.clip_cpu = args.sdclipcpu
1734+
inputs.clip_cpu = False if args.sdclipgpu else True
17351735
sdconvdirect = sd_convdirect_option(args.sdconvdirect)
17361736
inputs.diffusion_conv_direct = sdconvdirect == 'full'
17371737
inputs.vae_conv_direct = sdconvdirect in ['vaeonly', 'full']
@@ -5718,7 +5718,7 @@ def hide_tooltip(event):
57185718
sd_flash_attention_var = ctk.IntVar(value=0)
57195719
sd_offload_cpu_var = ctk.IntVar(value=0)
57205720
sd_vae_cpu_var = ctk.IntVar(value=0)
5721-
sd_clip_cpu_var = ctk.IntVar(value=0)
5721+
sd_clip_gpu_var = ctk.IntVar(value=0)
57225722
sd_vaeauto_var = ctk.IntVar(value=0)
57235723
sd_tiled_vae_var = ctk.StringVar(value=str(default_vae_tile_threshold))
57245724
sd_convdirect_var = ctk.StringVar(value=str(sd_convdirect_choices[0]))
@@ -6503,7 +6503,7 @@ def toggletaesd(a,b,c):
65036503
makecheckbox(images_tab, "SD Flash Attention", sd_flash_attention_var, 44,padx=230, tooltiptxt="Enable Flash Attention for image diffusion. May save memory or improve performance.")
65046504
makecheckbox(images_tab, "Model CPU Offload", sd_offload_cpu_var, 50,padx=8, tooltiptxt="Offload image weights in RAM to save VRAM, swap into VRAM when needed.")
65056505
makecheckbox(images_tab, "VAE on CPU", sd_vae_cpu_var, 50,padx=160, tooltiptxt="Force VAE to CPU only for image generation.")
6506-
makecheckbox(images_tab, "CLIP on CPU", sd_clip_cpu_var, 50,padx=280, tooltiptxt="Force CLIP to CPU only for image generation.")
6506+
makecheckbox(images_tab, "CLIP on GPU", sd_clip_gpu_var, 50,padx=280, tooltiptxt="Put CLIP and T5 to GPU for image generation. Otherwise, CLIP will use CPU.")
65076507

65086508
# audio tab
65096509
audio_tab = tabcontent["Audio"]
@@ -6751,8 +6751,8 @@ def export_vars():
67516751
args.sdoffloadcpu = True
67526752
if sd_vae_cpu_var.get()==1:
67536753
args.sdvaecpu = True
6754-
if sd_clip_cpu_var.get()==1:
6755-
args.sdclipcpu = True
6754+
if sd_clip_gpu_var.get()==1:
6755+
args.sdclipgpu = True
67566756
args.sdthreads = (0 if sd_threads_var.get()=="" else int(sd_threads_var.get()))
67576757
args.sdclamped = (0 if int(sd_clamped_var.get())<=0 else int(sd_clamped_var.get()))
67586758
args.sdclampedsoft = (0 if int(sd_clamped_soft_var.get())<=0 else int(sd_clamped_soft_var.get()))
@@ -6997,7 +6997,7 @@ def import_vars(dict):
69976997
sd_flash_attention_var.set(1 if ("sdflashattention" in dict and dict["sdflashattention"]) else 0)
69986998
sd_offload_cpu_var.set(1 if ("sdoffloadcpu" in dict and dict["sdoffloadcpu"]) else 0)
69996999
sd_vae_cpu_var.set(1 if ("sdvaecpu" in dict and dict["sdvaecpu"]) else 0)
7000-
sd_clip_cpu_var.set(1 if ("sdclipcpu" in dict and dict["sdclipcpu"]) else 0)
7000+
sd_clip_gpu_var.set(1 if ("sdclipgpu" in dict and dict["sdclipgpu"]) else 0)
70017001
sd_convdirect_var.set(sd_convdirect_option(dict.get("sdconvdirect")))
70027002
sd_vae_var.set(dict["sdvae"] if ("sdvae" in dict and dict["sdvae"]) else "")
70037003
sd_t5xxl_var.set(dict["sdt5xxl"] if ("sdt5xxl" in dict and dict["sdt5xxl"]) else "")
@@ -8868,7 +8868,7 @@ def range_checker(arg: str):
88688868
sdparsergroup.add_argument("--sdflashattention", help="Enables Flash Attention for image generation.", action='store_true')
88698869
sdparsergroup.add_argument("--sdoffloadcpu", help="Offload image weights in RAM to save VRAM, swap into VRAM when needed.", action='store_true')
88708870
sdparsergroup.add_argument("--sdvaecpu", help="Force VAE to CPU only for image generation.", action='store_true')
8871-
sdparsergroup.add_argument("--sdclipcpu", help="Force CLIP to CPU only for image generation.", action='store_true')
8871+
sdparsergroup.add_argument("--sdclipgpu", help="Put CLIP and T5 to GPU for image generation. Otherwise, CLIP will use CPU.", action='store_true')
88728872
sdparsergroup.add_argument("--sdconvdirect", help="Enables Conv2D Direct. May improve performance or reduce memory usage. Might crash if not supported by the backend. Can be 'off' (default) to disable, 'full' to turn it on for all operations, or 'vaeonly' to enable only for the VAE.", type=sd_convdirect_option, choices=sd_convdirect_choices, default=sd_convdirect_choices[0])
88738873
sdparsergroupvae = sdparsergroup.add_mutually_exclusive_group()
88748874
sdparsergroupvae.add_argument("--sdvae", metavar=('[filename]'), help="Specify an image generation safetensors VAE which replaces the one in the model.", default="")

otherarch/sdcpp/sdtype_adapter.cpp

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -679,17 +679,32 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
679679
}
680680

681681
std::vector<sd_image_t> reference_imgs;
682+
std::vector<sd_image_t> wan_imgs;
682683
bool is_wan = (loadedsdver == SDVersion::VERSION_WAN2 || loadedsdver == SDVersion::VERSION_WAN2_2_I2V || loadedsdver == SDVersion::VERSION_WAN2_2_TI2V);
683684
bool is_kontext = (loadedsdver==SDVersion::VERSION_FLUX && !loaded_model_is_chroma(sd_ctx));
684-
if(extra_image_data.size()>0 && (is_wan || is_kontext))
685+
if(extra_image_data.size()>0)
685686
{
686-
for(int i=0;i<extra_image_data.size();++i)
687+
if(is_kontext)
687688
{
688-
reference_imgs.push_back(extraimage_references[i]);
689+
for(int i=0;i<extra_image_data.size();++i)
690+
{
691+
reference_imgs.push_back(extraimage_references[i]);
692+
}
693+
if(!sd_is_quiet && sddebugmode==1)
694+
{
695+
printf("\nImage Gen: Using %d reference images\n",reference_imgs.size());
696+
}
689697
}
690-
if(!sd_is_quiet && sddebugmode==1)
698+
if(is_wan)
691699
{
692-
printf("\nImage Gen: Using %d reference images\n",reference_imgs.size());
700+
for(int i=0;i<extra_image_data.size();++i)
701+
{
702+
wan_imgs.push_back(extraimage_references[i]);
703+
}
704+
if(!sd_is_quiet && sddebugmode==1)
705+
{
706+
printf("\nImage Gen: Using %d video reference images\n",wan_imgs.size());
707+
}
693708
}
694709
}
695710

@@ -727,7 +742,6 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
727742

728743
params.ref_images = reference_imgs.data();
729744
params.ref_images_count = reference_imgs.size();
730-
731745
params.pm_params.id_images = photomaker_imgs.data();
732746
params.pm_params.id_images_count = photomaker_imgs.size();
733747

@@ -752,15 +766,15 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
752766
vid_gen_params.strength = params.strength;
753767
vid_gen_params.seed = params.seed;
754768
vid_gen_params.video_frames = vid_req_frames;
755-
if(reference_imgs.size()>0)
769+
if(wan_imgs.size()>0)
756770
{
757-
if(reference_imgs.size()>=1)
771+
if(wan_imgs.size()>=1)
758772
{
759-
vid_gen_params.init_image = reference_imgs[0];
773+
vid_gen_params.init_image = wan_imgs[0];
760774
}
761-
if(reference_imgs.size()>=2)
775+
if(wan_imgs.size()>=2)
762776
{
763-
vid_gen_params.end_image = reference_imgs[1];
777+
vid_gen_params.end_image = wan_imgs[1];
764778
}
765779
}
766780
if(!sd_is_quiet && sddebugmode==1)
@@ -775,7 +789,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
775789
<< "\nSTRENGTH:" << vid_gen_params.strength
776790
<< "\nFRAMES:" << vid_gen_params.video_frames
777791
<< "\nCTRL_FRM:" << vid_gen_params.control_frames_size
778-
<< "\nREF_IMGS:" << reference_imgs.size()
792+
<< "\nINIT_IMGS:" << wan_imgs.size()
779793
<< "\n\n";
780794
printf("%s", ss.str().c_str());
781795
}

otherarch/sdcpp/stable-diffusion.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -401,17 +401,10 @@ class StableDiffusionGGML {
401401
use_t5xxl = true;
402402
}
403403
if (!clip_on_cpu && !ggml_backend_is_cpu(backend) && use_t5xxl) {
404-
#if 0 // kcpp
405404
LOG_WARN(
406405
"!!!It appears that you are using the T5 model. Some backends may encounter issues with it."
407406
"If you notice that the generated images are completely black,"
408407
"try running the T5 model on the CPU using the --clip-on-cpu parameter.");
409-
#else
410-
if (conditioner_wtype != GGML_TYPE_F32) {
411-
LOG_INFO("CLIP: Forcing CPU backend for T5");
412-
clip_on_cpu = true;
413-
}
414-
#endif
415408
}
416409
if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
417410
LOG_INFO("CLIP: Using CPU backend");

0 commit comments

Comments (0)