additional options for image generation (#1765)

wbruna · web-flow · commit c48999f7c0cb · 2025-10-05T23:36:20.000+08:00
* sd: add backend support for choosing the default sampler

* use the default sampler on the API

* sd: add backend support for the scheduler

* sd: add backend support for distilled guidance

* sd: add backend support for timestep-shift

* sd: add a config field to set default image gen options
diff --git a/expose.h b/expose.h
@@ -197,11 +197,14 @@ struct sd_generation_inputs
     const bool flip_mask = false;
     const float denoising_strength = 0.0f;
     const float cfg_scale = 0.0f;
+    const float distilled_guidance = -1.0f;
+    const int shifted_timestep = 0;
     const int sample_steps = 0;
     const int width = 0;
     const int height = 0;
     const int seed = 0;
     const char * sample_method = nullptr;
+    const char * scheduler = nullptr;
     const int clip_skip = -1;
     const int vid_req_frames = 1;
     const int vid_req_avi = 0;
diff --git a/koboldcpp.py b/koboldcpp.py
@@ -311,11 +311,14 @@ class sd_generation_inputs(ctypes.Structure):
                 ("flip_mask", ctypes.c_bool),
                 ("denoising_strength", ctypes.c_float),
                 ("cfg_scale", ctypes.c_float),
+                ("distilled_guidance", ctypes.c_float),
+                ("shifted_timestep", ctypes.c_int),
                 ("sample_steps", ctypes.c_int),
                 ("width", ctypes.c_int),
                 ("height", ctypes.c_int),
                 ("seed", ctypes.c_int),
                 ("sample_method", ctypes.c_char_p),
+                ("scheduler", ctypes.c_char_p),
                 ("clip_skip", ctypes.c_int),
                 ("vid_req_frames", ctypes.c_int),
                 ("vid_req_avi", ctypes.c_int)]
@@ -393,6 +396,8 @@ class embeddings_generation_outputs(ctypes.Structure):
                 ("count", ctypes.c_int),
                 ("data", ctypes.c_char_p)]
 
+
+
 def getdirpath():
     return os.path.dirname(os.path.realpath(__file__))
 def getabspath():
@@ -1788,9 +1793,58 @@ def sd_comfyui_tranform_params(genparams):
         print("Warning: ComfyUI Payload Missing!")
     return genparams
 
+def sd_process_meta_fields(fields, config):
+    # aliases to match sd.cpp command-line options
+    aliases = {
+        'cfg-scale': 'cfg_scale',
+        'guidance': 'distilled_guidance',
+        'sampler': 'sampler_name',
+        'sampling-method': 'sampler_name',
+        'timestep-shift': 'shifted_timestep',
+    }
+    fields_dict = {aliases.get(k, k): v for k, v in fields}
+    # whitelist accepted parameters
+    whitelist = ['scheduler', 'shifted_timestep', 'distilled_guidance']
+    if config:
+        # note: the current UI always sets these
+        whitelist += ['sampler_name', 'cfg_scale']
+    fields_dict = {k: v for k, v in fields_dict.items() if k in whitelist}
+    return fields_dict
+
+# parse a JSON string whose top level is a dict
+def sd_parse_meta_field(prompt, config=False):
+    jfields = {}
+    try:
+        jfields = json.loads(prompt)
+    except json.JSONDecodeError:
+        # accept "field":"value",... without {} (also empty strings)
+        try:
+            jfields = json.loads('{ ' + prompt + ' }')
+        except json.JSONDecodeError:
+            print("Warning: couldn't parse meta prompt; it should be valid JSON.")
+    if not isinstance(jfields, dict):
+        jfields = {}
+    kv_dict = sd_process_meta_fields(jfields.items(), config)
+    return kv_dict
+
+
 def sd_generate(genparams):
     global maxctx, args, currentusergenkey, totalgens, pendingabortkey, chatcompl_adapter
 
+    sdgendefaults = sd_parse_meta_field(args.sdgendefaults or '', config=True)
+    params = dict()
+    defparams = dict()
+    for k, v in sdgendefaults.items():
+        if k in ['sampler_name', 'scheduler']:
+            # these can be explicitly set to 'default'; process later
+            # TODO should we consider values like 'clip_skip=-1' as 'default' too?
+            defparams[k] = v
+        else:
+            params[k] = v
+    # apply most of the defaults
+    params.update(genparams)
+    genparams = params
+
     default_adapter = {} if chatcompl_adapter is None else chatcompl_adapter
     adapter_obj = genparams.get('adapter', default_adapter)
     forced_negprompt = adapter_obj.get("add_sd_negative_prompt", "")
@@ -1816,13 +1870,20 @@ def sd_generate(genparams):
     flip_mask = genparams.get("inpainting_mask_invert", 0)
     denoising_strength = tryparsefloat(genparams.get("denoising_strength", 0.6),0.6)
     cfg_scale = tryparsefloat(genparams.get("cfg_scale", 5),5)
+    distilled_guidance = tryparsefloat(genparams.get("distilled_guidance", None), None)
+    shifted_timestep = tryparseint(genparams.get("shifted_timestep", None), None)
     sample_steps = tryparseint(genparams.get("steps", 20),20)
     width = tryparseint(genparams.get("width", 512),512)
     height = tryparseint(genparams.get("height", 512),512)
     seed = tryparseint(genparams.get("seed", -1),-1)
     if seed < 0:
         seed = random.randint(100000, 999999)
-    sample_method = genparams.get("sampler_name", "k_euler_a")
+    sample_method = (genparams.get("sampler_name") or "default").lower()
+    if sample_method == 'default' and 'sampler_name' in defparams:
+        sample_method = (defparams.get("sampler_name") or "default").lower()
+    scheduler = (genparams.get("scheduler") or "default").lower()
+    if scheduler == 'default' and 'scheduler' in defparams:
+        scheduler = (defparams.get("scheduler") or "default").lower()
     clip_skip = tryparseint(genparams.get("clip_skip", -1),-1)
     vid_req_frames = tryparseint(genparams.get("frames", 1),1)
     vid_req_frames = 1 if (not vid_req_frames or vid_req_frames < 1) else vid_req_frames
@@ -1834,6 +1895,10 @@ def sd_generate(genparams):
 
     #clean vars
     cfg_scale = (1 if cfg_scale < 1 else (25 if cfg_scale > 25 else cfg_scale))
+    if distilled_guidance is not None and (distilled_guidance < 0 or distilled_guidance > 100):
+        distilled_guidance = None # fall back to the default
+    if shifted_timestep is not None and (shifted_timestep < 0 or shifted_timestep > 1000):
+        shifted_timestep = None # fall back to the default
     sample_steps = (1 if sample_steps < 1 else (forced_steplimit if sample_steps > forced_steplimit else sample_steps))
     vid_req_frames = (1 if vid_req_frames < 1 else (100 if vid_req_frames > 100 else vid_req_frames))
 
@@ -1852,12 +1917,17 @@ def sd_generate(genparams):
         inputs.extra_images[n] = extra_image.encode("UTF-8")
     inputs.flip_mask = flip_mask
     inputs.cfg_scale = cfg_scale
+    if distilled_guidance is not None:
+        inputs.distilled_guidance = distilled_guidance
     inputs.denoising_strength = denoising_strength
+    if shifted_timestep is not None:
+        inputs.shifted_timestep = shifted_timestep
     inputs.sample_steps = sample_steps
     inputs.width = width
     inputs.height = height
     inputs.seed = seed
-    inputs.sample_method = sample_method.lower().encode("UTF-8")
+    inputs.sample_method = sample_method.encode("UTF-8")
+    inputs.scheduler = scheduler.encode("UTF-8")
     inputs.clip_skip = clip_skip
     inputs.vid_req_frames = vid_req_frames
     inputs.vid_req_avi = vid_req_avi
@@ -4675,6 +4745,7 @@ def hide_tooltip(event):
     sd_clamped_soft_var = ctk.StringVar(value="0")
     sd_threads_var = ctk.StringVar(value=str(default_threads))
     sd_quant_var = ctk.StringVar(value=sd_quant_choices[0])
+    sd_gen_defaults_var = ctk.StringVar()
 
     whisper_model_var = ctk.StringVar()
     tts_model_var = ctk.StringVar()
@@ -5451,6 +5522,7 @@ def toggletaesd(a,b,c):
     makecheckbox(images_tab, "Model CPU Offload", sd_offload_cpu_var, 50,padx=8, tooltiptxt="Offload image weights in RAM to save VRAM, swap into VRAM when needed.")
     makecheckbox(images_tab, "VAE on CPU", sd_vae_cpu_var, 50,padx=160, tooltiptxt="Force VAE to CPU only for image generation.")
     makecheckbox(images_tab, "CLIP on GPU", sd_clip_gpu_var, 50,padx=280, tooltiptxt="Put CLIP and T5 to GPU for image generation. Otherwise, CLIP will use CPU.")
+    makelabelentry(images_tab, "Default Params:", sd_gen_defaults_var, 52, 280, padx=110, singleline=True, tooltip='Default image generation parameters when not specified by the UI or API.\nSpecified as JSON fields: {"KEY1":"VALUE1", "KEY2":"VALUE2"...}')
 
     # audio tab
     audio_tab = tabcontent["Audio"]
@@ -5725,6 +5797,7 @@ def export_vars():
             args.sdloramult = float(sd_loramult_var.get())
         else:
             args.sdlora = ""
+        args.sdgendefaults = sd_gen_defaults_var.get()
 
         if whisper_model_var.get() != "":
             args.whispermodel = whisper_model_var.get()
@@ -5951,6 +6024,7 @@ def import_vars(dict):
 
         sd_lora_var.set(dict["sdlora"] if ("sdlora" in dict and dict["sdlora"]) else "")
         sd_loramult_var.set(str(dict["sdloramult"]) if ("sdloramult" in dict and dict["sdloramult"]) else "1.0")
+        sd_gen_defaults_var.set(dict.get("sdgendefaults", ""))
 
         whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "")
 
@@ -7787,6 +7861,7 @@ def range_checker(arg: str):
     sdparsergrouplora.add_argument("--sdlora", metavar=('[filename]'), help="Specify an image generation LORA safetensors model to be applied.", default="")
     sdparsergroup.add_argument("--sdloramult", metavar=('[amount]'), help="Multiplier for the image LORA model to be applied.", type=float, default=1.0)
     sdparsergroup.add_argument("--sdtiledvae", metavar=('[maxres]'), help="Adjust the automatic VAE tiling trigger for images above this size. 0 disables vae tiling.", type=int, default=default_vae_tile_threshold)
+    sdparsergroup.add_argument("--sdgendefaults", metavar=('{"parameter":"value",...}'), help="Sets default parameters for image generation, as a JSON string.", default="")
     whisperparsergroup = parser.add_argument_group('Whisper Transcription Commands')
     whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="")
 
diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp
@@ -67,8 +67,11 @@ struct SDParams {
     int width         = 512;
     int height        = 512;
 
-    sample_method_t sample_method = EULER_A;
+    sample_method_t sample_method = SAMPLE_METHOD_DEFAULT;
+    scheduler_t scheduler         = scheduler_t::DEFAULT;
     int sample_steps              = 20;
+    float distilled_guidance      = -1.0f;
+    float shifted_timestep        = 0;
     float strength                = 0.75f;
     int64_t seed                  = 42;
     bool clip_on_cpu              = false;
@@ -404,20 +407,24 @@ std::string clean_input_prompt(const std::string& input) {
 }
 
 static std::string get_image_params(const sd_img_gen_params_t & params) {
-    std::stringstream parameter_string;
-    parameter_string << std::setprecision(3)
+    std::stringstream ss;
+    ss << std::setprecision(3)
         <<    "Prompt: " << params.prompt
         << " | NegativePrompt: " << params.negative_prompt
         << " | Steps: " << params.sample_params.sample_steps
         << " | CFGScale: " << params.sample_params.guidance.txt_cfg
         << " | Guidance: " << params.sample_params.guidance.distilled_guidance
         << " | Seed: " << params.seed
         << " | Size: " << params.width << "x" << params.height
-        << " | Sampler: " << sd_sample_method_name(params.sample_params.sample_method)
-        << " | Clip skip: " << params.clip_skip
+        << " | Sampler: " << sd_sample_method_name(params.sample_params.sample_method);
+    if (params.sample_params.scheduler != scheduler_t::DEFAULT)
+        ss << " " << sd_schedule_name(params.sample_params.scheduler); // non-default scheduler name is appended to the Sampler field
+    if (params.sample_params.shifted_timestep != 0)
+        ss << " | Timestep Shift: " << params.sample_params.shifted_timestep; // same " | " separator as every other field
+    ss  << " | Clip skip: " << params.clip_skip
         << " | Model: " << sdmodelfilename
         << " | Version: KoboldCpp";
-    return parameter_string.str();
+    return ss.str();
 }
 
 static inline int rounddown_64(int n) {
@@ -519,23 +526,29 @@ static void sd_fix_resolution(int &width, int &height, int img_hard_limit, int i
 
 static enum sample_method_t sampler_from_name(const std::string& sampler)
 {
-    if(sampler=="euler a"||sampler=="k_euler_a"||sampler=="euler_a") //all lowercase
+    // all lowercase
+    enum sample_method_t result = str_to_sample_method(sampler.c_str());
+    if (result != sample_method_t::SAMPLE_METHOD_COUNT)
+    {
+        return result;
+    }
+    else if(sampler=="euler a"||sampler=="k_euler_a")
     {
         return sample_method_t::EULER_A;
     }
-    else if(sampler=="euler"||sampler=="k_euler")
+    else if(sampler=="k_euler")
     {
         return sample_method_t::EULER;
     }
-    else if(sampler=="heun"||sampler=="k_heun")
+    else if(sampler=="k_heun")
     {
         return sample_method_t::HEUN;
     }
-    else if(sampler=="dpm2"||sampler=="k_dpm_2")
+    else if(sampler=="k_dpm_2")
     {
         return sample_method_t::DPM2;
     }
-    else if(sampler=="lcm"||sampler=="k_lcm")
+    else if(sampler=="k_lcm")
     {
         return sample_method_t::LCM;
     }
@@ -549,11 +562,10 @@ static enum sample_method_t sampler_from_name(const std::string& sampler)
     }
     else
     {
-        return sample_method_t::EULER_A;
+        return sample_method_t::SAMPLE_METHOD_DEFAULT;
     }
 }
 
-
 uint8_t* load_image_from_b64(const std::string & b64str, int& width, int& height, int expected_width = 0, int expected_height = 0, int expected_channel = 3)
 {
     std::vector<uint8_t> decoded_buf = kcpp_base64_decode(b64str);
@@ -644,6 +656,19 @@ uint8_t* load_image_from_b64(const std::string & b64str, int& width, int& height
         image_buffer = resized_image_buffer;
     }
     return image_buffer;
+
+}
+
+static enum scheduler_t scheduler_from_name(const char * scheduler)
+{
+    if (scheduler) {
+        enum scheduler_t result = str_to_schedule(scheduler);
+        if (result != scheduler_t::SCHEDULE_COUNT)
+        {
+            return result;
+        }
+    }
+    return scheduler_t::DEFAULT;
 }
 
 sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
@@ -674,13 +699,20 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
     sd_params->prompt = cleanprompt;
     sd_params->negative_prompt = cleannegprompt;
     sd_params->cfg_scale = inputs.cfg_scale;
+    sd_params->distilled_guidance = inputs.distilled_guidance;
     sd_params->sample_steps = inputs.sample_steps;
+    sd_params->shifted_timestep = inputs.shifted_timestep;
     sd_params->seed = inputs.seed;
     sd_params->width = inputs.width;
     sd_params->height = inputs.height;
     sd_params->strength = inputs.denoising_strength;
     sd_params->clip_skip = inputs.clip_skip;
     sd_params->sample_method = sampler_from_name(inputs.sample_method);
+    sd_params->scheduler = scheduler_from_name(inputs.scheduler);
+
+    if (sd_params->sample_method == SAMPLE_METHOD_DEFAULT) {
+        sd_params->sample_method = sd_get_default_sample_method(sd_ctx);
+    }
 
     auto loadedsdver = get_loaded_sd_version(sd_ctx);
     bool is_img2img = img2img_data != "";
@@ -841,10 +873,15 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
     params.clip_skip = sd_params->clip_skip;
     params.sample_params.guidance.txt_cfg = sd_params->cfg_scale;
     params.sample_params.guidance.img_cfg = sd_params->cfg_scale;
+    if (sd_params->distilled_guidance >= 0.f) {
+        params.sample_params.guidance.distilled_guidance = sd_params->distilled_guidance;
+    }
     params.width = sd_params->width;
     params.height = sd_params->height;
     params.sample_params.sample_method = sd_params->sample_method;
+    params.sample_params.scheduler = sd_params->scheduler;
     params.sample_params.sample_steps = sd_params->sample_steps;
+    params.sample_params.shifted_timestep = sd_params->shifted_timestep;
     params.seed = sd_params->seed;
     params.strength = sd_params->strength;
     params.vae_tiling_params.enabled = dotile;
@@ -922,6 +959,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
                 << "\nCFGSCLE:" << params.sample_params.guidance.txt_cfg
                 << "\nSIZE:" << params.width << "x" << params.height
                 << "\nSM:" << sd_sample_method_name(params.sample_params.sample_method)
+                << "\nSCHED:" << sd_schedule_name(params.sample_params.scheduler)
                 << "\nSTEP:" << params.sample_params.sample_steps
                 << "\nSEED:" << params.seed
                 << "\nBATCH:" << params.batch_count