Skip to content

Commit 4e40f2a

Browse files
committed
added photomaker face cloning
1 parent 21881a8 commit 4e40f2a

File tree

6 files changed

+161
-57
lines changed

6 files changed

+161
-57
lines changed

expose.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ struct sd_load_model_inputs
169169
const char * vae_filename = nullptr;
170170
const char * lora_filename = nullptr;
171171
const float lora_multiplier = 1.0f;
172+
const char * photomaker_filename = nullptr;
172173
const int img_hard_limit = 0;
173174
const int img_soft_limit = 0;
174175
const bool quiet = false;
@@ -180,6 +181,7 @@ struct sd_generation_inputs
180181
const char * negative_prompt = nullptr;
181182
const char * init_images = "";
182183
const char * mask = "";
184+
const char * photomaker_image = "";
183185
const bool flip_mask = false;
184186
const float denoising_strength = 0.0f;
185187
const float cfg_scale = 0.0f;

kcpp_sdui.embd

Lines changed: 32 additions & 32 deletions
Large diffs are not rendered by default.

koboldcpp.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ class sd_load_model_inputs(ctypes.Structure):
279279
("vae_filename", ctypes.c_char_p),
280280
("lora_filename", ctypes.c_char_p),
281281
("lora_multiplier", ctypes.c_float),
282+
("photomaker_filename", ctypes.c_char_p),
282283
("img_hard_limit", ctypes.c_int),
283284
("img_soft_limit", ctypes.c_int),
284285
("quiet", ctypes.c_bool),
@@ -289,6 +290,7 @@ class sd_generation_inputs(ctypes.Structure):
289290
("negative_prompt", ctypes.c_char_p),
290291
("init_images", ctypes.c_char_p),
291292
("mask", ctypes.c_char_p),
293+
("photomaker_image", ctypes.c_char_p),
292294
("flip_mask", ctypes.c_bool),
293295
("denoising_strength", ctypes.c_float),
294296
("cfg_scale", ctypes.c_float),
@@ -657,6 +659,13 @@ def is_incomplete_utf8_sequence(byte_seq): #note, this will only flag INCOMPLETE
657659
return True #incomplete sequence
658660
return False #invalid sequence, but not incomplete
659661

662+
def strip_base64_prefix(encoded_data):
    """Return the bare base64 payload of an image string.

    The input may be plain base64 text or a data URI of the form
    ``data:image/<type>;base64,<payload>``; in the latter case the header
    up to and including the first comma is dropped.

    Args:
        encoded_data: Base64 text, optionally carrying a ``data:image...``
            header. May be ``None`` or empty.

    Returns:
        The text after the first comma when the input starts with
        ``data:image``; the input unchanged when it has no such prefix or
        contains no comma; ``""`` when the input is empty, ``None`` or not
        a ``str``.
    """
    # A non-string value cannot carry base64 text, so treat it like a
    # missing image instead of failing on .startswith().
    if not encoded_data or not isinstance(encoded_data, str):
        return ""
    if encoded_data.startswith("data:image"):
        # Split on the FIRST comma only: everything after it is payload.
        _header, found, payload = encoded_data.partition(",")
        if found:
            return payload
    return encoded_data
668+
660669
def unpack_to_dir(destpath = ""):
661670
srcpath = os.path.abspath(os.path.dirname(__file__))
662671
cliunpack = False if destpath == "" else True
@@ -1523,7 +1532,7 @@ def generate(genparams, stream_flag=False):
15231532
return {"text":outstr,"status":ret.status,"stopreason":ret.stopreason,"prompt_tokens":ret.prompt_tokens, "completion_tokens": ret.completion_tokens}
15241533

15251534

1526-
def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl_filename,clipg_filename):
1535+
def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl_filename,clipg_filename,photomaker_filename):
15271536
global args
15281537
inputs = sd_load_model_inputs()
15291538
inputs.model_filename = model_filename.encode("UTF-8")
@@ -1547,6 +1556,7 @@ def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl
15471556
inputs.t5xxl_filename = t5xxl_filename.encode("UTF-8")
15481557
inputs.clipl_filename = clipl_filename.encode("UTF-8")
15491558
inputs.clipg_filename = clipg_filename.encode("UTF-8")
1559+
inputs.photomaker_filename = photomaker_filename.encode("UTF-8")
15501560
inputs.img_hard_limit = args.sdclamped
15511561
inputs.img_soft_limit = args.sdclampedsoft
15521562
inputs = set_backend_props(inputs)
@@ -1617,7 +1627,8 @@ def sd_generate(genparams):
16171627
prompt = forced_posprompt
16181628
init_images_arr = genparams.get("init_images", [])
16191629
init_images = ("" if (not init_images_arr or len(init_images_arr)==0 or not init_images_arr[0]) else init_images_arr[0])
1620-
mask = genparams.get("mask", "")
1630+
init_images = strip_base64_prefix(init_images)
1631+
mask = strip_base64_prefix(genparams.get("mask", ""))
16211632
flip_mask = genparams.get("inpainting_mask_invert", 0)
16221633
denoising_strength = tryparsefloat(genparams.get("denoising_strength", 0.6),0.6)
16231634
cfg_scale = tryparsefloat(genparams.get("cfg_scale", 5),5)
@@ -1629,6 +1640,7 @@ def sd_generate(genparams):
16291640
seed = random.randint(100000, 999999)
16301641
sample_method = genparams.get("sampler_name", "k_euler_a")
16311642
clip_skip = tryparseint(genparams.get("clip_skip", -1),-1)
1643+
photomaker_image = strip_base64_prefix(genparams.get("photomaker_image", ""))
16321644

16331645
#clean vars
16341646
cfg_scale = (1 if cfg_scale < 1 else (25 if cfg_scale > 25 else cfg_scale))
@@ -1642,6 +1654,7 @@ def sd_generate(genparams):
16421654
inputs.negative_prompt = negative_prompt.encode("UTF-8")
16431655
inputs.init_images = init_images.encode("UTF-8")
16441656
inputs.mask = "".encode("UTF-8") if not mask else mask.encode("UTF-8")
1657+
inputs.photomaker_image = "".encode("UTF-8") if not photomaker_image else photomaker_image.encode("UTF-8")
16451658
inputs.flip_mask = flip_mask
16461659
inputs.cfg_scale = cfg_scale
16471660
inputs.denoising_strength = denoising_strength
@@ -4288,6 +4301,7 @@ def hide_tooltip(event):
42884301
sd_t5xxl_var = ctk.StringVar()
42894302
sd_clipl_var = ctk.StringVar()
42904303
sd_clipg_var = ctk.StringVar()
4304+
sd_photomaker_var = ctk.StringVar()
42914305
sd_vaeauto_var = ctk.IntVar(value=0)
42924306
sd_notile_var = ctk.IntVar(value=0)
42934307
sd_clamped_var = ctk.StringVar(value="0")
@@ -5002,13 +5016,12 @@ def togglehorde(a,b,c):
50025016
makefileentry(images_tab, "Image LoRA (safetensors/gguf):", "Select SD lora file",sd_lora_var, 20, width=280, singlecol=True, filetypes=[("*.safetensors *.gguf", "*.safetensors *.gguf")],tooltiptxt="Select a .safetensors or .gguf SD LoRA model file to be loaded. Should be unquantized!")
50035017
makelabelentry(images_tab, "Image LoRA Multiplier:" , sd_loramult_var, 22, 50,padx=290,singleline=True,tooltip="What mutiplier value to apply the SD LoRA with.")
50045018

5005-
5006-
50075019
makefileentry(images_tab, "T5-XXL File:", "Select Optional T5-XXL model file (SD3 or flux)",sd_t5xxl_var, 24, width=280, singlerow=True, filetypes=[("*.safetensors *.gguf","*.safetensors *.gguf")],tooltiptxt="Select a .safetensors t5xxl file to be loaded.")
50085020
makefileentry(images_tab, "Clip-L File:", "Select Optional Clip-L model file (SD3 or flux)",sd_clipl_var, 26, width=280, singlerow=True, filetypes=[("*.safetensors *.gguf","*.safetensors *.gguf")],tooltiptxt="Select a .safetensors t5xxl file to be loaded.")
50095021
makefileentry(images_tab, "Clip-G File:", "Select Optional Clip-G model file (SD3)",sd_clipg_var, 28, width=280, singlerow=True, filetypes=[("*.safetensors *.gguf","*.safetensors *.gguf")],tooltiptxt="Select a .safetensors t5xxl file to be loaded.")
5022+
makefileentry(images_tab, "PhotoMaker:", "Select Optional PhotoMaker model file (SDXL)",sd_photomaker_var, 30, width=280, singlerow=True, filetypes=[("*.safetensors *.gguf","*.safetensors *.gguf")],tooltiptxt="PhotoMaker is a model that allows face cloning.\nSelect a .safetensors PhotoMaker file to be loaded (SDXL only).")
50105023

5011-
sdvaeitem1,sdvaeitem2,sdvaeitem3 = makefileentry(images_tab, "Image VAE:", "Select Optional SD VAE file",sd_vae_var, 30, width=280, singlerow=True, filetypes=[("*.safetensors *.gguf", "*.safetensors *.gguf")],tooltiptxt="Select a .safetensors or .gguf SD VAE file to be loaded.")
5024+
sdvaeitem1,sdvaeitem2,sdvaeitem3 = makefileentry(images_tab, "Image VAE:", "Select Optional SD VAE file",sd_vae_var, 40, width=280, singlerow=True, filetypes=[("*.safetensors *.gguf", "*.safetensors *.gguf")],tooltiptxt="Select a .safetensors or .gguf SD VAE file to be loaded.")
50125025
def toggletaesd(a,b,c):
50135026
if sd_vaeauto_var.get()==1:
50145027
sdvaeitem1.grid_remove()
@@ -5019,8 +5032,8 @@ def toggletaesd(a,b,c):
50195032
sdvaeitem1.grid()
50205033
sdvaeitem2.grid()
50215034
sdvaeitem3.grid()
5022-
makecheckbox(images_tab, "Use TAE SD (AutoFix Broken VAE)", sd_vaeauto_var, 32,command=toggletaesd,tooltiptxt="Replace VAE with TAESD. May fix bad VAE.")
5023-
makecheckbox(images_tab, "No VAE Tiling", sd_notile_var, 34,tooltiptxt="Disables VAE tiling, may not work for large images.")
5035+
makecheckbox(images_tab, "Use TAE SD (AutoFix Broken VAE)", sd_vaeauto_var, 42,command=toggletaesd,tooltiptxt="Replace VAE with TAESD. May fix bad VAE.")
5036+
makecheckbox(images_tab, "No VAE Tiling", sd_notile_var, 44,tooltiptxt="Disables VAE tiling, may not work for large images.")
50245037

50255038
# audio tab
50265039
audio_tab = tabcontent["Audio"]
@@ -5268,6 +5281,8 @@ def export_vars():
52685281
args.sdclipl = sd_clipl_var.get()
52695282
if sd_clipg_var.get() != "":
52705283
args.sdclipg = sd_clipg_var.get()
5284+
if sd_photomaker_var.get() != "":
5285+
args.sdphotomaker = sd_photomaker_var.get()
52715286
if sd_quant_var.get()==1:
52725287
args.sdquant = True
52735288
if sd_lora_var.get() != "":
@@ -5471,6 +5486,7 @@ def import_vars(dict):
54715486
sd_t5xxl_var.set(dict["sdt5xxl"] if ("sdt5xxl" in dict and dict["sdt5xxl"]) else "")
54725487
sd_clipl_var.set(dict["sdclipl"] if ("sdclipl" in dict and dict["sdclipl"]) else "")
54735488
sd_clipg_var.set(dict["sdclipg"] if ("sdclipg" in dict and dict["sdclipg"]) else "")
5489+
sd_photomaker_var.set(dict["sdphotomaker"] if ("sdphotomaker" in dict and dict["sdphotomaker"]) else "")
54745490
sd_vaeauto_var.set(1 if ("sdvaeauto" in dict and dict["sdvaeauto"]) else 0)
54755491
sd_notile_var.set(1 if ("sdnotile" in dict and dict["sdnotile"]) else 0)
54765492
sd_lora_var.set(dict["sdlora"] if ("sdlora" in dict and dict["sdlora"]) else "")
@@ -6509,6 +6525,10 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
65096525
dlfile = download_model_from_url(args.sdclipg,[".gguf",".safetensors"],min_file_size=500000)
65106526
if dlfile:
65116527
args.sdclipg = dlfile
6528+
if args.sdphotomaker and args.sdphotomaker!="":
6529+
dlfile = download_model_from_url(args.sdphotomaker,[".gguf",".safetensors"],min_file_size=500000)
6530+
if dlfile:
6531+
args.sdphotomaker = dlfile
65126532
if args.sdvae and args.sdvae!="":
65136533
dlfile = download_model_from_url(args.sdvae,[".gguf",".safetensors"],min_file_size=500000)
65146534
if dlfile:
@@ -6785,6 +6805,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
67856805
imgt5xxl = ""
67866806
imgclipl = ""
67876807
imgclipg = ""
6808+
imgphotomaker = ""
67886809
if args.sdlora:
67896810
if os.path.exists(args.sdlora):
67906811
imglora = os.path.abspath(args.sdlora)
@@ -6810,13 +6831,18 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
68106831
imgclipg = os.path.abspath(args.sdclipg)
68116832
else:
68126833
print("Missing SD Clip-G model file...")
6834+
if args.sdphotomaker:
6835+
if os.path.exists(args.sdphotomaker):
6836+
imgphotomaker = os.path.abspath(args.sdphotomaker)
6837+
else:
6838+
print("Missing SD Photomaker model file...")
68136839

68146840
imgmodel = os.path.abspath(imgmodel)
68156841
fullsdmodelpath = imgmodel
68166842
friendlysdmodelname = os.path.basename(imgmodel)
68176843
friendlysdmodelname = os.path.splitext(friendlysdmodelname)[0]
68186844
friendlysdmodelname = sanitize_string(friendlysdmodelname)
6819-
loadok = sd_load_model(imgmodel,imgvae,imglora,imgt5xxl,imgclipl,imgclipg)
6845+
loadok = sd_load_model(imgmodel,imgvae,imglora,imgt5xxl,imgclipl,imgclipg,imgphotomaker)
68206846
print("Load Image Model OK: " + str(loadok))
68216847
if not loadok:
68226848
exitcounter = 999
@@ -7235,6 +7261,7 @@ def range_checker(arg: str):
72357261
sdparsergroup.add_argument("--sdt5xxl", metavar=('[filename]'), help="Specify a T5-XXL safetensors model for use in SD3 or Flux. Leave blank if prebaked or unused.", default="")
72367262
sdparsergroup.add_argument("--sdclipl", metavar=('[filename]'), help="Specify a Clip-L safetensors model for use in SD3 or Flux. Leave blank if prebaked or unused.", default="")
72377263
sdparsergroup.add_argument("--sdclipg", metavar=('[filename]'), help="Specify a Clip-G safetensors model for use in SD3. Leave blank if prebaked or unused.", default="")
7264+
sdparsergroup.add_argument("--sdphotomaker", metavar=('[filename]'), help="PhotoMaker is a model that allows face cloning. Specify a PhotoMaker safetensors model which will be applied replacing img2img. SDXL models only. Leave blank if unused.", default="")
72387265
sdparsergroupvae = sdparsergroup.add_mutually_exclusive_group()
72397266
sdparsergroupvae.add_argument("--sdvae", metavar=('[filename]'), help="Specify an image generation safetensors VAE which replaces the one in the model.", default="")
72407267
sdparsergroupvae.add_argument("--sdvaeauto", help="Uses a built-in VAE via TAE SD, which is very fast, and fixed bad VAEs.", action='store_true')

otherarch/sdcpp/conditioner.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
597597
GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
598598
tokens.erase(it);
599599
return decode(tokens);
600+
//return prompt; //kcpp we don't care about photomaker trigger words
600601
}
601602

602603
SDCondition get_learned_condition(ggml_context* work_ctx,

otherarch/sdcpp/sdtype_adapter.cpp

Lines changed: 51 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ struct SDParams {
5757
std::string controlnet_path;
5858
std::string embeddings_path;
5959
std::string stacked_id_embeddings_path;
60-
std::string input_id_images_path;
60+
std::string input_id_images_path = "";
6161
sd_type_t wtype = SD_TYPE_COUNT;
6262
std::string lora_model_dir;
6363
std::string output_path = "output.png";
@@ -116,6 +116,7 @@ static int sddebugmode = 0;
116116
static std::string recent_data = "";
117117
static uint8_t * input_image_buffer = NULL;
118118
static uint8_t * input_mask_buffer = NULL;
119+
static uint8_t * input_photomaker_buffer = NULL;
119120

120121
static std::string sdplatformenv, sddeviceenv, sdvulkandeviceenv;
121122
static bool notiling = false;
@@ -134,6 +135,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
134135
std::string t5xxl_filename = inputs.t5xxl_filename;
135136
std::string clipl_filename = inputs.clipl_filename;
136137
std::string clipg_filename = inputs.clipg_filename;
138+
std::string photomaker_filename = inputs.photomaker_filename;
137139
notiling = inputs.notile;
138140
cfg_side_limit = inputs.img_hard_limit;
139141
cfg_square_limit = inputs.img_soft_limit;
@@ -164,6 +166,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
164166
{
165167
printf("With Custom Clip-G Model: %s\n",clipg_filename.c_str());
166168
}
169+
if(photomaker_filename!="")
170+
{
171+
printf("With PhotoMaker Model: %s\n",photomaker_filename.c_str());
172+
}
167173
if(inputs.quant)
168174
{
169175
printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n");
@@ -205,6 +211,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
205211
sd_params->t5xxl_path = t5xxl_filename;
206212
sd_params->clip_l_path = clipl_filename;
207213
sd_params->clip_g_path = clipg_filename;
214+
sd_params->stacked_id_embeddings_path = photomaker_filename;
208215
//if t5 is set, and model is a gguf, load it as a diffusion model path
209216
bool endswithgguf = (sd_params->model_path.rfind(".gguf") == sd_params->model_path.size() - 5);
210217
if(sd_params->t5xxl_path!="" && endswithgguf)
@@ -423,6 +430,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
423430
std::string cleannegprompt = clean_input_prompt(inputs.negative_prompt);
424431
std::string img2img_data = std::string(inputs.init_images);
425432
std::string img2img_mask = std::string(inputs.mask);
433+
std::string photomaker_image_data = std::string(inputs.photomaker_image);
426434
std::string sampler = inputs.sample_method;
427435

428436
sd_params->prompt = cleanprompt;
@@ -490,15 +498,17 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
490498

491499
//for img2img
492500
sd_image_t input_image = {0,0,0,nullptr};
501+
sd_image_t photomaker_reference = {0,0,0,nullptr};
493502
std::vector<uint8_t> image_buffer;
494503
std::vector<uint8_t> image_mask_buffer;
504+
std::vector<uint8_t> photomaker_buffer;
495505
int nx, ny, nc;
496-
int nx2, ny2, nc2;
497506
int img2imgW = sd_params->width; //for img2img input
498507
int img2imgH = sd_params->height;
499508
int img2imgC = 3; // Assuming RGB image
500509
std::vector<uint8_t> resized_image_buf(img2imgW * img2imgH * img2imgC);
501510
std::vector<uint8_t> resized_mask_buf(img2imgW * img2imgH * img2imgC);
511+
std::vector<uint8_t> resized_photomaker_buf(img2imgW * img2imgH * img2imgC);
502512

503513
std::string ts = get_timestamp_str();
504514
if(!sd_is_quiet)
@@ -543,6 +553,35 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
543553
sd_params->sample_method = sample_method_t::EULER_A;
544554
}
545555

556+
if(photomaker_image_data!="")
557+
{
558+
if(input_photomaker_buffer!=nullptr) //just in time free old buffer
559+
{
560+
stbi_image_free(input_photomaker_buffer);
561+
input_photomaker_buffer = nullptr;
562+
}
563+
int nx2, ny2, nc2;
564+
photomaker_buffer = kcpp_base64_decode(photomaker_image_data);
565+
input_photomaker_buffer = stbi_load_from_memory(photomaker_buffer.data(), photomaker_buffer.size(), &nx2, &ny2, &nc2, 1);
566+
// Resize the image
567+
int resok = stbir_resize_uint8(input_photomaker_buffer, nx2, ny2, 0, resized_photomaker_buf.data(), img2imgW, img2imgH, 0, 1);
568+
if (!resok) {
569+
printf("\nKCPP SD: resize photomaker image failed!\n");
570+
output.data = "";
571+
output.status = 0;
572+
return output;
573+
}
574+
photomaker_reference.width = img2imgW;
575+
photomaker_reference.height = img2imgH;
576+
photomaker_reference.channel = img2imgC;
577+
photomaker_reference.data = resized_photomaker_buf.data();
578+
579+
//ensure prompt has img keyword, otherwise append it
580+
if (sd_params->prompt.find("img") == std::string::npos) {
581+
sd_params->prompt += " img";
582+
}
583+
}
584+
546585
if (sd_params->mode == TXT2IMG) {
547586

548587
if(!sd_is_quiet && sddebugmode==1)
@@ -585,7 +624,8 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
585624
sd_params->skip_layers.size(),
586625
sd_params->slg_scale,
587626
sd_params->skip_layer_start,
588-
sd_params->skip_layer_end);
627+
sd_params->skip_layer_end,
628+
(photomaker_image_data!=""?&photomaker_reference:nullptr));
589629
} else {
590630

591631
if (sd_params->width <= 0 || sd_params->width % 64 != 0 || sd_params->height <= 0 || sd_params->height % 64 != 0) {
@@ -596,18 +636,11 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
596636
}
597637

598638
image_buffer = kcpp_base64_decode(img2img_data);
599-
600639
if(input_image_buffer!=nullptr) //just in time free old buffer
601640
{
602641
stbi_image_free(input_image_buffer);
603642
input_image_buffer = nullptr;
604643
}
605-
if(input_mask_buffer!=nullptr) //just in time free old buffer
606-
{
607-
stbi_image_free(input_mask_buffer);
608-
input_mask_buffer = nullptr;
609-
}
610-
611644
input_image_buffer = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &nx, &ny, &nc, 3);
612645

613646
if (nx < 64 || ny < 64 || nx > 1024 || ny > 1024 || nc!= 3) {
@@ -634,6 +667,12 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
634667

635668
if(img2img_mask!="")
636669
{
670+
int nx2, ny2, nc2;
671+
if(input_mask_buffer!=nullptr) //just in time free old buffer
672+
{
673+
stbi_image_free(input_mask_buffer);
674+
input_mask_buffer = nullptr;
675+
}
637676
image_mask_buffer = kcpp_base64_decode(img2img_mask);
638677
input_mask_buffer = stbi_load_from_memory(image_mask_buffer.data(), image_mask_buffer.size(), &nx2, &ny2, &nc2, 1);
639678
// Resize the image
@@ -709,7 +748,8 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
709748
sd_params->skip_layers.size(),
710749
sd_params->slg_scale,
711750
sd_params->skip_layer_start,
712-
sd_params->skip_layer_end);
751+
sd_params->skip_layer_end,
752+
(photomaker_image_data!=""?&photomaker_reference:nullptr));
713753
}
714754

715755
if (results == NULL) {

0 commit comments

Comments
 (0)