@@ -57,29 +57,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
5757 std::vector<std::string> readed_embeddings;
5858
5959 FrozenCLIPEmbedderWithCustomWords (ggml_backend_t backend,
60- std::map<std::string, enum ggml_type> & tensor_types,
60+ const String2GGMLType & tensor_types,
6161 const std::string& embd_dir,
6262 SDVersion version = VERSION_SD1,
6363 PMVersion pv = PM_VERSION_1,
6464 int clip_skip = -1 )
6565 : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407 ), embd_dir(embd_dir) {
66- if (clip_skip <= 0 ) {
67- clip_skip = 1 ;
68- if (sd_version_is_sd2 (version) || sd_version_is_sdxl (version)) {
69- clip_skip = 2 ;
70- }
71- }
7266 if (sd_version_is_sd1 (version)) {
73- text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, clip_skip );
67+ text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14);
7468 } else if (sd_version_is_sd2 (version)) {
75- text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " cond_stage_model.transformer.text_model" , OPEN_CLIP_VIT_H_14, clip_skip );
69+ text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " cond_stage_model.transformer.text_model" , OPEN_CLIP_VIT_H_14);
7670 } else if (sd_version_is_sdxl (version)) {
77- text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, clip_skip, false );
78- text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " cond_stage_model.1.transformer.text_model" , OPEN_CLIP_VIT_BIGG_14, clip_skip, false );
71+ text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, false );
72+ text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " cond_stage_model.1.transformer.text_model" , OPEN_CLIP_VIT_BIGG_14, false );
7973 }
74+ set_clip_skip (clip_skip);
8075 }
8176
8277 void set_clip_skip (int clip_skip) {
78+ if (clip_skip <= 0 ) {
79+ clip_skip = 1 ;
80+ if (sd_version_is_sd2 (version) || sd_version_is_sdxl (version)) {
81+ clip_skip = 2 ;
82+ }
83+ }
8384 text_model->set_clip_skip (clip_skip);
8485 if (sd_version_is_sdxl (version)) {
8586 text_model2->set_clip_skip (clip_skip);
@@ -458,8 +459,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
458459 if (sd_version_is_sdxl (version)) {
459460 text_model2->compute (n_threads,
460461 input_ids2,
461- 0 ,
462- NULL ,
462+ num_custom_embeddings ,
463+ token_embed_custom. data () ,
463464 max_token_idx,
464465 false ,
465466 &chunk_hidden_states2, work_ctx);
@@ -469,8 +470,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
469470 if (chunk_idx == 0 ) {
470471 text_model2->compute (n_threads,
471472 input_ids2,
472- 0 ,
473- NULL ,
473+ num_custom_embeddings ,
474+ token_embed_custom. data () ,
474475 max_token_idx,
475476 true ,
476477 &pooled,
@@ -617,7 +618,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
617618struct FrozenCLIPVisionEmbedder : public GGMLRunner {
618619 CLIPVisionModelProjection vision_model;
619620
620- FrozenCLIPVisionEmbedder (ggml_backend_t backend, std::map<std::string, enum ggml_type> & tensor_types)
621+ FrozenCLIPVisionEmbedder (ggml_backend_t backend, const String2GGMLType & tensor_types = {} )
621622 : vision_model(OPEN_CLIP_VIT_H_14, true ), GGMLRunner(backend) {
622623 vision_model.init (params_ctx, tensor_types, " cond_stage_model.transformer" );
623624 }
@@ -662,18 +663,19 @@ struct SD3CLIPEmbedder : public Conditioner {
662663 std::shared_ptr<T5Runner> t5;
663664
664665 SD3CLIPEmbedder (ggml_backend_t backend,
665- std::map<std::string, enum ggml_type> & tensor_types,
666- int clip_skip = -1 )
666+ const String2GGMLType & tensor_types = {} ,
667+ int clip_skip = -1 )
667668 : clip_g_tokenizer(0 ) {
668- if (clip_skip <= 0 ) {
669- clip_skip = 2 ;
670- }
671- clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " text_encoders.clip_l.transformer.text_model" , OPENAI_CLIP_VIT_L_14, clip_skip, false );
672- clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " text_encoders.clip_g.transformer.text_model" , OPEN_CLIP_VIT_BIGG_14, clip_skip, false );
669+ clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " text_encoders.clip_l.transformer.text_model" , OPENAI_CLIP_VIT_L_14, false );
670+ clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " text_encoders.clip_g.transformer.text_model" , OPEN_CLIP_VIT_BIGG_14, false );
673671 t5 = std::make_shared<T5Runner>(backend, tensor_types, " text_encoders.t5xxl.transformer" );
672+ set_clip_skip (clip_skip);
674673 }
675674
676675 void set_clip_skip (int clip_skip) {
676+ if (clip_skip <= 0 ) {
677+ clip_skip = 2 ;
678+ }
677679 clip_l->set_clip_skip (clip_skip);
678680 clip_g->set_clip_skip (clip_skip);
679681 }
@@ -1008,16 +1010,17 @@ struct FluxCLIPEmbedder : public Conditioner {
10081010 size_t chunk_len = 256 ;
10091011
10101012 FluxCLIPEmbedder (ggml_backend_t backend,
1011- std::map<std::string, enum ggml_type>& tensor_types,
1012- int clip_skip = -1 ) {
1013- if (clip_skip <= 0 ) {
1014- clip_skip = 2 ;
1015- }
1016- clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " text_encoders.clip_l.transformer.text_model" , OPENAI_CLIP_VIT_L_14, clip_skip, true );
1013+ const String2GGMLType& tensor_types = {},
1014+ int clip_skip = -1 ) {
1015+ clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " text_encoders.clip_l.transformer.text_model" , OPENAI_CLIP_VIT_L_14, true );
10171016 t5 = std::make_shared<T5Runner>(backend, tensor_types, " text_encoders.t5xxl.transformer" );
1017+ set_clip_skip (clip_skip);
10181018 }
10191019
10201020 void set_clip_skip (int clip_skip) {
1021+ if (clip_skip <= 0 ) {
1022+ clip_skip = 2 ;
1023+ }
10211024 clip_l->set_clip_skip (clip_skip);
10221025 }
10231026
@@ -1228,10 +1231,10 @@ struct PixArtCLIPEmbedder : public Conditioner {
12281231 int mask_pad = 1 ;
12291232
12301233 PixArtCLIPEmbedder (ggml_backend_t backend,
1231- std::map<std::string, enum ggml_type> & tensor_types,
1232- int clip_skip = -1 ,
1233- bool use_mask = false ,
1234- int mask_pad = 1 )
1234+ const String2GGMLType & tensor_types = {} ,
1235+ int clip_skip = -1 ,
1236+ bool use_mask = false ,
1237+ int mask_pad = 1 )
12351238 : use_mask(use_mask), mask_pad(mask_pad) {
12361239 t5 = std::make_shared<T5Runner>(backend, tensor_types, " text_encoders.t5xxl.transformer" );
12371240 }
@@ -1422,4 +1425,4 @@ struct PixArtCLIPEmbedder : public Conditioner {
14221425 }
14231426};
14241427
1425- #endif
1428+ #endif
0 commit comments