@@ -45,7 +45,6 @@ struct Conditioner {
 struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     SDVersion version = VERSION_SD1;
     CLIPTokenizer tokenizer;
-    ggml_type wtype;
     std::shared_ptr<CLIPTextModelRunner> text_model;
     std::shared_ptr<CLIPTextModelRunner> text_model2;
 
@@ -56,24 +55,24 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     std::vector<std::string> readed_embeddings;
 
     FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
-                                      ggml_type wtype,
+                                      std::map<std::string, enum ggml_type>& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
                                       int clip_skip = -1)
-        : version(version), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir), wtype(wtype) {
+        : version(version), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir) {
         if (clip_skip <= 0) {
             clip_skip = 1;
             if (version == VERSION_SD2 || version == VERSION_SDXL) {
                 clip_skip = 2;
             }
         }
         if (version == VERSION_SD1) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
         } else if (version == VERSION_SD2) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_H_14, clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
         } else if (version == VERSION_SDXL) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         }
     }
 
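The pattern in this hunk repeats throughout the change: every conditioner used to receive a single global `ggml_type wtype`, and now receives a shared `tensor_types` map (full checkpoint tensor name -> ggml type) plus a name prefix, so each weight can keep the type it was stored with instead of being forced to one type. Below is a minimal sketch of how such a map could be consulted; the helper name `type_of` and the F32 fallback are illustrative assumptions, not part of this diff.

// Sketch only: assumes ggml.h and a map filled while scanning the
// checkpoint's tensor headers (e.g. by the model loader).
#include <map>
#include <string>
#include "ggml.h"

static enum ggml_type type_of(const std::map<std::string, enum ggml_type>& tensor_types,
                              const std::string& name) {
    auto it = tensor_types.find(name);
    return it != tensor_types.end() ? it->second : GGML_TYPE_F32;  // fallback is an assumption
}

int main() {
    std::map<std::string, enum ggml_type> tensor_types;
    tensor_types["cond_stage_model.transformer.text_model.embeddings.token_embedding.weight"] = GGML_TYPE_F16;
    // A quantized attention weight can now coexist with an F16 embedding:
    tensor_types["cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight"] = GGML_TYPE_Q8_0;
    return type_of(tensor_types, "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight") == GGML_TYPE_F16 ? 0 : 1;
}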
@@ -136,14 +135,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                 LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], hidden_size);
                 return false;
             }
-            embd        = ggml_new_tensor_2d(embd_ctx, wtype, hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
+            embd        = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
             *dst_tensor = embd;
             return true;
         };
         model_loader.load_tensors(on_load, NULL);
         readed_embeddings.push_back(embd_name);
         token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
-        memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(wtype)),
+        memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(embd->type)),
                embd->data,
                ggml_nbytes(embd));
         for (int i = 0; i < embd->ne[1]; i++) {
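Custom embeddings now keep the type they were stored with (`tensor_storage.type`) instead of the global `wtype`, and the copy offset into the packed buffer is computed from `embd->type` to match. Worked numbers, assuming CLIP-L's hidden size of 768 and an F16 embedding (2 bytes per element): each vector occupies 768 * 2 = 1536 bytes, so the third custom embedding starts at byte 2 * 1536 = 3072. Note the arithmetic implicitly assumes all previously loaded embeddings share one element size; mixing, say, F16 and F32 embedding files would misplace later vectors. A self-contained sketch of the same computation (the helper is illustrative, not from this diff):

#include <cstddef>
#include "ggml.h"

static size_t custom_embedding_offset(int num_custom_embeddings,
                                      int hidden_size,
                                      enum ggml_type type) {
    // Each previously loaded vector holds hidden_size elements of `type`;
    // the next embedding is appended immediately after them.
    return (size_t)num_custom_embeddings * hidden_size * ggml_type_size(type);
}

int main() {
    return custom_embedding_offset(2, 768, GGML_TYPE_F16) == 3072 ? 0 : 1;
}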
@@ -585,9 +584,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
     CLIPVisionModelProjection vision_model;
 
-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, ggml_type wtype)
-        : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend, wtype) {
-        vision_model.init(params_ctx, wtype);
+    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
+        : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
+        vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
     }
 
     std::string get_desc() {
@@ -622,7 +621,6 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
 };
 
 struct SD3CLIPEmbedder : public Conditioner {
-    ggml_type wtype;
     CLIPTokenizer clip_l_tokenizer;
     CLIPTokenizer clip_g_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
@@ -631,15 +629,15 @@ struct SD3CLIPEmbedder : public Conditioner {
     std::shared_ptr<T5Runner> t5;
 
     SD3CLIPEmbedder(ggml_backend_t backend,
-                    ggml_type wtype,
+                    std::map<std::string, enum ggml_type>& tensor_types,
                     int clip_skip = -1)
-        : wtype(wtype), clip_g_tokenizer(0) {
+        : clip_g_tokenizer(0) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
-        t5     = std::make_shared<T5Runner>(backend, wtype);
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
     }
 
     void set_clip_skip(int clip_skip) {
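With one shared map, the prefixes are what keep the three encoders' weights apart: each runner resolves its tensors under its own subtree of the checkpoint. A sketch of the name composition, assuming (as the prefixes above suggest) that a runner joins its prefix and a tensor's local name with a dot; the helper and the local tensor names are illustrative, not taken from this diff:

#include <iostream>
#include <string>

// Illustrative: compose a full checkpoint tensor name from a runner's
// prefix and a local tensor name.
static std::string full_tensor_name(const std::string& prefix, const std::string& local) {
    return prefix + "." + local;
}

int main() {
    // The three prefixes wired up in the SD3 constructor above:
    std::cout << full_tensor_name("text_encoders.clip_l.transformer.text_model",
                                  "embeddings.token_embedding.weight") << "\n";
    std::cout << full_tensor_name("text_encoders.clip_g.transformer.text_model",
                                  "embeddings.token_embedding.weight") << "\n";
    std::cout << full_tensor_name("text_encoders.t5xxl.transformer",
                                  "shared.weight") << "\n";  // T5's local names differ (assumption)
    return 0;
}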
@@ -798,7 +796,7 @@ struct SD3CLIPEmbedder : public Conditioner {
             }
 
                 if (chunk_idx == 0) {
-                    auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+                    auto it       = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
                     max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
                     clip_l->compute(n_threads,
                                     input_ids,
@@ -808,7 +806,6 @@ struct SD3CLIPEmbedder : public Conditioner {
                                     true,
                                     &pooled_l,
                                     work_ctx);
-
                 }
             }
 
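The `it`/`max_token_idx` pair above locates the pooled-output position: CLIP's pooled text embedding is read at the first EOS token, and `std::min` clamps to the last index when no EOS is present (in that case `std::find` returns `end()`, whose distance equals the vector's size). A standalone sketch of that logic, using CLIP's usual BOS/EOS ids (49406/49407, the latter matching the tokenizer argument earlier in this file):

#include <algorithm>
#include <cstddef>
#include <iterator>
#include <vector>

static size_t eos_index(const std::vector<int>& tokens, int eos_token_id) {
    auto it = std::find(tokens.begin(), tokens.end(), eos_token_id);
    return std::min<size_t>(std::distance(tokens.begin(), it), tokens.size() - 1);
}

int main() {
    bool ok = eos_index({49406, 320, 1125, 49407, 0}, 49407) == 3 &&  // first EOS at index 3
              eos_index({49406, 320, 1125}, 49407) == 2;              // no EOS: clamp to last index
    return ok ? 0 : 1;
}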
@@ -848,7 +845,7 @@ struct SD3CLIPEmbedder : public Conditioner {
             }
 
                 if (chunk_idx == 0) {
-                    auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
+                    auto it       = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
                     max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
                     clip_g->compute(n_threads,
                                     input_ids,
@@ -858,7 +855,6 @@ struct SD3CLIPEmbedder : public Conditioner {
                                     true,
                                     &pooled_g,
                                     work_ctx);
-
                 }
             }
 
@@ -971,21 +967,19 @@ struct SD3CLIPEmbedder : public Conditioner {
 };
 
 struct FluxCLIPEmbedder : public Conditioner {
-    ggml_type wtype;
     CLIPTokenizer clip_l_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
     std::shared_ptr<CLIPTextModelRunner> clip_l;
     std::shared_ptr<T5Runner> t5;
 
     FluxCLIPEmbedder(ggml_backend_t backend,
-                     ggml_type wtype,
-                     int clip_skip = -1)
-        : wtype(wtype) {
+                     std::map<std::string, enum ggml_type>& tensor_types,
+                     int clip_skip = -1) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, true);
-        t5     = std::make_shared<T5Runner>(backend, wtype);
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
+        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
     }
 
     void set_clip_skip(int clip_skip) {
@@ -1096,9 +1090,9 @@ struct FluxCLIPEmbedder : public Conditioner {
                 auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
                 size_t max_token_idx = 0;
 
-                auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+                auto it       = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
                 max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-
+
                 clip_l->compute(n_threads,
                                 input_ids,
                                 0,
@@ -1107,7 +1101,6 @@ struct FluxCLIPEmbedder : public Conditioner {
                                 true,
                                 &pooled,
                                 work_ctx);
-
             }
 
             // t5