@@ -1073,7 +1073,7 @@ struct FluxCLIPEmbedder : public Conditioner {
10731073 return {{clip_l_tokens, clip_l_weights}, {t5_tokens, t5_weights}};
10741074 }
10751075
1076- SDCondition get_learned_condition_common (ggml_context* work_ctx,
1076+ SDCondition get_learned_condition_common (ggml_context* work_ctx,
10771077 int n_threads,
10781078 std::vector<std::pair<std::vector<int >, std::vector<float >>> token_and_weights,
10791079 int clip_skip,
@@ -1084,100 +1084,62 @@ struct FluxCLIPEmbedder : public Conditioner {
10841084 auto & t5_tokens = token_and_weights[1 ].first ;
10851085 auto & t5_weights = token_and_weights[1 ].second ;
10861086
1087- int64_t t0 = ggml_time_ms ();
1088- struct ggml_tensor * hidden_states = NULL ; // [N, n_token, 4096]
1089- struct ggml_tensor * chunk_hidden_states = NULL ; // [n_token*2 , 4096]
1090- struct ggml_tensor * pooled = NULL ; // [768,]
1087+ int64_t t0 = ggml_time_ms ();
1088+ struct ggml_tensor * hidden_states = NULL ; // [N, n_token, 4096]
1089+ struct ggml_tensor * chunk_hidden_states = NULL ; // [n_token, 4096]
1090+ struct ggml_tensor * pooled = NULL ; // [768,]
10911091 std::vector<float > hidden_states_vec;
10921092
1093- size_t chunk_len_l = 77 ;
1094- size_t chunk_count_l = clip_l_tokens.size () / chunk_len_l;
1095-
1096- size_t chunk_len_t5 = 256 ;
1097- size_t chunk_count_t5 = t5_tokens.size () / chunk_len_t5;
1098-
1099- // TODO: I believe chunk_count_l is actually bigger than chunk_count_t5
1100- // So this ignores some tokens for clip
1101- size_t chunk_count = chunk_count_t5;
1102-
1093+ size_t chunk_len = 256 ;
1094+ size_t chunk_count = t5_tokens.size () / chunk_len;
11031095 for (int chunk_idx = 0 ; chunk_idx < chunk_count; chunk_idx++) {
1104- struct ggml_tensor * chunk_hidden_states_l = NULL ; // [n_token, hidden_size_l]
1105- struct ggml_tensor * chunk_hidden_states_t5 = NULL ; // [n_token, hidden_size_t5]
11061096 // clip_l
1107- if (chunk_idx < chunk_count_l) {
1108- std::vector<int > chunk_tokens (clip_l_tokens.begin () + chunk_idx * chunk_len_l,
1109- clip_l_tokens.begin () + (chunk_idx + 1 ) * chunk_len_l);
1110- std::vector<float > chunk_weights (clip_l_weights.begin () + chunk_idx * chunk_len_l,
1111- clip_l_weights.begin () + (chunk_idx + 1 ) * chunk_len_l);
1097+ if (chunk_idx == 0 ) {
1098+ size_t chunk_len_l = 77 ;
1099+ std::vector<int > chunk_tokens (clip_l_tokens.begin (),
1100+ clip_l_tokens.begin () + chunk_len_l);
1101+ std::vector<float > chunk_weights (clip_l_weights.begin (),
1102+ clip_l_weights.begin () + chunk_len_l);
11121103
11131104 auto input_ids = vector_to_ggml_tensor_i32 (work_ctx, chunk_tokens);
11141105 size_t max_token_idx = 0 ;
11151106
1107+ auto it = std::find (chunk_tokens.begin (), chunk_tokens.end (), clip_l_tokenizer.EOS_TOKEN_ID );
1108+ max_token_idx = std::min<size_t >(std::distance (chunk_tokens.begin (), it), chunk_tokens.size () - 1 );
1109+ LOG_INFO (" max_token_idx = %d" ,max_token_idx);
1110+
11161111 clip_l->compute (n_threads,
11171112 input_ids,
11181113 0 ,
11191114 NULL ,
11201115 max_token_idx,
1121- false ,
1122- &chunk_hidden_states_l ,
1116+ true ,
1117+ &pooled ,
11231118 work_ctx);
1124- {
1125- auto tensor = chunk_hidden_states_l;
1126- float original_mean = ggml_tensor_mean (tensor);
1127- for (int i2 = 0 ; i2 < tensor->ne [2 ]; i2++) {
1128- for (int i1 = 0 ; i1 < tensor->ne [1 ]; i1++) {
1129- for (int i0 = 0 ; i0 < tensor->ne [0 ]; i0++) {
1130- float value = ggml_tensor_get_f32 (tensor, i0, i1, i2);
1131- value *= chunk_weights[i1];
1132- ggml_tensor_set_f32 (tensor, value, i0, i1, i2);
1133- }
1134- }
1135- }
1136- float new_mean = ggml_tensor_mean (tensor);
1137- ggml_tensor_scale (tensor, (original_mean / new_mean));
1138- }
1139- if (chunk_idx == 0 ) {
1140- std::vector<int > chunk_tokens (clip_l_tokens.begin (),
1141- clip_l_tokens.begin () + chunk_len_l);
1142- std::vector<float > chunk_weights (clip_l_weights.begin (),
1143- clip_l_weights.begin () + chunk_len_l);
11441119
1145- auto input_ids = vector_to_ggml_tensor_i32 (work_ctx, chunk_tokens);
1146- size_t max_token_idx = 0 ;
1120+ LOG_INFO (" pooled->ne = [%d, %d, %d, %d] " ,pooled->ne [0 ], pooled->ne [1 ], pooled->ne [2 ], pooled->ne [3 ]);
11471121
1148- // auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
1149- // max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
1150- // clip_l->compute(n_threads,
1151- // input_ids,
1152- // 0,
1153- // NULL,
1154- // max_token_idx,
1155- // true,
1156- // &pooled,
1157- // work_ctx);
1158-
1159- // clip_l.transformer.text_model.text_projection no in file, ignore
1160- // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
1161- pooled = ggml_new_tensor_1d (work_ctx, GGML_TYPE_F32, 768 );
1162- ggml_set_f32 (pooled, 0 .f );
1163- }
1122+ // clip_l.transformer.text_model.text_projection not in file, ignore
1123+ // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
1124+ // pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
1125+ // ggml_set_f32(pooled, 0.f);
11641126 }
11651127
11661128 // t5
1167- if (chunk_idx < chunk_count_t5) {
1168- std::vector<int > chunk_tokens (t5_tokens.begin () + chunk_idx * chunk_len_t5 ,
1169- t5_tokens.begin () + (chunk_idx + 1 ) * chunk_len_t5 );
1170- std::vector<float > chunk_weights (t5_weights.begin () + chunk_idx * chunk_len_t5 ,
1171- t5_weights.begin () + (chunk_idx + 1 ) * chunk_len_t5 );
1129+ {
1130+ std::vector<int > chunk_tokens (t5_tokens.begin () + chunk_idx * chunk_len ,
1131+ t5_tokens.begin () + (chunk_idx + 1 ) * chunk_len );
1132+ std::vector<float > chunk_weights (t5_weights.begin () + chunk_idx * chunk_len ,
1133+ t5_weights.begin () + (chunk_idx + 1 ) * chunk_len );
11721134
11731135 auto input_ids = vector_to_ggml_tensor_i32 (work_ctx, chunk_tokens);
11741136
11751137 t5->compute (n_threads,
11761138 input_ids,
1177- &chunk_hidden_states_t5 ,
1139+ &chunk_hidden_states ,
11781140 work_ctx);
11791141 {
1180- auto tensor = chunk_hidden_states_t5 ;
1142+ auto tensor = chunk_hidden_states ;
11811143 float original_mean = ggml_tensor_mean (tensor);
11821144 for (int i2 = 0 ; i2 < tensor->ne [2 ]; i2++) {
11831145 for (int i1 = 0 ; i1 < tensor->ne [1 ]; i1++) {
@@ -1193,33 +1155,6 @@ struct FluxCLIPEmbedder : public Conditioner {
11931155 }
11941156 }
11951157
1196-
1197- // TODO: Maybe there's a better way to do the padding?
1198- auto chunk_hidden_states_l_pad = ggml_new_tensor_3d (work_ctx,
1199- chunk_hidden_states_l->type ,
1200- 4096 ,
1201- chunk_hidden_states_l->ne [1 ],
1202- chunk_hidden_states_l->ne [2 ]); // [n_token, 4096]
1203-
1204- for (int i2 = 0 ; i2 < chunk_hidden_states_l_pad->ne [2 ]; i2++) {
1205- for (int i1 = 0 ; i1 < chunk_hidden_states_l_pad->ne [1 ]; i1++) {
1206- for (int i0 = 0 ; i0 < chunk_hidden_states_l_pad->ne [0 ]; i0++) {
1207- float value = 0 .f ;
1208- if (i0 < chunk_hidden_states_l->ne [0 ]) {
1209- value = ggml_tensor_get_f32 (chunk_hidden_states_l, i0, i1, i2);
1210- }
1211- ggml_tensor_set_f32 (chunk_hidden_states_l_pad, value, i0, i1, i2);
1212- }
1213- }
1214- }
1215-
1216- if (chunk_hidden_states_t5 == NULL ){
1217- chunk_hidden_states = chunk_hidden_states_l_pad;
1218- } else {
1219- chunk_hidden_states = ggml_tensor_concat (work_ctx, chunk_hidden_states_l_pad, chunk_hidden_states_t5, 1 ); // [n_token*2, 4096]
1220- }
1221-
1222-
12231158 int64_t t1 = ggml_time_ms ();
12241159 LOG_DEBUG (" computing condition graph completed, taking %" PRId64 " ms" , t1 - t0);
12251160 if (force_zero_embeddings) {
0 commit comments