Skip to content

Commit 8e33f42

Browse files
committed
Clip: Fixed for real this time, I swear
1 parent ea40b1c commit 8e33f42

File tree

2 files changed

+36
-97
lines changed

2 files changed

+36
-97
lines changed

clip.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -711,7 +711,11 @@ class CLIPTextModel : public GGMLBlock {
711711
if (return_pooled) {
712712
auto text_projection = params["text_projection"];
713713
ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
714-
pooled = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, text_projection)), pooled);
714+
if(text_projection != NULL){
715+
pooled = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, text_projection)), pooled);
716+
}else{
717+
LOG_DEBUG("Missing text_projection matrix, assuming identity...");
718+
}
715719
return pooled;
716720
}
717721

conditioner.hpp

Lines changed: 31 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -1073,7 +1073,7 @@ struct FluxCLIPEmbedder : public Conditioner {
10731073
return {{clip_l_tokens, clip_l_weights}, {t5_tokens, t5_weights}};
10741074
}
10751075

1076-
SDCondition get_learned_condition_common(ggml_context* work_ctx,
1076+
SDCondition get_learned_condition_common(ggml_context* work_ctx,
10771077
int n_threads,
10781078
std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
10791079
int clip_skip,
@@ -1084,100 +1084,62 @@ struct FluxCLIPEmbedder : public Conditioner {
10841084
auto& t5_tokens = token_and_weights[1].first;
10851085
auto& t5_weights = token_and_weights[1].second;
10861086

1087-
int64_t t0 = ggml_time_ms();
1088-
struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096]
1089-
struct ggml_tensor* chunk_hidden_states = NULL; // [n_token*2, 4096]
1090-
struct ggml_tensor* pooled = NULL; // [768,]
1087+
int64_t t0 = ggml_time_ms();
1088+
struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096]
1089+
struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096]
1090+
struct ggml_tensor* pooled = NULL; // [768,]
10911091
std::vector<float> hidden_states_vec;
10921092

1093-
size_t chunk_len_l = 77;
1094-
size_t chunk_count_l = clip_l_tokens.size() / chunk_len_l;
1095-
1096-
size_t chunk_len_t5 = 256;
1097-
size_t chunk_count_t5 = t5_tokens.size() / chunk_len_t5;
1098-
1099-
// TODO: I believe chunk_count_l is actually bigger than chunk_count_t5
1100-
// So this ignores some tokens for clip
1101-
size_t chunk_count = chunk_count_t5;
1102-
1093+
size_t chunk_len = 256;
1094+
size_t chunk_count = t5_tokens.size() / chunk_len;
11031095
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
1104-
struct ggml_tensor* chunk_hidden_states_l = NULL; // [n_token, hidden_size_l]
1105-
struct ggml_tensor* chunk_hidden_states_t5 = NULL; // [n_token, hidden_size_t5]
11061096
// clip_l
1107-
if(chunk_idx < chunk_count_l) {
1108-
std::vector<int> chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len_l,
1109-
clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len_l);
1110-
std::vector<float> chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len_l,
1111-
clip_l_weights.begin() + (chunk_idx + 1) * chunk_len_l);
1097+
if (chunk_idx == 0) {
1098+
size_t chunk_len_l = 77;
1099+
std::vector<int> chunk_tokens(clip_l_tokens.begin(),
1100+
clip_l_tokens.begin() + chunk_len_l);
1101+
std::vector<float> chunk_weights(clip_l_weights.begin(),
1102+
clip_l_weights.begin() + chunk_len_l);
11121103

11131104
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
11141105
size_t max_token_idx = 0;
11151106

1107+
auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
1108+
max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
1109+
LOG_INFO("max_token_idx = %d",max_token_idx);
1110+
11161111
clip_l->compute(n_threads,
11171112
input_ids,
11181113
0,
11191114
NULL,
11201115
max_token_idx,
1121-
false,
1122-
&chunk_hidden_states_l,
1116+
true,
1117+
&pooled,
11231118
work_ctx);
1124-
{
1125-
auto tensor = chunk_hidden_states_l;
1126-
float original_mean = ggml_tensor_mean(tensor);
1127-
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
1128-
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
1129-
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
1130-
float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
1131-
value *= chunk_weights[i1];
1132-
ggml_tensor_set_f32(tensor, value, i0, i1, i2);
1133-
}
1134-
}
1135-
}
1136-
float new_mean = ggml_tensor_mean(tensor);
1137-
ggml_tensor_scale(tensor, (original_mean / new_mean));
1138-
}
1139-
if (chunk_idx == 0) {
1140-
std::vector<int> chunk_tokens(clip_l_tokens.begin(),
1141-
clip_l_tokens.begin() + chunk_len_l);
1142-
std::vector<float> chunk_weights(clip_l_weights.begin(),
1143-
clip_l_weights.begin() + chunk_len_l);
11441119

1145-
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
1146-
size_t max_token_idx = 0;
1120+
LOG_INFO("pooled->ne = [%d, %d, %d, %d] ",pooled->ne[0], pooled->ne[1], pooled->ne[2], pooled->ne[3]);
11471121

1148-
// auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
1149-
// max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
1150-
// clip_l->compute(n_threads,
1151-
// input_ids,
1152-
// 0,
1153-
// NULL,
1154-
// max_token_idx,
1155-
// true,
1156-
// &pooled,
1157-
// work_ctx);
1158-
1159-
// clip_l.transformer.text_model.text_projection no in file, ignore
1160-
// TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
1161-
pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
1162-
ggml_set_f32(pooled, 0.f);
1163-
}
1122+
// clip_l.transformer.text_model.text_projection no in file, ignore
1123+
// TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
1124+
// pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
1125+
// ggml_set_f32(pooled, 0.f);
11641126
}
11651127

11661128
// t5
1167-
if(chunk_idx < chunk_count_t5) {
1168-
std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len_t5,
1169-
t5_tokens.begin() + (chunk_idx + 1) * chunk_len_t5);
1170-
std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len_t5,
1171-
t5_weights.begin() + (chunk_idx + 1) * chunk_len_t5);
1129+
{
1130+
std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
1131+
t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
1132+
std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
1133+
t5_weights.begin() + (chunk_idx + 1) * chunk_len);
11721134

11731135
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
11741136

11751137
t5->compute(n_threads,
11761138
input_ids,
1177-
&chunk_hidden_states_t5,
1139+
&chunk_hidden_states,
11781140
work_ctx);
11791141
{
1180-
auto tensor = chunk_hidden_states_t5;
1142+
auto tensor = chunk_hidden_states;
11811143
float original_mean = ggml_tensor_mean(tensor);
11821144
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
11831145
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
@@ -1193,33 +1155,6 @@ struct FluxCLIPEmbedder : public Conditioner {
11931155
}
11941156
}
11951157

1196-
1197-
// TODO: Maybe there's a better way to do the padding?
1198-
auto chunk_hidden_states_l_pad = ggml_new_tensor_3d(work_ctx,
1199-
chunk_hidden_states_l->type,
1200-
4096,
1201-
chunk_hidden_states_l->ne[1],
1202-
chunk_hidden_states_l->ne[2]); // [n_token, 4096]
1203-
1204-
for (int i2 = 0; i2 < chunk_hidden_states_l_pad->ne[2]; i2++) {
1205-
for (int i1 = 0; i1 < chunk_hidden_states_l_pad->ne[1]; i1++) {
1206-
for (int i0 = 0; i0 < chunk_hidden_states_l_pad->ne[0]; i0++) {
1207-
float value = 0.f;
1208-
if (i0 < chunk_hidden_states_l->ne[0]) {
1209-
value = ggml_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2);
1210-
}
1211-
ggml_tensor_set_f32(chunk_hidden_states_l_pad, value, i0, i1, i2);
1212-
}
1213-
}
1214-
}
1215-
1216-
if(chunk_hidden_states_t5 == NULL){
1217-
chunk_hidden_states = chunk_hidden_states_l_pad;
1218-
} else {
1219-
chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states_l_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
1220-
}
1221-
1222-
12231158
int64_t t1 = ggml_time_ms();
12241159
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
12251160
if (force_zero_embeddings) {

0 commit comments

Comments
 (0)