Skip to content

Commit 48a941d

Browse files
Use current defaults for feature layer
Signed-off-by: Alex-Brooks <[email protected]>
1 parent 8b6db6c commit 48a941d

File tree

2 files changed

+28
-17
lines changed

2 files changed

+28
-17
lines changed

examples/llava/clip.cpp

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -754,22 +754,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
754754
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
755755
}
756756

757-
// Check to see we have 1+ set vision feature layers set; otherwise it's the last layer
758757
std::vector<struct ggml_tensor *> embedding_stack;
759-
bool has_feature_layers = ctx->vision_model.hparams.vision_feature_layer[0] > 0;
760-
// Determine how many encoder layers we need to process; if we have explicit vision feature
761-
// layers, only process what we need, otherwise process all of the visual encoder layers.
762-
int max_feature_layer = -1;
763-
if(has_feature_layers) {
764-
for(int vf_layer_idx = 0; vf_layer_idx < 4; vf_layer_idx++) {
765-
if(ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx] > max_feature_layer) {
766-
max_feature_layer = ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx];
767-
}
768-
}
769-
}
770-
if(max_feature_layer < 0) {
771-
max_feature_layer = n_layer;
772-
}
758+
// Check whether we have 1+ vision feature layers set; otherwise it's determined
759+
// by the type of projector that this model has (usually last or second to last layer).
760+
int max_feature_layer = get_deepest_feature_layer(ctx);
773761

774762
// loop over layers
775763
for (int il = 0; il < max_feature_layer; il++) {
@@ -890,7 +878,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
890878
}
891879

892880
// If feature layers are explicitly set, stack them (if we have multiple)
893-
if(has_feature_layers && embedding_stack.size() > 0) {
881+
if(embedding_stack.size() > 0) {
894882
embeddings = embedding_stack.at(0);
895883
for(unsigned long i=1; i < embedding_stack.size(); i++) {
896884
embeddings = ggml_concat(ctx0, embeddings, embedding_stack.at(i), 0);
@@ -2990,6 +2978,27 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
29902978
return ctx->has_qwen2vl_merger;
29912979
}
29922980

2981+
// Determine the number of encoder layers to iterate over
2982+
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx) {
2983+
// Get the index of the second to last layer; this is the
2984+
// default for models that have a llava projector
2985+
int n_layer = ctx->vision_model.hparams.n_layer - 1;
2986+
int deepest_feature_layer = -1;
2987+
2988+
// Handle other projectors; incrementing here indicates that we
2989+
// should use the last encoder layer for the vision features.
2990+
if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
2991+
n_layer += 1;
2992+
}
2993+
2994+
// If we set explicit vision feature layers, only go up to the deepest one
2995+
for(int i = 0; i < 4; i++) {
2996+
if(ctx->vision_model.hparams.vision_feature_layer[i] > deepest_feature_layer) {
2997+
deepest_feature_layer = ctx->vision_model.hparams.vision_feature_layer[i];
2998+
}
2999+
}
3000+
return deepest_feature_layer < 0 ? n_layer: deepest_feature_layer;
3001+
}
29933002

29943003
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
29953004
clip_image_f32 clip_img;

examples/llava/clip.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,11 +89,13 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
8989
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
9090

9191
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
92+
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
9293
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
9394

95+
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
96+
9497
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
9598

96-
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
9799

98100
#ifdef __cplusplus
99101
}

0 commit comments

Comments
 (0)