Skip to content

Commit 07e7716

Browse files
Fix unset vision layer 0
Signed-off-by: Alex-Brooks <[email protected]>
1 parent ee6fb4d commit 07e7716

File tree

1 file changed

+9
-8
lines changed

1 file changed

+9
-8
lines changed

examples/llava/clip.cpp

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -770,8 +770,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
770770

771771
// If this is an embedding feature layer, save the output.
772772
// NOTE: 0 index here refers to the input to the encoder.
773-
for (int vf_layer_idx = 0; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx++) {
774-
if (il == ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx]) {
773+
for (int vl_idx = 0; vl_idx < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[vl_idx] > 0); vl_idx++) {
774+
if (il == ctx->vision_model.hparams.vision_feature_layer[vl_idx]) {
775775
embedding_stack.push_back(embeddings);
776776
break;
777777
}
@@ -875,8 +875,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
875875
}
876876

877877
// final layer is a vision feature layer
878-
for (int vf_layer_idx = 0; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx++) {
879-
if (n_layer == ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx]) {
878+
for (int vl_idx = 0; vl_idx < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[vl_idx] > 0); vl_idx++) {
879+
if (n_layer == ctx->vision_model.hparams.vision_feature_layer[vl_idx]) {
880880
embedding_stack.push_back(embeddings);
881881
break;
882882
}
@@ -2995,7 +2995,8 @@ size_t get_max_image_grid_pinpoints() {
29952995
int get_deepest_feature_layer(const struct clip_ctx * ctx) {
29962996
// Get the index of the second to last layer; this is the
29972997
// default for models that have a llava projector
2998-
int n_layer = ctx->vision_model.hparams.n_layer - 1;
2998+
const auto & hparams = ctx->vision_model.hparams;
2999+
int n_layer = hparams.n_layer - 1;
29993000
int deepest_feature_layer = -1;
30003001

30013002
// Handle other projectors; incrementing here indicates that we
@@ -3005,9 +3006,9 @@ int get_deepest_feature_layer(const struct clip_ctx * ctx) {
30053006
}
30063007

30073008
// If we set explicit vision feature layers, only go up to the deepest one
3008-
for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS; i++) {
3009-
if (ctx->vision_model.hparams.vision_feature_layer[i] > deepest_feature_layer) {
3010-
deepest_feature_layer = ctx->vision_model.hparams.vision_feature_layer[i];
3009+
for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[i] > 0); i++) {
3010+
if (hparams.vision_feature_layer[i] > deepest_feature_layer) {
3011+
deepest_feature_layer = hparams.vision_feature_layer[i];
30113012
}
30123013
}
30133014
return deepest_feature_layer < 0 ? n_layer: deepest_feature_layer;

0 commit comments

Comments
 (0)