Commit e16e397 — "move max_feature_layer to build_llava"
(parent commit: 47517a1)

File tree

1 file changed: 23 additions (+), 27 deletions (-)

tools/mtmd/clip.cpp

Lines changed: 23 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,6 @@ struct clip_ctx {
329329
struct clip_vision_model vision_model;
330330
projector_type proj_type = PROJECTOR_TYPE_MLP;
331331

332-
int32_t max_feature_layer; // unused in newer models like gemma3
333332
float image_mean[3];
334333
float image_std[3];
335334

@@ -867,6 +866,27 @@ struct clip_graph {
867866

868867
GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
869868

869+
// Calculate the deepest feature layer based on hparams and projector type
870+
int max_feature_layer = n_layer;
871+
{
872+
// Get the index of the second to last layer; this is the default for models that have a llava projector
873+
int il_last = hparams.n_layer - 1;
874+
int deepest_feature_layer = -1;
875+
876+
if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
877+
il_last += 1;
878+
}
879+
880+
// If we set explicit vision feature layers, only go up to the deepest one
881+
// NOTE: only used by granite-vision models for now
882+
for (const auto & feature_layer : hparams.vision_feature_layer) {
883+
if (feature_layer > deepest_feature_layer) {
884+
deepest_feature_layer = feature_layer;
885+
}
886+
}
887+
max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
888+
}
889+
870890
ggml_tensor * inp = build_inp();
871891

872892
if (model.patch_bias) {
@@ -896,7 +916,7 @@ struct clip_graph {
896916
const auto & vision_feature_layer = hparams.vision_feature_layer;
897917

898918
// loop over layers
899-
for (int il = 0; il < ctx->max_feature_layer; il++) {
919+
for (int il = 0; il < max_feature_layer; il++) {
900920
auto & layer = model.layers[il];
901921
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
902922

@@ -977,7 +997,7 @@ struct clip_graph {
977997
// process vision feature layers (used by granite)
978998
{
979999
// final layer is a vision feature layer
980-
if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) {
1000+
if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
9811001
embedding_stack.push_back(inpL);
9821002
}
9831003

@@ -1760,30 +1780,6 @@ struct clip_model_loader {
17601780
hparams.vision_feature_layer.insert(layer);
17611781
}
17621782

1763-
// Calculate the deepest feature layer based on hparams and projector type
1764-
// NOTE: This is only used by build_graph_legacy()
1765-
{
1766-
// Get the index of the second to last layer; this is the default for models that have a llava projector
1767-
int n_layer = hparams.n_layer - 1;
1768-
int deepest_feature_layer = -1;
1769-
1770-
if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV
1771-
|| ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE
1772-
|| ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL
1773-
|| ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) {
1774-
n_layer += 1;
1775-
}
1776-
1777-
// If we set explicit vision feature layers, only go up to the deepest one
1778-
// NOTE: only used by granite-vision models for now
1779-
for (const auto & feature_layer : hparams.vision_feature_layer) {
1780-
if (feature_layer > deepest_feature_layer) {
1781-
deepest_feature_layer = feature_layer;
1782-
}
1783-
}
1784-
ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
1785-
}
1786-
17871783
// model-specific params
17881784
switch (ctx_clip.proj_type) {
17891785
case PROJECTOR_TYPE_MINICPMV:

Comments (0)