
Commit 0b37fff

code clean
1 parent 794481e commit 0b37fff

3 files changed, +24 -24 lines changed

gguf-py/gguf/constants.py

Lines changed: 3 additions & 3 deletions
@@ -278,7 +278,7 @@ class ClipVision:
         USE_GELU            = "clip.use_gelu"
         USE_SILU            = "clip.use_silu"
         N_WA_PATTERN        = "clip.vision.n_wa_pattern" # used by qwen2.5vl
-        IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers"
+        IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers"

     class Attention:
         HEAD_COUNT = "clip.vision.attention.head_count"
@@ -646,8 +646,8 @@ class MODEL_TENSOR(IntEnum):
     V_TOK_EMBD_IMG_BREAK = auto() # pixtral
     V_MM_PATCH_MERGER    = auto() # mistral small 3.1
     V_DS_NORM            = auto() # qwen3vl
-    V_DS_FC1 = auto() # qwen3vl
-    V_DS_FC2 = auto() # qwen3vl
+    V_DS_FC1             = auto() # qwen3vl
+    V_DS_FC2             = auto() # qwen3vl
     # audio (mtmd)
     A_ENC_EMBD_POS = auto()
     A_ENC_CONV1D   = auto()
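The key name suggests a per-layer boolean array marking which vision encoder layers contribute deepstack features. A minimal sketch of that reading, in C++ (the layout is an assumption based on the key name, not confirmed by this diff):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
        // assumed layout: one flag per vision encoder layer, true where that
        // layer's output also feeds a deepstack merger branch
        std::vector<bool> is_deepstack_layers = {false, false, true, false, true, false};

        for (size_t il = 0; il < is_deepstack_layers.size(); il++) {
            if (is_deepstack_layers[il]) {
                printf("layer %zu feeds a deepstack branch\n", il);
            }
        }
        return 0;
    }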

src/llama-model.cpp

Lines changed: 17 additions & 17 deletions
@@ -1027,7 +1027,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_QWEN3VL:
             {
-                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, 0);
+                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
                 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
@@ -1036,8 +1036,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // for deepstack patch, we consider the embd to be [main_embd, deepstack_embd_1, deepstack_embd_2, ...]
-                hparams.n_embd = hparams.n_embd * (hparams.n_deepstack_layers + 1);
+                // since vision model stacks deepstack features along feature dim
+                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
+                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_QWEN3MOE:
             {
@@ -1052,17 +1053,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_QWEN3VLMOE:
             {
-                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, 0);
+                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
                 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 48: type = LLM_TYPE_30B_A3B; break;
                     case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // for deepstack patch, we consider the embd to be [main_embd, deepstack_embd_1, deepstack_embd_2, ...]
-                hparams.n_embd = hparams.n_embd * (hparams.n_deepstack_layers + 1);
+                // since vision model stacks deepstack features along feature dim
+                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
+                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_PHI2:
             {
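Two things happen in the hunks above: the literal 0 passed to ml.get_key becomes false (the third argument is get_key's required flag, so this is a type cleanup rather than a behavior change), and the n_embd adjustment is rewritten as a compound assignment. The arithmetic round-trips with the division in load_tensors below; a minimal sketch with illustrative numbers (the width and layer count are assumptions, not values from the commit):

    #include <cassert>
    #include <cstdint>

    int main() {
        // illustrative values only, not taken from the commit
        uint32_t n_embd             = 2048; // main text-model width
        uint32_t n_deepstack_layers = 3;    // deepstack feature slots

        // load_hparams: widen the fake "n_embd" so the text model sees
        // [main_embd, deepstack_embd_1, ..., deepstack_embd_N] as one vector
        n_embd *= n_deepstack_layers + 1;
        assert(n_embd == 8192);

        // load_tensors: recover the real weight width by dividing back
        const uint32_t n_embd_main = n_embd / (n_deepstack_layers + 1);
        assert(n_embd_main == 2048);
        return 0;
    }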
@@ -3307,11 +3309,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3VL:
             {
-                int64_t n_embd = hparams.n_embd;
-                // for deepstack features, we consider the embd to be [main_embd, deepstack_embd_1, deepstack_embd_2, ...]
-                if (arch == LLM_ARCH_QWEN3VL) {
-                    n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
-                }
+                // for model loading, the weights only have the main embd
+                // so we need to divide by the number of deepstack layers + 1
+                // n_embd is const int so we declare a new variable
+                int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                 // output
@@ -3347,11 +3348,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_QWEN3VLMOE:
             {
-                // for deepstack features, we consider the embd to be [main_embd, deepstack_embd_1, deepstack_embd_2, ...]
-                int64_t n_embd = hparams.n_embd;
-                if (arch == LLM_ARCH_QWEN3VLMOE) {
-                    n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
-                }
+                // for model loading, the weights only have the main embd
+                // so we need to divide by the number of deepstack layers + 1
+                // n_embd is const int so we declare a new variable
+                int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                 // output
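Since load_hparams widened hparams.n_embd, load_tensors divides it back so the tensor shapes match the checkpoint. The cleanup can drop the per-arch check because n_deepstack_layers defaults to 0 for plain Qwen3, making the division by 1 a no-op. For intuition, a sketch of the concatenated layout [main_embd, deepstack_embd_1, ...] that the fake n_embd describes (the helper name here is hypothetical):

    #include <cstddef>
    #include <vector>

    // hypothetical helper: view one slot of the concatenated embedding
    // [main, ds_1, ..., ds_N]; slot 0 is the main embedding
    static const float * deepstack_slot(const std::vector<float> & embd_full,
                                        size_t n_embd_main, size_t slot) {
        return embd_full.data() + slot * n_embd_main;
    }

    int main() {
        const size_t n_embd_main = 4, n_deepstack_layers = 3;
        std::vector<float> embd_full(n_embd_main * (n_deepstack_layers + 1), 0.0f);

        // the token-embedding weight only covers slot 0, which is why
        // load_tensors divides hparams.n_embd back down before create_tensor
        const float * main_embd = deepstack_slot(embd_full, n_embd_main, 0);
        (void) main_embd;
        return 0;
    }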

tools/mtmd/clip-impl.h

Lines changed: 4 additions & 4 deletions
@@ -39,7 +39,7 @@
 #define KEY_FEATURE_LAYER        "clip.vision.feature_layer"
 #define KEY_PROJ_SCALE_FACTOR    "clip.vision.projector.scale_factor"
 #define KEY_SPATIAL_MERGE_SIZE   "clip.vision.spatial_merge_size"
-#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
+#define KEY_IS_DEEPSTACK_LAYERS  "clip.vision.is_deepstack_layers"

 #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
@@ -94,9 +94,9 @@
 #define TN_TOK_IMG_BREAK   "v.token_embd.img_break" // pixtral
 #define TN_TOK_GLM_BOI     "adapter.boi" // glm-edge (these embeddings are not in text model)
 #define TN_TOK_GLM_EOI     "adapter.eoi" // glm-edge (these embeddings are not in text model)
-#define TN_DEEPSTACK_NORM "v.deepstack.%d.norm.%s" // qwen3vl deepstack
-#define TN_DEEPSTACK_FC1 "v.deepstack.%d.fc1.%s" // qwen3vl deepstack
-#define TN_DEEPSTACK_FC2 "v.deepstack.%d.fc2.%s" // qwen3vl deepstack
+#define TN_DEEPSTACK_NORM  "v.deepstack.%d.norm.%s" // qwen3vl deepstack
+#define TN_DEEPSTACK_FC1   "v.deepstack.%d.fc1.%s"  // qwen3vl deepstack
+#define TN_DEEPSTACK_FC2   "v.deepstack.%d.fc2.%s"  // qwen3vl deepstack

 // mimicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
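The TN_DEEPSTACK_* entries are printf-style templates: %d takes the deepstack layer index and %s the tensor suffix such as "weight" or "bias". A minimal sketch of how such a template expands into a concrete GGUF tensor name (the usage site is assumed, not shown in this diff):

    #include <cstdio>

    int main() {
        // expand the TN_DEEPSTACK_FC1 template with a layer index and suffix
        char name[256];
        snprintf(name, sizeof(name), "v.deepstack.%d.fc1.%s", 0, "weight");
        printf("%s\n", name); // prints: v.deepstack.0.fc1.weight
        return 0;
    }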
