 #define LLAMA_MAX_LAYERS 512
 #define LLAMA_MAX_EXPERTS 256 // DeepSeekV3

-enum llama_expert_gating_func_type {
-    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
-};
-
-enum llama_swa_type {
-    LLAMA_SWA_TYPE_NONE     = 0,
-    LLAMA_SWA_TYPE_STANDARD = 1,
-    LLAMA_SWA_TYPE_CHUNKED  = 2,
-};
-
+// Internal helper structs, kept here on the assumption that they are not part of
+// the public API but are used by other files that include src/llama-hparams.h.
+// If they are actually part of the public llama_hparams API, they should be
+// moved to include/llama.h instead.
 struct llama_hparams_posnet {
     uint32_t n_embd;
     uint32_t n_layer;
@@ -30,161 +22,7 @@ struct llama_hparams_convnext {
     uint32_t n_layer;
 };

-struct llama_hparams {
-    bool vocab_only;
-    bool rope_finetuned;
-    bool use_par_res;
-    bool swin_norm;
-
-    uint32_t n_ctx_train; // context size the model was trained on
-    uint32_t n_embd;
-    uint32_t n_embd_features = 0;
-    uint32_t n_layer;
-    uint32_t n_rot;
-    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
-    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
-    uint32_t n_expert = 0;
-    uint32_t n_expert_used = 0;
-    uint32_t n_rel_attn_bkts = 0;
-
-    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
-    uint32_t n_embd_head_k_mla = 0;
-    uint32_t n_embd_head_v_mla = 0;
-
-    // for WavTokenizer
-    struct llama_hparams_posnet   posnet;
-    struct llama_hparams_convnext convnext;
-
-    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
-    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
-    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
-
-    uint32_t n_layer_dense_lead = 0;
-    uint32_t n_lora_q           = 0;
-    uint32_t n_lora_kv          = 0;
-    uint32_t n_ff_exp           = 0;
-    uint32_t n_ff_shexp         = 0;
-    uint32_t n_expert_shared    = 0;
-    uint32_t n_norm_groups      = 0;
-
-    float    expert_weights_scale = 0.0;
-    bool     expert_weights_norm  = false;
-    uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
-    uint32_t moe_every_n_layers   = 0;
-
-    float f_norm_eps;
-    float f_norm_rms_eps;
-    float f_norm_group_eps;
-
-    float f_attn_logit_softcapping  = 50.0f;
-    float f_final_logit_softcapping = 30.0f;
-
-    // for RWKV
-    uint32_t rescale_every_n_layers = 0;
-    uint32_t time_mix_extra_dim     = 0;
-    uint32_t time_decay_extra_dim   = 0;
-    uint32_t wkv_head_size          = 0;
-    uint32_t token_shift_count      = 2;
-    uint32_t n_lora_decay           = 0;
-    uint32_t n_lora_iclr            = 0;
-    uint32_t n_lora_value_res_mix   = 0;
-    uint32_t n_lora_gate            = 0;
-
-    float    rope_attn_factor = 1.0f;
-    float    rope_freq_base_train;
-    float    rope_freq_base_train_swa;
-    float    rope_freq_scale_train;
-    float    rope_freq_scale_train_swa;
-    uint32_t n_ctx_orig_yarn;
-    float    rope_yarn_log_mul;
-
-    std::array<int, 4> rope_sections;
-
-    // Sliding Window Attention (SWA)
-    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
-    // the size of the sliding window (0 - no SWA)
-    uint32_t n_swa = 0;
-    // if swa_layers[il] == true, then layer il is SWA
-    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
-    // by default, all layers are dense
-    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
-
-    // for State Space Models
-    uint32_t ssm_d_conv  = 0;
-    uint32_t ssm_d_inner = 0;
-    uint32_t ssm_d_state = 0;
-    uint32_t ssm_dt_rank = 0;
-
-    bool ssm_dt_b_c_rms = false;
-
-    float f_clamp_kqv      = 0.0f;
-    float f_max_alibi_bias = 0.0f;
-    float f_logit_scale    = 0.0f;
-
-    // Additional scale factors (Granite/Granite MoE)
-    float f_residual_scale  = 0.0f;
-    float f_embedding_scale = 0.0f;
-    float f_attention_scale = 0.0f;
-
-    bool causal_attn   = true;
-    bool use_alibi     = false;
-    bool attn_soft_cap = false;
-    bool use_kq_norm   = true;
-
-    // llama4
-    uint32_t n_moe_layer_step        = 0;
-    uint32_t n_no_rope_layer_step    = 4;
-    uint32_t n_attn_temp_floor_scale = 8192;
-    float    f_attn_temp_scale       = 0.1;
-
-    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
-    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
-    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
-
-    enum llama_pooling_type      pooling_type            = LLAMA_POOLING_TYPE_NONE;
-    enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;
-    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
-
-    // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
-    // note that if n_pattern == 0, all layers are SWA
-    // if n_pattern == 1, all layers are dense
-    // example: n_pattern = 3
-    //   il == 0: swa
-    //   il == 1: swa
-    //   il == 2: dense
-    //   il == 3: swa
-    //   il == 4: swa
-    //   il == 5: dense
-    //   il == 6: swa
-    //   etc ...
-    void set_swa_pattern(uint32_t n_pattern);
-
-    // return true if one of the layers is SWA
-    bool is_swa_any() const;
-
-    uint32_t n_head(uint32_t il = 0) const;
-
-    uint32_t n_head_kv(uint32_t il = 0) const;
-
-    uint32_t n_ff(uint32_t il = 0) const;
-
-    uint32_t n_gqa(uint32_t il = 0) const;
-
-    // dimension of key embeddings across all k-v heads
-    uint32_t n_embd_k_gqa(uint32_t il = 0) const;
-
-    // dimension of value embeddings across all k-v heads
-    uint32_t n_embd_v_gqa(uint32_t il = 0) const;
-
-    // dimension of the rolling state embeddings
-    // corresponds to Mamba's conv_states size or RWKV's token_shift states size
-    uint32_t n_embd_k_s() const;
-
-    // dimension of the recurrent state embeddings
-    uint32_t n_embd_v_s() const;
-
-    bool is_swa(uint32_t il) const;
-};
-
-static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-
+// All other definitions previously in this file (LLAMA_MAX_LAYERS,
+// enum llama_expert_gating_func_type, enum llama_swa_type,
+// struct llama_hparams, and the static_assert) are removed
+// to defer to the definitions in "llama.h".
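The removed set_swa_pattern(uint32_t n_pattern) declaration above only documents the intended dense/SWA layout, not its implementation. A minimal sketch that reproduces the documented pattern, assuming the swa_layers array and n_layer field from the removed struct, might look like this (the actual definition lives on the .cpp side and may differ):

    // Sketch only: mark every n_pattern-th layer as dense and the rest as SWA,
    // matching the documented example (n_pattern == 3 -> swa, swa, dense, ...).
    void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
        for (uint32_t il = 0; il < n_layer; ++il) {
            // n_pattern == 0: all layers SWA; n_pattern == 1: all layers dense
            swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
        }
    }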
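Likewise, the per-layer accessors (n_head, n_head_kv, n_embd_k_gqa, n_embd_v_gqa) are only declared in the removed struct. A plausible sketch of how they relate to the per-layer arrays and head dimensions, based solely on the comments in the header and not on the actual implementation:

    // Sketch only: per-layer lookups into the arrays declared in the struct.
    uint32_t llama_hparams::n_head(uint32_t il) const {
        return n_head_arr[il];
    }

    uint32_t llama_hparams::n_head_kv(uint32_t il) const {
        return n_head_kv_arr[il];
    }

    // "dimension of key embeddings across all k-v heads"
    uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
        return n_embd_head_k * n_head_kv(il);
    }

    // "dimension of value embeddings across all k-v heads"
    uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
        return n_embd_head_v * n_head_kv(il);
    }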