@@ -1635,7 +1635,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // that have no expert_gating_func model parameter set
                     hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                 }
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+                    // cancel the 0.1 factor applied by the convert script
+                    hparams.rope_yarn_log_mul /= 0.1f;
+                }

                 // (optional) temperature tuning - used by mistral-large
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
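
Note on the division above: it assumes the GGUF key stores 0.1 * mscale_all_dim, which is what the convert script writes for these models, so dividing by 0.1f recovers the raw mscale_all_dim from the original HF config. A minimal standalone sketch of the round trip (the 0.707f value mirrors the DeepSeek-V2-Lite config referenced later in this diff; everything else is illustrative):

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    int main() {
        const float mscale_all_dim = 0.707f;                // value from the HF config (illustrative)
        const float stored         = 0.1f * mscale_all_dim; // what the convert script writes to GGUF
        const float recovered      = stored / 0.1f;         // what load_hparams now computes
        assert(fabsf(recovered - mscale_all_dim) < 1e-6f);  // the 0.1 factor cancels
        printf("stored = %.4f -> recovered mscale_all_dim = %.4f\n", stored, recovered);
        return 0;
    }
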
@@ -2267,9 +2272,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
             ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);

-            ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
-            ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
-            ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,   hparams.rope_yarn_log_mul, false);
+            ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast,    false);
+            ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow,    false);
+            ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,   hparams.rope_yarn_log_mul, 0.0f);

             // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
             if (hparams.f_attn_temp_scale != 0.0f) {
@@ -2279,18 +2284,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             }

-            // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
-            // but may need further verification with other values
-            if (hparams.rope_yarn_log_mul != 0.0f) {
-                float factor = 1.0f / hparams.rope_freq_scale_train;
-                float mscale = 1.0f;
-                float mscale_all_dims = hparams.rope_yarn_log_mul;
-                static auto get_mscale = [](float scale, float mscale) {
-                    return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
-                };
-                hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
-            }
-
             switch (hparams.n_layer) {
                 case 26: type = LLM_TYPE_3B; break;
                 case 34: type = LLM_TYPE_8B; break;
@@ -2301,6 +2294,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         default: throw std::runtime_error("unsupported model architecture");
     }

+    // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
+    if (hparams.rope_yarn_log_mul != 0.0f) {
+        const float factor = 1.0f / hparams.rope_freq_scale_train;
+
+        // note: here we assume `mscale == 1.0f`
+        // TODO: read the actual value of mscale and handle the case where it is not 1.0f
+        float mscale = 1.0f;
+        const float mscale_all_dims = hparams.rope_yarn_log_mul;
+
+        // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+        // special-case DEEPSEEK v2:
+        // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
+        if (arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
+            mscale = mscale_all_dims;
+        }
+
+        static auto get_mscale = [](float scale, float mscale) {
+            return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+        };
+
+        hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+
+        LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale = %.1f, mscale_all_dims = %.1f)\n",
+                __func__, hparams.yarn_attn_factor, mscale, mscale_all_dims);
+    }
+
     pimpl->n_bytes = ml.n_bytes;

     pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
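
To see why the DEEPSEEK2 special case works: with the referenced formula, get_mscale(scale, m) is 1.0 for scale <= 1 and 0.1 * m * log(scale) + 1.0 otherwise, so the ratio computed above is exactly 1.0 whenever mscale == mscale_all_dims and deviates from 1.0 only when the two differ. A minimal standalone sketch with hypothetical values (the 0.25f training scale is made up for illustration):

    #include <cmath>
    #include <cstdio>

    // same formula as the HF reference linked in the diff above
    static float get_mscale(float scale, float mscale) {
        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
    }

    int main() {
        const float freq_scale_train = 0.25f;         // hypothetical: trained with 4x YaRN extension
        const float factor = 1.0f / freq_scale_train; // 4.0

        // generic path: mscale assumed to be 1.0, mscale_all_dims read from the GGUF
        printf("generic : %.4f\n", get_mscale(factor, 1.0f)   / get_mscale(factor, 0.707f)); // ~1.0370
        // DEEPSEEK2 path: mscale set equal to mscale_all_dims, so the ratio collapses to 1.0
        printf("deepseek: %.4f\n", get_mscale(factor, 0.707f) / get_mscale(factor, 0.707f)); // 1.0000
        return 0;
    }
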
@@ -6806,6 +6825,7 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",   __func__, hparams.rope_freq_scale_train);
     LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",   __func__, hparams.n_ctx_orig_yarn);
+    LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
     LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",   __func__, hparams.rope_finetuned ? "yes" : "unknown");
     // MRoPE (Multi-axis Rotary Position Embedding) sections
     if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
@@ -6869,7 +6889,6 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",   __func__, hparams.expert_weights_norm);
         LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",   __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-        LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }

     if (arch == LLM_ARCH_QWEN2MOE) {