@@ -1633,29 +1633,37 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
16331633 float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318 ;
16341634 float gradient_ai_rope_freq_base_value = powf (original_rope_base, log10f (chi_ctx_value) / log10f (chi_ctx_train_value));
16351635
1636- if (debugmode==1 )
1637- {
1638- printf (" Trained max context length (value:%.d).\n " , n_ctx_train);
1639- printf (" Desired context length (value:%.d).\n " , n_ctx_desired);
1640- printf (" Solar context multiplier (value:%.3f).\n " , ctx_multiplier);
1641- printf (" Chi context train (value:%.3f).\n " , chi_ctx_train_value);
1642- printf (" Chi chosen context (value:%.3f).\n " , chi_ctx_value);
1643- printf (" Log Chi context train (value:%.3f).\n " , log10f (chi_ctx_train_value));
1644- printf (" Log Chi chosen context (value:%.3f).\n " , log10f (chi_ctx_value));
1645- printf (" RoPE Frequency Base value (value:%.3f).\n " , original_rope_base);
1646- printf (" RoPE base calculated via Gradient AI formula. (value:%.1f).\n " , gradient_ai_rope_freq_base_value);
1647- }
1636+ // if(debugmode==1)
1637+ // {
1638+ printf (" Trained max context length (value:%.d).\n " , n_ctx_train);
1639+ printf (" Desired context length (value:%.d).\n " , n_ctx_desired);
1640+ printf (" Solar context multiplier (value:%.3f).\n " , ctx_multiplier);
1641+ printf (" Chi context train (value:%.3f).\n " , chi_ctx_train_value);
1642+ printf (" Chi chosen context (value:%.3f).\n " , chi_ctx_value);
1643+ printf (" Log Chi context train (value:%.3f).\n " , log10f (chi_ctx_train_value));
1644+ printf (" Log Chi chosen context (value:%.3f).\n " , log10f (chi_ctx_value));
1645+ printf (" RoPE Frequency Base value (value:%.3f).\n " , original_rope_base);
1646+ printf (" RoPE base calculated via Gradient AI formula. (value:%.1f).\n " , gradient_ai_rope_freq_base_value);
1647+ // }
16481648
16491649 if (model_arch==GGUFArch::ARCH_SOLAR)
16501650 {
16511651 float extended_rope_positive_offset_value = 1 + ((log10f (chi_ctx_value) - log10f (chi_ctx_train_value)) / ((log10f (chi_ctx_value) * log10f (chi_ctx_train_value)) - (log10f (chi_ctx_value) + log10f (chi_ctx_train_value))));
16521652 float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;
1653- if (debugmode==1 )
1654- {
1655- printf (" Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n " , extended_rope_positive_offset_value);
1656- printf (" RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n " , rope_freq_base_with_positive_offset);
1657- }
1653+ // if(debugmode==1)
1654+ // {
1655+ printf (" Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n " , extended_rope_positive_offset_value);
1656+ printf (" RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n " , rope_freq_base_with_positive_offset);
1657+ // }
16581658 return rope_freq_base_with_positive_offset;
1659+ }
1660+ else if (model_arch==GGUFArch::ARCH_MISTRAL_LLAMA_1_AND_2)
1661+ {
1662+ float extended_rope_negative_offset_value = 1 + ((log10f (chi_ctx_value) - log10f (chi_ctx_train_value)) / (3.14159265358979323846 * 3.14159265358979323846 ));
1663+ float rope_freq_base_with_negative_offset = gradient_ai_rope_freq_base_value / extended_rope_negative_offset_value;
1664+ printf (" Extended RoPE Negative Offset (divisor) for Llama 1 and 2 based models. (value:%.3f).\n " , extended_rope_negative_offset_value);
1665+ printf (" RoPE base calculated via Gradient AI formula for Llama 1 and 2 based models. (value:%.1f).\n " , rope_freq_base_with_negative_offset);
1666+ return rope_freq_base_with_negative_offset;
16591667 }
16601668 else
16611669 {
@@ -1721,6 +1729,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
17211729 }
17221730 else
17231731 {
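1732+ // Approximate NTK-aware context: for GGUF models with n_ctx_train > 2048, divide the requested n_ctx by (n_ctx_train / 2048), then derive rope_freq_base from the resulting effective context.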
1733+ auto effectivenctx = kcpp_data->n_ctx ;
1734+ if ((file_format == FileFormat::GGUF_GENERIC) && file_format_meta.n_ctx_train > 2048 )
1735+ {
1736+ float factor = file_format_meta.n_ctx_train /2048 ;
1737+ effectivenctx = effectivenctx/factor;
1738+ }
1739+ float magic_multiplier = 8 .0f ;
1740+ float base_multiplier = effectivenctx*magic_multiplier;
1741+ float base_raw = 10000 .0f ;
1742+ rope_freq_base = (effectivenctx <= 2048 ? base_raw : base_multiplier);
1743+ //OLD : rope_freq_base = (effectivenctx <= 2048 ? 10000.0f : (effectivenctx <= 2176 ? 10000.0f : (effectivenctx <= 2304 ? 11000.0f : (effectivenctx <= 2432 ? 12000.0f : (effectivenctx <= 2560 ? 13000.0f : (effectivenctx <= 2688 ? 14000.0f : (effectivenctx <= 2816 ? 15000.0f : (effectivenctx <= 2944 ? 16000.0f : (effectivenctx <= 3072 ? 17000.0f : (effectivenctx <= 3200 ? 18000.0f : (effectivenctx <= 3328 ? 19000.0f : (effectivenctx <= 3456 ? 20000.0f : (effectivenctx <= 3584 ? 21000.0f : (effectivenctx <= 3712 ? 22000.0f : (effectivenctx <= 3840 ? 23000.0f : (effectivenctx <= 3968 ? 24000.0f : (effectivenctx <= 4096 ? 25000.0f : (effectivenctx <= 4224 ? 26000.0f : (effectivenctx <= 4352 ? 27000.0f : (effectivenctx <= 4480 ? 28500.0f : (effectivenctx <= 4608 ? 30000.0f : (effectivenctx <= 4736 ? 31500.0f : (effectivenctx <= 4864 ? 33000.0f : (effectivenctx <= 4992 ? 34500.0f : (effectivenctx <= 5120 ? 36000.0f : (effectivenctx <= 5248 ? 38000.0f : (effectivenctx <= 5376 ? 40000.0f : (effectivenctx <= 5504 ? 42000.0f : (effectivenctx <= 5632 ? 44000.0f : (effectivenctx <= 5760 ? 46000.0f : (effectivenctx <= 5888 ? 48000.0f : (effectivenctx <= 6016 ? 51000.0f : (effectivenctx <= 6144 ? 54000.0f : (effectivenctx <= 6288 ? 57000.0f : (effectivenctx <= 6400 ? 61000.0f : (effectivenctx <= 8192 ? 82684.0f : (effectivenctx <= 8192 ? 82684.0f : (effectivenctx <= 12288 ? 140000.0f : (effectivenctx <= 16384 ? 200000.0f : (effectivenctx <= 24576 ? 320000.0f : 440000.0f))))))))))))))))))))))))))))))))))))))));
17241744 printf (" Using Automatic RoPE scaling, Pre-GGUF (scale:%.3f, base:%.1f).\n " ,rope_freq_scale, rope_freq_base);
17251745 }
17261746 }
0 commit comments