Skip to content

Commit 4a8182d

Browse files
committed
Extended Rope Negative Offset for L1 and L2.
And more logs outside of debug mode. And restoration of the old auto-rope system, and the one before it, where they were left.
1 parent 2c6ac20 commit 4a8182d

File tree

2 files changed

+39
-17
lines changed

2 files changed

+39
-17
lines changed

gpttype_adapter.cpp

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1633,29 +1633,37 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
16331633
float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
16341634
float gradient_ai_rope_freq_base_value = powf(original_rope_base, log10f(chi_ctx_value) / log10f(chi_ctx_train_value));
16351635

1636-
if(debugmode==1)
1637-
{
1638-
printf("Trained max context length (value:%.d).\n", n_ctx_train);
1639-
printf("Desired context length (value:%.d).\n", n_ctx_desired);
1640-
printf("Solar context multiplier (value:%.3f).\n", ctx_multiplier);
1641-
printf("Chi context train (value:%.3f).\n", chi_ctx_train_value);
1642-
printf("Chi chosen context (value:%.3f).\n", chi_ctx_value);
1643-
printf("Log Chi context train (value:%.3f).\n", log10f(chi_ctx_train_value));
1644-
printf("Log Chi chosen context (value:%.3f).\n", log10f(chi_ctx_value));
1645-
printf("RoPE Frequency Base value (value:%.3f).\n", original_rope_base);
1646-
printf("RoPE base calculated via Gradient AI formula. (value:%.1f).\n", gradient_ai_rope_freq_base_value);
1647-
}
1636+
// if(debugmode==1)
1637+
// {
1638+
printf("Trained max context length (value:%.d).\n", n_ctx_train);
1639+
printf("Desired context length (value:%.d).\n", n_ctx_desired);
1640+
printf("Solar context multiplier (value:%.3f).\n", ctx_multiplier);
1641+
printf("Chi context train (value:%.3f).\n", chi_ctx_train_value);
1642+
printf("Chi chosen context (value:%.3f).\n", chi_ctx_value);
1643+
printf("Log Chi context train (value:%.3f).\n", log10f(chi_ctx_train_value));
1644+
printf("Log Chi chosen context (value:%.3f).\n", log10f(chi_ctx_value));
1645+
printf("RoPE Frequency Base value (value:%.3f).\n", original_rope_base);
1646+
printf("RoPE base calculated via Gradient AI formula. (value:%.1f).\n", gradient_ai_rope_freq_base_value);
1647+
// }
16481648

16491649
if(model_arch==GGUFArch::ARCH_SOLAR)
16501650
{
16511651
float extended_rope_positive_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / ((log10f(chi_ctx_value) * log10f(chi_ctx_train_value)) - (log10f(chi_ctx_value) + log10f(chi_ctx_train_value))));
16521652
float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;
1653-
if(debugmode==1)
1654-
{
1655-
printf("Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n", extended_rope_positive_offset_value);
1656-
printf("RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n", rope_freq_base_with_positive_offset);
1657-
}
1653+
// if(debugmode==1)
1654+
// {
1655+
printf("Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n", extended_rope_positive_offset_value);
1656+
printf("RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n", rope_freq_base_with_positive_offset);
1657+
// }
16581658
return rope_freq_base_with_positive_offset;
1659+
}
1660+
else if(model_arch==GGUFArch::ARCH_MISTRAL_LLAMA_1_AND_2)
1661+
{
1662+
float extended_rope_negative_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / (3.14159265358979323846 * 3.14159265358979323846));
1663+
float rope_freq_base_with_negative_offset = gradient_ai_rope_freq_base_value / extended_rope_negative_offset_value;
1664+
printf("Extended RoPE Negative Offset (divisor) for Llama 1 and 2 based models. (value:%.3f).\n", extended_rope_negative_offset_value);
1665+
printf("RoPE base calculated via Gradient AI formula for Llama 1 and 2 based models. (value:%.1f).\n", rope_freq_base_with_negative_offset);
1666+
return rope_freq_base_with_negative_offset;
16591667
}
16601668
else
16611669
{
@@ -1721,6 +1729,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
17211729
}
17221730
else
17231731
{
1732+
//approximate NTK aware ctx
1733+
auto effectivenctx = kcpp_data->n_ctx;
1734+
if((file_format == FileFormat::GGUF_GENERIC) && file_format_meta.n_ctx_train > 2048)
1735+
{
1736+
float factor = file_format_meta.n_ctx_train/2048;
1737+
effectivenctx = effectivenctx/factor;
1738+
}
1739+
float magic_multiplier = 8.0f;
1740+
float base_multiplier = effectivenctx*magic_multiplier;
1741+
float base_raw = 10000.0f;
1742+
rope_freq_base = (effectivenctx <= 2048 ? base_raw : base_multiplier);
1743+
//OLD : rope_freq_base = (effectivenctx <= 2048 ? 10000.0f : (effectivenctx <= 2176 ? 10000.0f : (effectivenctx <= 2304 ? 11000.0f : (effectivenctx <= 2432 ? 12000.0f : (effectivenctx <= 2560 ? 13000.0f : (effectivenctx <= 2688 ? 14000.0f : (effectivenctx <= 2816 ? 15000.0f : (effectivenctx <= 2944 ? 16000.0f : (effectivenctx <= 3072 ? 17000.0f : (effectivenctx <= 3200 ? 18000.0f : (effectivenctx <= 3328 ? 19000.0f : (effectivenctx <= 3456 ? 20000.0f : (effectivenctx <= 3584 ? 21000.0f : (effectivenctx <= 3712 ? 22000.0f : (effectivenctx <= 3840 ? 23000.0f : (effectivenctx <= 3968 ? 24000.0f : (effectivenctx <= 4096 ? 25000.0f : (effectivenctx <= 4224 ? 26000.0f : (effectivenctx <= 4352 ? 27000.0f : (effectivenctx <= 4480 ? 28500.0f : (effectivenctx <= 4608 ? 30000.0f : (effectivenctx <= 4736 ? 31500.0f : (effectivenctx <= 4864 ? 33000.0f : (effectivenctx <= 4992 ? 34500.0f : (effectivenctx <= 5120 ? 36000.0f : (effectivenctx <= 5248 ? 38000.0f : (effectivenctx <= 5376 ? 40000.0f : (effectivenctx <= 5504 ? 42000.0f : (effectivenctx <= 5632 ? 44000.0f : (effectivenctx <= 5760 ? 46000.0f : (effectivenctx <= 5888 ? 48000.0f : (effectivenctx <= 6016 ? 51000.0f : (effectivenctx <= 6144 ? 54000.0f : (effectivenctx <= 6288 ? 57000.0f : (effectivenctx <= 6400 ? 61000.0f : (effectivenctx <= 8192 ? 82684.0f : (effectivenctx <= 8192 ? 82684.0f : (effectivenctx <= 12288 ? 140000.0f : (effectivenctx <= 16384 ? 200000.0f : (effectivenctx <= 24576 ? 320000.0f : 440000.0f))))))))))))))))))))))))))))))))))))))));
17241744
printf("Using Automatic RoPE scaling, Pre-GGUF (scale:%.3f, base:%.1f).\n",rope_freq_scale, rope_freq_base);
17251745
}
17261746
}

model_adapter.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ enum GGUFArch
5959
ARCH_SOLAR = 4,
6060
ARCH_QWEN2 = 5,
6161
ARCH_RWKV = 6,
62+
ARCH_MISTRAL_LLAMA_1_AND_2 = 50,
6263
};
6364

6465
struct FileFormatExtraMeta
@@ -67,6 +68,7 @@ struct FileFormatExtraMeta
6768
int fileversion = 0;
6869
GGUFArch model_architecture = GGUFArch::ARCH_DEFAULT;
6970
int n_expert_count = 0;
71+
int32_t n_tensors;
7072
};
7173

7274
enum ModelLoadResult

0 commit comments

Comments
 (0)