Commit b878641

no ctx shift for all mrope
1 parent 8f622cf commit b878641

File tree

1 file changed (+9 -6 lines)


gpttype_adapter.cpp

Lines changed: 9 additions & 6 deletions
@@ -1852,6 +1852,8 @@ void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vec
         }
     }
 
+    //printf("\nPN: %d, NTL: %d, CCT: %d,TS:%d, diff:%d, sft:%d\n",purgeneeded,new_tokens_len,current_context_tokens.size(),trimstart,(new_tokens_len - trimstart),ShortfallThreshold);
+
     if(!purgeneeded || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < ShortfallThreshold)
     {
         return; //no purge is needed
@@ -1865,7 +1867,7 @@ void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vec
 
     auto shared = LongestCommonSubseq(curr_ctx_without_memory, new_ctx_without_memory);
 
-    // printf("\nSharedSize: %d, LCSTokThreshold: %d, ArrPass: %d\n",shared.size(),LCSTokThreshold,ArrStartWith(new_ctx_without_memory, shared));
+    //printf("\nSharedSize: %d, LCSTokThreshold: %d, ArrPass: %d\n",shared.size(),LCSTokThreshold,ArrStartWith(new_ctx_without_memory, shared));
     if (shared.size() > LCSTokThreshold && ArrStartWith(new_ctx_without_memory, shared)) // enough tokens in common
     {
         int found = ArrFindIndexOf(current_context_tokens,shared);
@@ -2209,11 +2211,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         {
             printf("Warning, you are running Qwen2 without Flash Attention. If you observe incoherent output, try enabling it.\n");
         }
-        if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL)
-        {
-            printf("Qwen2VL detected! Mrope will be used, and context shift will be disabled!\n");
-            kcpp_data->use_contextshift = false;
-        }
+
         model_params.main_gpu = kcpp_parseinfo_maindevice;
 
         #if defined(GGML_USE_CUDA)
@@ -2334,6 +2332,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
 
     llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);
+    if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL || llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_MROPE)
+    {
+        printf("\nMRope is used, context shift will be disabled!\n");
+        kcpp_data->use_contextshift = false;
+    }
 
     if(overwriteRope)
     {
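The net effect: context shift is now disabled for every model whose rope type is MROPE, not only for the ARCH_QWEN2VL architecture, and the check moves below llama_model_load_from_file because llama_model_rope_type reads the rope type from the loaded model. A minimal standalone sketch of the same detection pattern, assuming recent llama.cpp headers; llama_model_load_from_file, llama_model_rope_type, and LLAMA_ROPE_TYPE_MROPE are the calls the commit itself uses, while the command-line wrapper around them is illustrative only:

#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv)
{
    if (argc < 2)
    {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    // Load the model first: the rope type is a property of the loaded
    // model, which is why the commit performs its check after
    // llama_model_load_from_file rather than at argument-parsing time.
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr)
    {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    bool use_contextshift = true; // stand-in for kcpp_data->use_contextshift
    if (llama_model_rope_type(model) == LLAMA_ROPE_TYPE_MROPE)
    {
        printf("\nMRope is used, context shift will be disabled!\n");
        use_contextshift = false;
    }
    printf("context shift enabled: %s\n", use_contextshift ? "yes" : "no");

    llama_model_free(model);
    llama_backend_free();
    return 0;
}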
