@@ -1852,6 +1852,8 @@ void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vec
         }
     }

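+    // debug trace (left disabled): PN=purgeneeded, NTL=new_tokens_len, CCT=current context size, TS=trimstart, diff=tokens beyond trimstart, sft=ShortfallThreshold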
+    //printf("\nPN: %d, NTL: %d, CCT: %d, TS: %d, diff: %d, sft: %d\n", purgeneeded, new_tokens_len, current_context_tokens.size(), trimstart, (new_tokens_len - trimstart), ShortfallThreshold);
+
     if (!purgeneeded || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < ShortfallThreshold)
     {
         return; // no purge is needed
@@ -1865,7 +1867,7 @@ void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vec

     auto shared = LongestCommonSubseq(curr_ctx_without_memory, new_ctx_without_memory);

-    //printf("\nSharedSize: %d, LCSTokThreshold: %d, ArrPass: %d\n", shared.size(), LCSTokThreshold, ArrStartWith(new_ctx_without_memory, shared));
+    //printf("\nSharedSize: %d, LCSTokThreshold: %d, ArrPass: %d\n", shared.size(), LCSTokThreshold, ArrStartWith(new_ctx_without_memory, shared));
     if (shared.size() > LCSTokThreshold && ArrStartWith(new_ctx_without_memory, shared)) // enough tokens in common
     {
         int found = ArrFindIndexOf(current_context_tokens, shared);
@@ -2209,11 +2211,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         printf("Warning, you are running Qwen2 without Flash Attention. If you observe incoherent output, try enabling it.\n");
     }
-    if (file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL)
-    {
-        printf("Qwen2VL detected! Mrope will be used, and context shift will be disabled!\n");
-        kcpp_data->use_contextshift = false;
-    }
+
     model_params.main_gpu = kcpp_parseinfo_maindevice;

 #if defined(GGML_USE_CUDA)
@@ -2334,6 +2332,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }

     llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);
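+    // mrope (e.g. Qwen2VL) encodes positions along multiple axes, which KV cache shifting cannot handle, so context shift is disabled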
+    if (file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL || llama_model_rope_type(llamamodel) == LLAMA_ROPE_TYPE_MROPE)
+    {
+        printf("\nMRope is used, context shift will be disabled!\n");
+        kcpp_data->use_contextshift = false;
+    }

     if (overwriteRope)
     {