Commit b878641

no ctx shift for all mrope
1 parent 8f622cf commit b878641

File tree

1 file changed (+9 -6 lines)


gpttype_adapter.cpp

Lines changed: 9 additions & 6 deletions
@@ -1852,6 +1852,8 @@ void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vec
         }
     }
 
+    //printf("\nPN: %d, NTL: %d, CCT: %d,TS:%d, diff:%d, sft:%d\n",purgeneeded,new_tokens_len,current_context_tokens.size(),trimstart,(new_tokens_len - trimstart),ShortfallThreshold);
+
     if(!purgeneeded || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < ShortfallThreshold)
     {
         return; //no purge is needed
@@ -1865,7 +1867,7 @@ void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vec
 
     auto shared = LongestCommonSubseq(curr_ctx_without_memory, new_ctx_without_memory);
 
-    // printf("\nSharedSize: %d, LCSTokThreshold: %d, ArrPass: %d\n",shared.size(),LCSTokThreshold,ArrStartWith(new_ctx_without_memory, shared));
+    //printf("\nSharedSize: %d, LCSTokThreshold: %d, ArrPass: %d\n",shared.size(),LCSTokThreshold,ArrStartWith(new_ctx_without_memory, shared));
     if (shared.size() > LCSTokThreshold && ArrStartWith(new_ctx_without_memory, shared)) // enough tokens in common
     {
         int found = ArrFindIndexOf(current_context_tokens,shared);
@@ -2209,11 +2211,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         {
             printf("Warning, you are running Qwen2 without Flash Attention. If you observe incoherent output, try enabling it.\n");
         }
-        if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL)
-        {
-            printf("Qwen2VL detected! Mrope will be used, and context shift will be disabled!\n");
-            kcpp_data->use_contextshift = false;
-        }
+
         model_params.main_gpu = kcpp_parseinfo_maindevice;
 
         #if defined(GGML_USE_CUDA)
@@ -2334,6 +2332,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
 
     llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);
+    if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL || llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_MROPE)
+    {
+        printf("\nMRope is used, context shift will be disabled!\n");
+        kcpp_data->use_contextshift = false;
+    }
 
     if(overwriteRope)
     {
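The net effect: context shift is now disabled for every model whose rope type is MROPE, not only for the ARCH_QWEN2VL architecture, and the check moves below llama_model_load_from_file because llama_model_rope_type reads the rope type from the loaded model. A minimal standalone sketch of the same detection pattern, assuming recent llama.cpp headers; llama_model_load_from_file, llama_model_rope_type, and LLAMA_ROPE_TYPE_MROPE are the calls the commit itself uses, while the command-line wrapper around them is illustrative only:

#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv)
{
    if (argc < 2)
    {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    // Load the model first: the rope type is a property of the loaded
    // model, which is why the commit performs its check after
    // llama_model_load_from_file rather than at argument-parsing time.
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr)
    {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    bool use_contextshift = true; // stand-in for kcpp_data->use_contextshift
    if (llama_model_rope_type(model) == LLAMA_ROPE_TYPE_MROPE)
    {
        printf("\nMRope is used, context shift will be disabled!\n");
        use_contextshift = false;
    }
    printf("context shift enabled: %s\n", use_contextshift ? "yes" : "no");

    llama_model_free(model);
    llama_backend_free();
    return 0;
}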
