@@ -3810,11 +3810,16 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
38103810 bool blank_prompt = (addedmemory==" " && kcpp_data->prompt ==" " );
38113811
38123812 // smart cache logic
3813- if (kcpp_data->smartcache )
3813+ if (kcpp_data->smartcache && file_format==FileFormat::GGUF_GENERIC )
38143814 {
3815+ bool shiftable = true ;
3816+ if (!kcpp_data->use_contextshift || is_recurrent)
3817+ {
3818+ shiftable = false ;
3819+ }
38153820 const float similarity_threshold = 0 .7f ;
38163821 // If CanBeShifted is true, do nothing. Allow shift as normal.
3817- if (!CanContextShift (current_context_tokens, embd_inp, inputs.max_length , nctx))
3822+ if (!(shiftable && CanContextShift (current_context_tokens, embd_inp, inputs.max_length , nctx) ))
38183823 {
38193824 // If CanBeShifted is false, calculate prefix similarity with current_context_tokens of current context
38203825 // If similarity > similarity_threshold, do nothing. Allow fast forward as normal.
@@ -3829,7 +3834,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
38293834 for (int i=0 ;i<savestate_limit;++i)
38303835 {
38313836 float similaritybeat = ComputePrefixMatchPercent (savestates[i].savestate_context_tokens ,embd_inp);
3832- if (similaritybeat > similarity_threshold || CanContextShift (savestates[i].savestate_context_tokens , embd_inp, inputs.max_length , nctx))
3837+ if (similaritybeat > similarity_threshold || (shiftable && CanContextShift (savestates[i].savestate_context_tokens , embd_inp, inputs.max_length , nctx) ))
38333838 {
38343839 // found a match. save to the oldest slot thats not the one we are loading
38353840 int oldest_slot = get_oldest_slot (i);
0 commit comments