
Commit 8a18e09

committed
added smartcaching implementation inspired by Pento95 (+2 squashed commits)
Squashed commit: [fcc4986] wip basic smart caching test [b6e8b25] wip basic smart caching test
1 parent 1aab32f commit 8a18e09

File tree

5 files changed: +154 -29 lines changed

expose.h

Lines changed: 1 addition & 0 deletions
@@ -75,6 +75,7 @@ struct load_model_inputs
     const bool check_slowness = false;
     const bool highpriority = false;
     const bool swa_support = false;
+    const bool smartcache = false;
     const float lora_multiplier = 1.0f;
     const bool quiet = false;
     const int debugmode = 0;

gpttype_adapter.cpp

Lines changed: 133 additions & 23 deletions
@@ -21,6 +21,7 @@
 #include <string>
 #include <cctype>
 #include <locale>
+#include <chrono>

 #include "utils.h"

@@ -151,7 +152,7 @@ static int delayed_generated_tokens_limit = 0;
 std::deque<std::string> delayed_generated_tokens; //for use with antislop sampling
 static std::map<int,std::vector<int>> antislop_banned_token_ids; //first is the npast position, second is the array of banned ids at that index

-const int savestate_limit = 3;
+const int savestate_limit = 4;
 static savestate_data savestates[savestate_limit];

 inline int kcpp_cpu_has_blas(void) {
@@ -1826,9 +1827,29 @@ static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num
     return true;
 }

+//counts the number of matching prefix tokens between two sequences, returns percentage matched 0.0 to 1.0
+float ComputePrefixMatchPercent(std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens)
+{
+    int match_count = 0;
+    size_t min_length = std::min(current_context_tokens.size(), new_context_tokens.size());
+    for (size_t i = 0; i < min_length; ++i) {
+        if (current_context_tokens[i] == new_context_tokens[i]) {
+            match_count++;
+        } else {
+            break;
+        }
+    }
+    // Handle case where both sequences are empty to avoid division by zero
+    if (min_length == 0) {
+        return 0.0f; // Both empty sequences are considered 100% matched
+    }
+    return static_cast<float>(match_count) / static_cast<float>(min_length);
+}
+
 //given an old GGUF context and a new context that has some middle portion removed,
 //find and remove the middle portion from the old context from the KV. Does not fast forward after this destructive action
-void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
+//returns true if contextshift is doable, executes it if dryrun is false
+bool DoContextShifting(llama_context * ctx, llama_context * draft_ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx, bool dryrun)
 {
     //scan from start old and new ctx, until first mismatch found, save as p0
     //check remaining old and new ctx for longest common subseq, which needs to be at 256 tokens
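A note on the metric the new helper computes: the matching prefix length is divided by the length of the shorter of the two sequences, so a short incoming prompt that is entirely a prefix of the cached context scores 1.0, while an early divergence pulls the score toward zero. The following is a minimal standalone sketch (illustration only, not part of the commit) that mirrors that behaviour:

#include <algorithm>
#include <cstdio>
#include <vector>

// Mirrors ComputePrefixMatchPercent from the patch: fraction of the shorter
// sequence that matches as a contiguous prefix.
static float prefix_match_percent(const std::vector<int> &a, const std::vector<int> &b)
{
    size_t min_length = std::min(a.size(), b.size());
    if (min_length == 0) { return 0.0f; } // empty input scores zero, as in the patch
    int match_count = 0;
    for (size_t i = 0; i < min_length; ++i) {
        if (a[i] == b[i]) { match_count++; } else { break; }
    }
    return static_cast<float>(match_count) / static_cast<float>(min_length);
}

int main()
{
    std::vector<int> cached   = {1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<int> incoming = {1, 2, 3, 4, 9, 9};
    // Four tokens match before the first divergence, divided by min(8, 6) = 6 -> 0.67.
    printf("match = %.2f\n", prefix_match_percent(cached, incoming));
    return 0;
}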
@@ -1860,11 +1881,9 @@ void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vec
         }
     }

-    //printf("\nPN: %d, NTL: %d, CCT: %d,TS:%d, diff:%d, sft:%d\n",purgeneeded,new_tokens_len,current_context_tokens.size(),trimstart,(new_tokens_len - trimstart),ShortfallThreshold);
-
     if(!purgeneeded || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < ShortfallThreshold)
     {
-        return; //no purge is needed
+        return false; //no purge is needed
     }

     //at least this many tokens need to match, otherwise don't bother trimming
@@ -1881,30 +1900,38 @@ void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vec
         int found = ArrFindIndexOf(current_context_tokens,shared);
         if(found>=0 && found > trimstart)
         {
-
-            //extract the unwanted tokens out from context and KV
-            int diff = found - trimstart;
-            llama_memory_seq_rm(llama_get_memory(ctx), 0, trimstart, trimstart + diff);
-            llama_memory_seq_add(llama_get_memory(ctx), 0, trimstart + diff, -1, -diff);
-            if(draft_ctx)
+            if(!dryrun)
             {
-                llama_memory_seq_rm(llama_get_memory(draft_ctx), 0, trimstart, trimstart + diff);
-                llama_memory_seq_add(llama_get_memory(draft_ctx), 0, trimstart + diff, -1, -diff);
-            }
-
-            for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
-            {
-                current_context_tokens[i - diff] = current_context_tokens[i];
+                //extract the unwanted tokens out from context and KV
+                int diff = found - trimstart;
+                llama_memory_seq_rm(llama_get_memory(ctx), 0, trimstart, trimstart + diff);
+                llama_memory_seq_add(llama_get_memory(ctx), 0, trimstart + diff, -1, -diff);
+                if(draft_ctx)
+                {
+                    llama_memory_seq_rm(llama_get_memory(draft_ctx), 0, trimstart, trimstart + diff);
+                    llama_memory_seq_add(llama_get_memory(draft_ctx), 0, trimstart + diff, -1, -diff);
+                }
+                for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
+                {
+                    current_context_tokens[i - diff] = current_context_tokens[i];
+                }
+                printf("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
+                current_context_tokens.resize(current_context_tokens.size() - diff);
             }
-
-            printf("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
-
-            current_context_tokens.resize(current_context_tokens.size() - diff);
+            return true;
         }
     }
+    return false;
+
+}

+//returns true if context shifting is possible. does not execute the shift
+bool CanContextShift(std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
+{
+    return DoContextShifting(nullptr,nullptr,current_context_tokens,new_context_tokens,genamt,nctx,true);
 }

+
 static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
 {
     //check if approved to use BLAS
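Worth calling out about the refactor above: CanContextShift is just the dry-run probe, and it passes nullptr for both contexts, which is only safe because every KV-cache mutation in DoContextShifting is gated behind if(!dryrun). A hedged sketch of the resulting probe-then-execute call pattern (function names from the patch, surrounding variables assumed):

// Probe: nothing is dereferenced or erased while dryrun is true.
bool shiftable = CanContextShift(current_context_tokens, embd_inp, inputs.max_length, nctx);
if (shiftable)
{
    // Execute: the same routine with live contexts and dryrun=false actually trims the KV cache.
    DoContextShifting(llama_ctx_v4, draft_ctx, current_context_tokens, embd_inp, inputs.max_length, nctx, false);
}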
@@ -1978,6 +2005,12 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     kcpp_data->use_smartcontext = inputs.use_smartcontext;
     kcpp_data->use_contextshift = inputs.use_contextshift;
     kcpp_data->use_fastforward = inputs.use_fastforward;
+    kcpp_data->smartcache = inputs.smartcache;
+    if(!kcpp_data->use_fastforward && kcpp_data->smartcache)
+    {
+        kcpp_data->smartcache = false;
+        printf("\nSmartCache IS DISABLED!\nSmartCache requires Fast Forwarding!\n");
+    }
     kcpp_data->swa_full = !inputs.swa_support;
     if (!kcpp_data->swa_full) {
         if (inputs.use_contextshift) {
@@ -3776,6 +3809,61 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     }
     bool blank_prompt = (addedmemory=="" && kcpp_data->prompt=="");

+    //smart cache logic
+    if(kcpp_data->smartcache)
+    {
+        const float similarity_threshold = 0.7f;
+        //If CanBeShifted is true, do nothing. Allow shift as normal.
+        if(!CanContextShift(current_context_tokens, embd_inp, inputs.max_length, nctx))
+        {
+            // If CanBeShifted is false, calculate prefix similarity with current_context_tokens of current context
+            // If similarity > similarity_threshold, do nothing. Allow fast forward as normal.
+            float similarity = ComputePrefixMatchPercent(current_context_tokens,embd_inp);
+            if(similarity < similarity_threshold)
+            {
+                // Otherwise, for each of the currently used kv state slots, calculate ComputePrefixMatch and CanBeShifted
+                // If similarity to any of them > similarity_threshold or CanBeShifted, save current slot and switch to that slot.
+                // Whenever loading or saving current slot, simply tag the slot with a timestamp. When running out of slots after all 3 are used, delete the oldest timestamped slot.
+                // Slot loading and saving completely reuses gpttype_load_state_kv and gpttype_save_state_kv, nothing else is needed.
+                bool foundswap = false;
+                for(int i=0;i<savestate_limit;++i)
+                {
+                    float similaritybeat = ComputePrefixMatchPercent(savestates[i].savestate_context_tokens,embd_inp);
+                    if(similaritybeat > similarity_threshold || CanContextShift(savestates[i].savestate_context_tokens, embd_inp, inputs.max_length, nctx))
+                    {
+                        //found a match. save to the oldest slot thats not the one we are loading
+                        int oldest_slot = get_oldest_slot(i);
+                        if(oldest_slot!=i)
+                        {
+                            if(current_context_tokens.size()>32) //do not save tiny contexts
+                            {
+                                printf("\n[SmartCache Match of %.2f in slot %d. Saving into slot %d and switching...]",similaritybeat,i,oldest_slot);
+                                gpttype_save_state_kv(oldest_slot);
+                            }
+                            else
+                            {
+                                printf("\n[SmartCache Match of %.2f in slot %d. Switching...]",similaritybeat,i);
+
+                            }
+                            gpttype_load_state_kv(i);
+                            foundswap = true;
+                            break;
+                        }
+                    }
+                }
+                if(!foundswap) //could not match anything, just save kv and continue
+                {
+                    if(current_context_tokens.size()>32) //do not save tiny contexts
+                    {
+                        int oldest_slot = get_oldest_slot(-1);
+                        printf("\n[SmartCache No Match, Saving into slot %d...]",oldest_slot);
+                        gpttype_save_state_kv(oldest_slot);
+                    }
+                }
+            }
+        }
+    }
+
     if (file_format == FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2 || is_recurrent)
     {
         if(!blank_prompt)
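To see the decision order of the block above in isolation: a context shift is preferred, then a sufficiently similar live context is fast-forwarded as usual, and only then are the saved slots scanned for a better match or, failing that, the live KV is parked in the oldest slot. Below is a self-contained sketch of that slot-selection order (illustration only, not from the commit; the real code also treats a slot as a match when CanContextShift succeeds, which is reduced here to the same prefix test):

#include <algorithm>
#include <cstdio>
#include <vector>

static float prefix_match(const std::vector<int> &a, const std::vector<int> &b)
{
    size_t n = std::min(a.size(), b.size());
    size_t m = 0;
    while (m < n && a[m] == b[m]) { ++m; }
    return n == 0 ? 0.0f : (float)m / (float)n;
}

int main()
{
    const float threshold = 0.7f;                   // similarity_threshold in the patch
    std::vector<int> live   = {1, 2, 3, 4, 5, 6};   // tokens currently backing the KV cache
    std::vector<int> prompt = {9, 9, 9, 4, 5, 6};   // incoming request, shares no prefix with it
    std::vector<std::vector<int>> slots = {         // saved snapshots, token lists only
        {}, {9, 9, 9, 4}, {}, {}
    };

    if (prefix_match(live, prompt) >= threshold) {
        printf("keep the live context and fast forward as usual\n");
        return 0;
    }
    for (size_t i = 0; i < slots.size(); ++i) {
        if (prefix_match(slots[i], prompt) > threshold) {
            // Slot 1 scores 4/4 = 1.0 here: the live KV would be parked in the
            // oldest other slot and slot 1 restored before generation continues.
            printf("save live KV, restore slot %zu\n", i);
            return 0;
        }
    }
    printf("no match: save live KV to the oldest slot and reprocess the prompt\n");
    return 0;
}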
@@ -3825,7 +3913,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         {
             if(kcpp_data->use_fastforward && kcpp_data->use_contextshift && (file_format == FileFormat::GGUF_GENERIC))
             {
-                PurgeMissingTokens(llama_ctx_v4, draft_ctx, current_context_tokens, embd_inp, inputs.max_length, nctx);
+                DoContextShifting(llama_ctx_v4, draft_ctx, current_context_tokens, embd_inp, inputs.max_length, nctx, false);
                 triggersc = false;
             }
             if(kcpp_data->use_fastforward)
@@ -4709,6 +4797,9 @@ size_t gpttype_save_state_kv(int slot)
         totalbytes += res;
         savestates[slot].current_savestate_size = newsize;
         savestates[slot].savestate_context_tokens = current_context_tokens;
+        auto timenow = std::chrono::system_clock::now();
+        auto timestamp = std::chrono::duration_cast<std::chrono::seconds>(timenow.time_since_epoch()).count();
+        savestates[slot].last_used = timestamp;
         printf("\nKV Save State %d: Created SaveState of %zu tokens, costing %zu MB.\n",slot,current_context_tokens.size(),savestates[slot].current_savestate_size/(1024*1024));
     }
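For clarity, the slot timestamp recorded above is plain Unix-epoch seconds; only its relative ordering is ever compared when picking a victim slot. A tiny standalone illustration (not part of the commit):

#include <chrono>
#include <cstdio>

int main()
{
    auto now = std::chrono::system_clock::now();
    // Same expression as in the patch: whole seconds since the Unix epoch.
    long long secs = std::chrono::duration_cast<std::chrono::seconds>(now.time_since_epoch()).count();
    printf("%lld\n", secs); // e.g. 1735689600 for 2025-01-01 00:00:00 UTC
    return 0;
}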

@@ -4758,6 +4849,9 @@ bool gpttype_load_state_kv(int slot)
             auto res2 = llama_state_set_data(draft_ctx, savestates[slot].current_draft_savestate_buffer.data(), savestates[slot].current_draft_savestate_size);
             printf("\nKV Load DraftSaveState %d: Restored KV with %zu tokens.\n", slot,current_context_tokens.size());
         }
+        auto timenow = std::chrono::system_clock::now();
+        auto timestamp = std::chrono::duration_cast<std::chrono::seconds>(timenow.time_since_epoch()).count();
+        savestates[slot].last_used = timestamp;
     }
     return (res > 0);
 }
@@ -4791,9 +4885,25 @@ bool gpttype_clear_state_kv(bool shrink)
                     }
                     savestates[slot].current_draft_savestate_size = 0;
                 }
+                savestates[slot].last_used = 0;
             }
         }
         return true;
     }
     return false;
 }
+
+int get_oldest_slot(int excludeSlotId)
+{
+    int64_t slotage = INT64_MAX; // Initialize with maximum possible value
+    int slotid = 0;
+    for(int i=0;i<savestate_limit;++i)
+    {
+        if(savestates[i].last_used <= slotage && i!=excludeSlotId)
+        {
+            slotage = savestates[i].last_used;
+            slotid = i;
+        }
+    }
+    return slotid;
+}
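The eviction rule above behaves like least-recently-used with one wrinkle: because the comparison is <= and the loop walks slots in ascending order, ties (including never-used slots whose last_used is still 0) resolve to the highest-numbered slot that is not excluded. A standalone check of that behaviour with made-up timestamps (illustration only):

#include <cstdint>
#include <cstdio>

static const int kSlots = 4;                        // savestate_limit in the patch
static int64_t last_used[kSlots] = {100, 0, 0, 50}; // invented ages for the demo

// Same scan as get_oldest_slot above, with the slot array reduced to timestamps.
static int oldest_slot(int exclude)
{
    int64_t best = INT64_MAX;
    int id = 0;
    for (int i = 0; i < kSlots; ++i) {
        if (last_used[i] <= best && i != exclude) { best = last_used[i]; id = i; }
    }
    return id;
}

int main()
{
    printf("%d\n", oldest_slot(-1)); // prints 2: slots 1 and 2 tie at 0, the later index wins
    printf("%d\n", oldest_slot(2));  // prints 1: slot 2 is excluded, so slot 1 is chosen
    return 0;
}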

koboldcpp.py

Lines changed: 16 additions & 5 deletions
@@ -53,7 +53,7 @@
 default_ttsmaxlen = 4096
 default_visionmaxres = 1024
 net_save_slots = 12
-savestate_limit = 3 #3 savestate slots
+savestate_limit = 4 #savestate slots
 default_vae_tile_threshold = 768
 default_native_ctx = 16384
 overridekv_max = 4
@@ -217,6 +217,7 @@ class load_model_inputs(ctypes.Structure):
                 ("check_slowness", ctypes.c_bool),
                 ("highpriority", ctypes.c_bool),
                 ("swa_support", ctypes.c_bool),
+                ("smartcache", ctypes.c_bool),
                 ("lora_multiplier", ctypes.c_float),
                 ("quiet", ctypes.c_bool),
                 ("debugmode", ctypes.c_int)]
@@ -1519,6 +1520,7 @@ def load_model(model_filename):
     inputs.check_slowness = (not args.highpriority and os.name == 'nt' and 'Intel' in platform.processor())
     inputs.highpriority = args.highpriority
     inputs.swa_support = args.useswa
+    inputs.smartcache = args.smartcache
     inputs = set_backend_props(inputs)
     ret = handle.load_model(inputs)
     return ret
@@ -4344,7 +4346,7 @@ def do_POST(self):
         if self.path.endswith('/api/admin/check_state'):
             if global_memory and args.admin and args.admindir and os.path.exists(args.admindir) and self.check_header_password(args.adminpassword):
                 cur_states = []
-                for sl in range(savestate_limit): #0,1,2
+                for sl in range(savestate_limit): #0,1,2,3
                     oldstate = handle.calc_old_state_kv(sl)
                     oldtokencnt = handle.calc_old_state_tokencount(sl)
                     cur_states.append({"tokens":oldtokencnt,"size":oldstate})
@@ -4997,8 +4999,8 @@ def get_problematic_scaler():
     import customtkinter as ctk
     nextstate = 0 #0=exit, 1=launch
     corrupt_scaler = get_problematic_scaler()
-    original_windowwidth = int(860 if corrupt_scaler else 580)
-    original_windowheight = int(740 if corrupt_scaler else 580)
+    original_windowwidth = int(860 if corrupt_scaler else 584)
+    original_windowheight = int(740 if corrupt_scaler else 584)
     windowwidth = original_windowwidth
     windowheight = original_windowheight
     ctk.set_appearance_mode("dark")
@@ -5160,6 +5162,7 @@ def hide_tooltip(event):
     contextshift_var = ctk.IntVar(value=1)
     fastforward_var = ctk.IntVar(value=1)
     swa_var = ctk.IntVar(value=0)
+    smartcache_var = ctk.IntVar(value=0)
     remotetunnel_var = ctk.IntVar(value=0)
     smartcontext_var = ctk.IntVar()
     flashattention_var = ctk.IntVar(value=0)
@@ -5626,6 +5629,10 @@ def toggleswa(a,b,c):
         if swa_var.get()==1:
             contextshift_var.set(0)

+    def togglesmartcache(a,b,c):
+        if smartcache_var.get()==1:
+            fastforward_var.set(1)
+
     def togglefastforward(a,b,c):
         if fastforward_var.get()==0:
             contextshift_var.set(0)
@@ -5845,6 +5852,7 @@ def changerunmode(a,b,c):
     makecheckbox(tokens_tab, "Use ContextShift", contextshift_var, 2,tooltiptxt="Uses Context Shifting to reduce reprocessing.\nRecommended. Check the wiki for more info.", command=togglectxshift)
     makecheckbox(tokens_tab, "Use FastForwarding", fastforward_var, 3,tooltiptxt="Use fast forwarding to recycle previous context (always reprocess if disabled).\nRecommended.", command=togglefastforward)
     makecheckbox(tokens_tab, "Use Sliding Window Attention (SWA)", swa_var, 4,tooltiptxt="Allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", command=toggleswa)
+    makecheckbox(tokens_tab, "Use SmartCache", smartcache_var, 5,tooltiptxt="Enables intelligent context switching by saving KV cache snapshots to RAM. Requires fast forwarding.", command=togglesmartcache)

     # context size
     makeslider(tokens_tab, "Context Size:",contextsize_text, context_var, 0, len(contextsize_text)-1, 18, width=280, set=7,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
@@ -5854,7 +5862,7 @@ def changerunmode(a,b,c):

     nativectx_entry, nativectx_label = makelabelentry(tokens_tab, "Override Native Context:", customrope_nativectx, row=23, padx=(246 if corrupt_scaler else 146), singleline=True, tooltip="Overrides the native trained context of the loaded model with a custom value to be used for Rope scaling.")
     customrope_scale_entry, customrope_scale_label = makelabelentry(tokens_tab, "RoPE Scale:", customrope_scale, row=23, padx=(160 if corrupt_scaler else 100), singleline=True, tooltip="For Linear RoPE scaling. RoPE frequency scale.")
-    customrope_base_entry, customrope_base_label = makelabelentry(tokens_tab, "RoPE Base:", customrope_base, row=24, padx=(160 if corrupt_scaler else 100), singleline=True, tooltip="For NTK Aware Scaling. RoPE frequency base.")
+    customrope_base_entry, customrope_base_label = makelabelentry(tokens_tab, "Base:", customrope_base, row=23, padx=(420 if corrupt_scaler else 220), singleline=True, tooltip="For NTK Aware Scaling. RoPE frequency base.",labelpadx=(280 if corrupt_scaler else 180))
     def togglerope(a,b,c):
         if customrope_var.get() == 1:
             manualropebox.grid()
@@ -6143,6 +6151,7 @@ def export_vars():
         args.noshift = contextshift_var.get()==0
         args.nofastforward = fastforward_var.get()==0
         args.useswa = swa_var.get()==1
+        args.smartcache = smartcache_var.get()==1
         args.remotetunnel = remotetunnel_var.get()==1
         args.foreground = keepforeground.get()==1
         args.cli = terminalonly.get()==1
@@ -6364,6 +6373,7 @@ def import_vars(dict):
         contextshift_var.set(0 if "noshift" in dict and dict["noshift"] else 1)
         fastforward_var.set(0 if "nofastforward" in dict and dict["nofastforward"] else 1)
         swa_var.set(1 if "useswa" in dict and dict["useswa"] else 0)
+        smartcache_var.set(1 if "smartcache" in dict and dict["smartcache"] else 0)
         remotetunnel_var.set(1 if "remotetunnel" in dict and dict["remotetunnel"] else 0)
         keepforeground.set(1 if "foreground" in dict and dict["foreground"] else 0)
         terminalonly.set(1 if "cli" in dict and dict["cli"] else 0)
@@ -8335,6 +8345,7 @@ def range_checker(arg: str):
     advparser.add_argument("--noshift","--no-context-shift", help="If set, do not attempt to Trim and Shift the GGUF context.", action='store_true')
     advparser.add_argument("--nofastforward", help="If set, do not attempt to fast forward GGUF context (always reprocess). Will also enable noshift", action='store_true')
     advparser.add_argument("--useswa", help="If set, allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", action='store_true')
+    advparser.add_argument("--smartcache", help="Enables intelligent context switching by saving KV cache snapshots to RAM. Requires fast forwarding.", action='store_true')
     advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
     advparser.add_argument("--overridenativecontext", help="Overrides the native trained context of the loaded model with a custom value to be used for Rope scaling.",metavar=('[trained context]'), type=int, default=0)
     compatgroup3 = advparser.add_mutually_exclusive_group()
