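Fixed savestates when a draft context is in use: the draft context's state is now included in the size calculation and is saved, restored, and cleared together with the main context's state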

LostRuins · LostRuins · commit 39b0699c7167 · 2025-06-27T20:35:38.000+08:00
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
@@ -4341,13 +4341,19 @@ size_t gpttype_calc_new_state_kv()
     }
     if(file_format == FileFormat::GGUF_GENERIC)
     {
-        return llama_state_get_size(llama_ctx_v4);
+        size_t s1 = llama_state_get_size(llama_ctx_v4);
+        if(draft_ctx)
+        {
+            size_t s2 = llama_state_get_size(draft_ctx);
+            s1 += s2;
+        }
+        return s1;
     }
     return 0;
 }
 size_t gpttype_calc_old_state_kv(int slot)
 {
-    return savestates[slot].current_savestate_size;
+    return savestates[slot].current_savestate_size + savestates[slot].current_draft_savestate_size;
 }
 size_t gpttype_calc_old_state_tokencount(int slot)
 {
@@ -4365,30 +4371,54 @@ size_t gpttype_save_state_kv(int slot)
     }
     if(file_format == FileFormat::GGUF_GENERIC)
     {
+        size_t totalbytes = 0;
         if (!savestates[slot].current_savestate_buffer.empty()) {  //JIT free
             savestates[slot].current_savestate_buffer.clear();
+            savestates[slot].current_draft_savestate_buffer.clear();
             savestates[slot].savestate_context_tokens.clear();
             savestates[slot].current_savestate_size = 0;
+            savestates[slot].current_draft_savestate_size = 0;
         }
         size_t newsize = llama_state_get_size(llama_ctx_v4);
         try {
             if (savestates[slot].current_savestate_buffer.capacity() < newsize + 512) {
-                savestates[slot].current_savestate_buffer = std::vector<uint8_t>(newsize + 512);
+                savestates[slot].current_savestate_buffer = std::vector<uint8_t>(newsize + 512); // add some padding. May throw std::bad_alloc
             } else {
                 savestates[slot].current_savestate_buffer.resize(newsize + 512);
             }
-            savestates[slot].current_savestate_buffer.resize(newsize + 512);  // add some padding. May throw std::bad_alloc
         } catch (const std::bad_alloc&) {
             fprintf(stderr, "KV Save State: Failed to allocate %zu bytes.\n", newsize + 512);
             return 0;
         }
         auto res = llama_state_get_data(llama_ctx_v4, savestates[slot].current_savestate_buffer.data(), newsize);
         if (res > 0) {
+            totalbytes += res;
             savestates[slot].current_savestate_size   = newsize;
             savestates[slot].savestate_context_tokens = current_context_tokens;
             printf("\nKV Save State %d: Created SaveState of %zu tokens, costing %zu MB.\n",slot,current_context_tokens.size(),savestates[slot].current_savestate_size/(1024*1024));
         }
-        return res;
+
+        if(draft_ctx)
+        {
+            size_t newsize2 = llama_state_get_size(draft_ctx);
+            try {
+                if (savestates[slot].current_draft_savestate_buffer.capacity() < newsize2 + 512) {
+                    savestates[slot].current_draft_savestate_buffer = std::vector<uint8_t>(newsize2 + 512);
+                } else {
+                    savestates[slot].current_draft_savestate_buffer.resize(newsize2 + 512);
+                }
+            } catch (const std::bad_alloc&) {
+                fprintf(stderr, "KV Save State: Failed to allocate %zu bytes.\n", newsize2 + 512);
+                return 0;
+            }
+            auto res2 = llama_state_get_data(draft_ctx, savestates[slot].current_draft_savestate_buffer.data(), newsize2);
+            if (res2 > 0) {
+                totalbytes += res2;
+                savestates[slot].current_draft_savestate_size = newsize2;
+                printf("\nKV Save State %d: Created DraftSaveState of %zu tokens, costing %zu MB.\n",slot,current_context_tokens.size(),savestates[slot].current_draft_savestate_size/(1024*1024));
+            }
+        }
+        return totalbytes;
     }
     return 0;
 }
@@ -4408,6 +4438,12 @@ bool gpttype_load_state_kv(int slot)
         {
             current_context_tokens = savestates[slot].savestate_context_tokens;
             printf("\nKV Load SaveState %d: Restored KV with %zu tokens.\n", slot,current_context_tokens.size());
+            if(draft_ctx && savestates[slot].current_draft_savestate_size>0)
+            {
+                llama_memory_clear(llama_get_memory(draft_ctx),true);
+                auto res2 = llama_state_set_data(draft_ctx, savestates[slot].current_draft_savestate_buffer.data(), savestates[slot].current_draft_savestate_size);
+                printf("\nKV Load DraftSaveState %d: Restored KV with %zu tokens.\n", slot,current_context_tokens.size());
+            }
         }
         return (res > 0);
     }
@@ -4432,6 +4468,15 @@ bool gpttype_clear_state_kv(bool shrink)
                 }
                 savestates[slot].savestate_context_tokens.clear();
                 savestates[slot].current_savestate_size = 0;
+                if(draft_ctx && savestates[slot].current_draft_savestate_size>0)
+                {
+                    savestates[slot].current_draft_savestate_buffer.clear();
+                    if(shrink)
+                    {
+                        savestates[slot].current_draft_savestate_buffer.shrink_to_fit();
+                    }
+                    savestates[slot].current_draft_savestate_size = 0;
+                }
             }
         }
         return true;
diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h
@@ -521,6 +521,8 @@ struct savestate_data
 {
     size_t current_savestate_size = 0;
     std::vector<uint8_t> current_savestate_buffer;
+    size_t current_draft_savestate_size = 0;
+    std::vector<uint8_t> current_draft_savestate_buffer;
     std::vector<gpt_vocab::id> savestate_context_tokens; //for context clones
 };
 

Original file line number	Diff line number	Diff line change
`@@ -4341,13 +4341,19 @@ size_t gpttype_calc_new_state_kv()`
`4341`	`4341`	`}`
`4342`	`4342`	`if(file_format == FileFormat::GGUF_GENERIC)`
`4343`	`4343`	`{`
`4344`		`- return llama_state_get_size(llama_ctx_v4);`
	`4344`	`+ size_t s1 = llama_state_get_size(llama_ctx_v4);`
	`4345`	`+ if(draft_ctx)`
	`4346`	`+ {`
	`4347`	`+ size_t s2 = llama_state_get_size(draft_ctx);`
	`4348`	`+ s1 += s2;`
	`4349`	`+ }`
	`4350`	`+ return s1;`
`4345`	`4351`	`}`
`4346`	`4352`	`return 0;`
`4347`	`4353`	`}`
`4348`	`4354`	`size_t gpttype_calc_old_state_kv(int slot)`
`4349`	`4355`	`{`
`4350`		`- return savestates[slot].current_savestate_size;`
	`4356`	`+ return savestates[slot].current_savestate_size + savestates[slot].current_draft_savestate_size;`
`4351`	`4357`	`}`
`4352`	`4358`	`size_t gpttype_calc_old_state_tokencount(int slot)`
`4353`	`4359`	`{`
`@@ -4365,30 +4371,54 @@ size_t gpttype_save_state_kv(int slot)`
`4365`	`4371`	`}`
`4366`	`4372`	`if(file_format == FileFormat::GGUF_GENERIC)`
`4367`	`4373`	`{`
	`4374`	`+ size_t totalbytes = 0;`
`4368`	`4375`	`if (!savestates[slot].current_savestate_buffer.empty()) { //JIT free`
`4369`	`4376`	`savestates[slot].current_savestate_buffer.clear();`
	`4377`	`+ savestates[slot].current_draft_savestate_buffer.clear();`
`4370`	`4378`	`savestates[slot].savestate_context_tokens.clear();`
`4371`	`4379`	`savestates[slot].current_savestate_size = 0;`
	`4380`	`+ savestates[slot].current_draft_savestate_size = 0;`
`4372`	`4381`	`}`
`4373`	`4382`	`size_t newsize = llama_state_get_size(llama_ctx_v4);`
`4374`	`4383`	`try {`
`4375`	`4384`	`if (savestates[slot].current_savestate_buffer.capacity() < newsize + 512) {`
`4376`		`- savestates[slot].current_savestate_buffer = std::vector<uint8_t>(newsize + 512);`
	`4385`	`+ savestates[slot].current_savestate_buffer = std::vector<uint8_t>(newsize + 512); // add some padding. May throw std::bad_alloc`
`4377`	`4386`	`} else {`
`4378`	`4387`	`savestates[slot].current_savestate_buffer.resize(newsize + 512);`
`4379`	`4388`	`}`
`4380`		`- savestates[slot].current_savestate_buffer.resize(newsize + 512); // add some padding. May throw std::bad_alloc`
`4381`	`4389`	`} catch (const std::bad_alloc&) {`
`4382`	`4390`	`fprintf(stderr, "KV Save State: Failed to allocate %zu bytes.\n", newsize + 512);`
`4383`	`4391`	`return 0;`
`4384`	`4392`	`}`
`4385`	`4393`	`auto res = llama_state_get_data(llama_ctx_v4, savestates[slot].current_savestate_buffer.data(), newsize);`
`4386`	`4394`	`if (res > 0) {`
	`4395`	`+ totalbytes += res;`
`4387`	`4396`	`savestates[slot].current_savestate_size = newsize;`
`4388`	`4397`	`savestates[slot].savestate_context_tokens = current_context_tokens;`
`4389`	`4398`	`printf("\nKV Save State %d: Created SaveState of %zu tokens, costing %zu MB.\n",slot,current_context_tokens.size(),savestates[slot].current_savestate_size/(1024*1024));`
`4390`	`4399`	`}`
`4391`		`- return res;`
	`4400`	`+`
	`4401`	`+ if(draft_ctx)`
	`4402`	`+ {`
	`4403`	`+ size_t newsize2 = llama_state_get_size(draft_ctx);`
	`4404`	`+ try {`
	`4405`	`+ if (savestates[slot].current_draft_savestate_buffer.capacity() < newsize2 + 512) {`
	`4406`	`+ savestates[slot].current_draft_savestate_buffer = std::vector<uint8_t>(newsize2 + 512);`
	`4407`	`+ } else {`
	`4408`	`+ savestates[slot].current_draft_savestate_buffer.resize(newsize2 + 512);`
	`4409`	`+ }`
	`4410`	`+ } catch (const std::bad_alloc&) {`
	`4411`	`+ fprintf(stderr, "KV Save State: Failed to allocate %zu bytes.\n", newsize2 + 512);`
	`4412`	`+ return 0;`
	`4413`	`+ }`
	`4414`	`+ auto res2 = llama_state_get_data(draft_ctx, savestates[slot].current_draft_savestate_buffer.data(), newsize2);`
	`4415`	`+ if (res2 > 0) {`
	`4416`	`+ totalbytes += res2;`
	`4417`	`+ savestates[slot].current_draft_savestate_size = newsize2;`
	`4418`	`+ printf("\nKV Save State %d: Created DraftSaveState of %zu tokens, costing %zu MB.\n",slot,current_context_tokens.size(),savestates[slot].current_draft_savestate_size/(1024*1024));`
	`4419`	`+ }`
	`4420`	`+ }`
	`4421`	`+ return totalbytes;`
`4392`	`4422`	`}`
`4393`	`4423`	`return 0;`
`4394`	`4424`	`}`
`@@ -4408,6 +4438,12 @@ bool gpttype_load_state_kv(int slot)`
`4408`	`4438`	`{`
`4409`	`4439`	`current_context_tokens = savestates[slot].savestate_context_tokens;`
`4410`	`4440`	`printf("\nKV Load SaveState %d: Restored KV with %zu tokens.\n", slot,current_context_tokens.size());`
	`4441`	`+ if(draft_ctx && savestates[slot].current_draft_savestate_size>0)`
	`4442`	`+ {`
	`4443`	`+ llama_memory_clear(llama_get_memory(draft_ctx),true);`
	`4444`	`+ auto res2 = llama_state_set_data(draft_ctx, savestates[slot].current_draft_savestate_buffer.data(), savestates[slot].current_draft_savestate_size);`
	`4445`	`+ printf("\nKV Load DraftSaveState %d: Restored KV with %zu tokens.\n", slot,current_context_tokens.size());`
	`4446`	`+ }`
`4411`	`4447`	`}`
`4412`	`4448`	`return (res > 0);`
`4413`	`4449`	`}`
`@@ -4432,6 +4468,15 @@ bool gpttype_clear_state_kv(bool shrink)`
`4432`	`4468`	`}`
`4433`	`4469`	`savestates[slot].savestate_context_tokens.clear();`
`4434`	`4470`	`savestates[slot].current_savestate_size = 0;`
	`4471`	`+ if(draft_ctx && savestates[slot].current_draft_savestate_size>0)`
	`4472`	`+ {`
	`4473`	`+ savestates[slot].current_draft_savestate_buffer.clear();`
	`4474`	`+ if(shrink)`
	`4475`	`+ {`
	`4476`	`+ savestates[slot].current_draft_savestate_buffer.shrink_to_fit();`
	`4477`	`+ }`
	`4478`	`+ savestates[slot].current_draft_savestate_size = 0;`
	`4479`	`+ }`
`4435`	`4480`	`}`
`4436`	`4481`	`}`
`4437`	`4482`	`return true;`
Original file line number	Diff line number	Diff line change
`@@ -521,6 +521,8 @@ struct savestate_data`
`521`	`521`	`{`
`522`	`522`	`size_t current_savestate_size = 0;`
`523`	`523`	`std::vector<uint8_t> current_savestate_buffer;`
	`524`	`+ size_t current_draft_savestate_size = 0;`
	`525`	`+ std::vector<uint8_t> current_draft_savestate_buffer;`
`524`	`526`	`std::vector<gpt_vocab::id> savestate_context_tokens; //for context clones`
`525`	`527`	`};`
`526`	`528`