@@ -630,7 +630,7 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll
     {
         const llama_vocab * tmpvocab = llama_model_get_vocab(draftmodel);
         int draftvocab = llama_vocab_n_tokens(tmpvocab);
-        if (llama_model_is_recurrent(draftmodel))
+        if (llama_model_is_recurrent(draftmodel) || llama_model_is_hybrid(draftmodel))
         {
             printf("Error: Speculative decoding cannot be used with Recurrent draft models!\n");
             llama_free(draft_ctx);
@@ -2523,7 +2523,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
     if (draftmodel_filename != "" && file_format == FileFormat::GGUF_GENERIC)
     {
-        if (llama_model_is_recurrent(llamamodel))
+        if (llama_model_is_recurrent(llamamodel) || llama_model_is_hybrid(llamamodel))
         {
             printf("Error: Speculative decoding cannot be used with Recurrent models!\n");
         }
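
Note: both hunks above extend the same guard. Speculative decoding was already rejected for purely recurrent models; hybrid models (attention layers mixed with recurrent/SSM layers) also carry per-sequence recurrent state, so llama_model_is_hybrid() is now checked alongside llama_model_is_recurrent() for both the draft model and the main model. A minimal sketch of the shared condition, using only the two llama.cpp predicates that appear in the diff (the helper name is hypothetical and not part of this commit):

#include "llama.h"

// Hypothetical helper: a model is unsuitable for speculative decoding whenever it
// keeps recurrent state, which covers pure recurrent models and hybrid models alike.
static bool has_recurrent_state(const llama_model * model)
{
    return llama_model_is_recurrent(model) || llama_model_is_hybrid(model);
}
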
@@ -3758,7 +3758,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     if (file_format == FileFormat::GGUF_GENERIC)
     {
         const llama_model * mdl = llama_get_model(llama_ctx_v4);
-        if (llama_model_is_recurrent(mdl) || llama_model_is_hybrid(mdl))
+        if (llama_model_is_recurrent(mdl) || llama_model_is_hybrid(mdl) || file_format_meta.model_architecture == GGUFArch::ARCH_MAMBALIKE || file_format_meta.model_architecture == GGUFArch::ARCH_RWKV)
         {
             is_recurrent = true;
         }
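
Note: is_recurrent previously relied only on what the loaded model reports about itself; this change additionally consults the GGUF file-format metadata, so models classified at load time as Mamba-like or RWKV are treated as recurrent even when the runtime flags do not say so. A sketch of the combined test factored into one predicate (illustrative only; GGUFArch and its ARCH_* values are taken from the diff, the helper itself is not part of the commit):

// Illustrative consolidation of the four-way test above.
static bool model_uses_recurrent_state(const llama_model * mdl, GGUFArch arch)
{
    return llama_model_is_recurrent(mdl)       // pure recurrent, reported by llama.cpp
        || llama_model_is_hybrid(mdl)          // hybrid attention + recurrent layers
        || arch == GGUFArch::ARCH_MAMBALIKE    // Mamba-like per file-format metadata
        || arch == GGUFArch::ARCH_RWKV;        // RWKV per file-format metadata
}
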
@@ -3789,6 +3789,22 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 embd_inp.push_back(current_context_tokens[current_context_tokens.size()-1]);
                 n_past -= 1;
             }
+            else if (embd_inp.size() > 0 && current_context_tokens.size() > 0 && last_n_tokens.size() > 0)
+            {
+                int maxedpos = llama_memory_seq_pos_max(llama_get_memory(llama_ctx_v4), 0);
+                if (maxedpos + 2 == n_past)
+                {
+                    // kcpp: a very dirty hack for rnn models. this happens because the very last token of the last turn
+                    // does not actually get processed but is still added to current_context_tokens. if the instruct start tag starts with that same token
+                    // it might get wrongly fast forwarded and we will get an off by 1 error.
+                    // todo: figure out a better way to solve this rubbish
+                    int tail = last_n_tokens[last_n_tokens.size()-1];
+                    last_n_tokens.pop_back();
+                    current_context_tokens.pop_back();
+                    n_past -= 1;
+                    embd_inp.insert(embd_inp.begin(), 1, tail);
+                }
+            }
         }
     }
     else
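
Note: the new else-if branch handles a bookkeeping mismatch specific to recurrent-style models. According to the comment in the hunk, the final token of the previous turn is appended to current_context_tokens (and counted in n_past) without ever being decoded, so llama_memory_seq_pos_max() reports one position fewer than expected (maxedpos + 2 == n_past with zero-based positions). If the next prompt starts with that same token, context fast-forwarding skips it and decoding ends up off by one. The rollback pops the undecoded token out of the history, decrements n_past, and prepends the token to embd_inp so it is actually processed this turn. A standalone sketch of the same rollback, with the llama.cpp call replaced by a plain integer (everything below is illustrative, not kcpp code):

#include <cstdio>
#include <vector>

int main()
{
    // Previous turn ended with token 13; it was recorded in the history but never decoded.
    std::vector<int> current_context_tokens = {10, 11, 12, 13};
    std::vector<int> last_n_tokens          = {10, 11, 12, 13};
    int n_past          = 4;  // bookkeeping claims four positions are done
    int decoded_max_pos = 2;  // stand-in for llama_memory_seq_pos_max(): only positions 0..2 were decoded

    // New turn: fast-forwarding also matched token 13, so only genuinely new tokens remain queued.
    std::vector<int> embd_inp = {20, 21};

    if (decoded_max_pos + 2 == n_past) // one-token gap between bookkeeping and decoded state
    {
        int tail = last_n_tokens.back();
        last_n_tokens.pop_back();
        current_context_tokens.pop_back();
        n_past -= 1;
        embd_inp.insert(embd_inp.begin(), 1, tail); // re-queue the undecoded token
    }

    printf("n_past=%d, queued: %d %d %d\n", n_past, embd_inp[0], embd_inp[1], embd_inp[2]);
    // expected output: n_past=3, queued: 13 20 21
    return 0;
}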