Skip to content

Commit f97bbdd

Browse files
committed
Fix to allow all EOGs to trigger a stop; Occam's GLM4 fix
1 parent bd7a40f commit f97bbdd

File tree

6 files changed

+54
-22
lines changed

6 files changed

+54
-22
lines changed

gpttype_adapter.cpp

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -301,14 +301,29 @@ static int GetEosID(FileFormat file_format, int32_t n_vocab)
301301
}
302302
return eosID;
303303
}
304-
static int GetEotID(FileFormat file_format)
304+
305+
static std::vector<int> GetEogIDs(FileFormat file_format, int32_t n_vocab)
305306
{
307+
std::vector<int> alleogs;
308+
int eos = GetEosID(file_format, n_vocab);
306309
if(file_format == FileFormat::GGUF_GENERIC)
307310
{
308311
const llama_vocab * tmpvocab = llama_model_get_vocab(llama_get_model(llama_ctx_v4));
309-
return llama_vocab_eot(tmpvocab);
312+
int eot = llama_vocab_eot(tmpvocab);
313+
std::set<int> eogs = tmpvocab->get_eogs();
314+
if (eot >= 0) {
315+
eogs.insert(eot);
316+
}
317+
if (eos >= 0) {
318+
eogs.insert(eos);
319+
}
320+
alleogs = std::vector<int>(eogs.begin(), eogs.end());
321+
} else {
322+
if (eos >= 0) {
323+
alleogs.push_back(eos);
324+
}
310325
}
311-
return -1;
326+
return alleogs;
312327
}
313328

314329
static float LowestLogit(const std::vector<float> & logits)
@@ -1550,16 +1565,16 @@ void sample_grammar(FileFormat file_format, int32_t n_vocab, llama_token_data_ar
15501565
}
15511566
}
15521567

1553-
const llama_token eos = GetEosID(file_format,n_vocab);
1554-
const llama_token eot = GetEotID(file_format);
1568+
const std::vector<llama_token> eog_tokens = GetEogIDs(file_format,n_vocab);
15551569

15561570
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
15571571
std::vector<llama_grammar_candidate> candidates_grammar;
15581572

15591573
for (size_t i = 0; i < candidates->size; ++i) {
15601574
const llama_token id = candidates->data[i].id;
15611575
const std::string piece = FileFormatTokenizeID(id,file_format);
1562-
if (id == eos || (id==eot && id!=-1)) {
1576+
bool found_eog = std::find(eog_tokens.begin(), eog_tokens.end(), id) != eog_tokens.end();
1577+
if (found_eog) {
15631578
if (!allow_eos) {
15641579
candidates->data[i].logit = -INFINITY;
15651580
}
@@ -1711,7 +1726,9 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
17111726

17121727
static void grammar_accept_token(FileFormat file_format, int32_t n_vocab, struct llama_grammar * grammar, llama_token token)
17131728
{
1714-
if (token == GetEosID(file_format,n_vocab) || (token!=-1 && token == GetEotID(file_format))) {
1729+
const std::vector<llama_token> eog_tokens = GetEogIDs(file_format,n_vocab);
1730+
bool found_eog = std::find(eog_tokens.begin(), eog_tokens.end(), token) != eog_tokens.end();
1731+
if (found_eog) {
17151732
for (const auto & stack : grammar->stacks) {
17161733
if (stack.empty()) {
17171734
return;
@@ -3827,8 +3844,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
38273844
}
38283845
}
38293846

3830-
unsigned int eosID = GetEosID(file_format, n_vocab);
3831-
unsigned int eotID = GetEotID(file_format);
3847+
const std::vector<llama_token> eog_tokens = GetEogIDs(file_format,n_vocab);
38323848
float * logitsPtr;
38333849
float lowestLogit = 0;
38343850
int btsize = banned_token_ids.size();
@@ -3886,13 +3902,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
38863902
if (!inputs.allow_eos_token && !inputs.bypass_eos_token)
38873903
{
38883904
// set the logit of the eos token to very low to avoid sampling it
3889-
if(eosID!=LLAMA_TOKEN_NULL)
3890-
{
3891-
logitsPtr[eosID] = lowestLogit;
3892-
}
3893-
if(eotID!=-1)
3905+
for(int i=0;i<eog_tokens.size();++i)
38943906
{
3895-
logitsPtr[eotID] = lowestLogit;
3907+
logitsPtr[eog_tokens[i]] = lowestLogit;
38963908
}
38973909
}
38983910
if(btsize>0)
@@ -3958,7 +3970,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
39583970
for (auto eid : embd)
39593971
{
39603972
std::string tokenizedstr = FileFormatTokenizeID(eid, file_format, inputs.render_special);
3961-
if(!inputs.render_special && (eid==eosID || (eid==eotID && eid!=-1) || VecContainsIntVal(special_stop_sequence,id))) //extra filter to avoid unwanted special tokens
3973+
bool found_eog = std::find(eog_tokens.begin(), eog_tokens.end(), eid) != eog_tokens.end();
3974+
if(!inputs.render_special && (found_eog || VecContainsIntVal(special_stop_sequence,id))) //extra filter to avoid unwanted special tokens
39623975
{
39633976
tokenizedstr = ""; //prevent render
39643977
}
@@ -4059,7 +4072,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
40594072

40604073
if(!early_abort)
40614074
{
4062-
if(!inputs.bypass_eos_token && inputs.allow_eos_token && (id==eosID || (id==eotID && id!=-1)))
4075+
bool found_eog = std::find(eog_tokens.begin(), eog_tokens.end(), id) != eog_tokens.end();
4076+
if(!inputs.bypass_eos_token && inputs.allow_eos_token && found_eog)
40634077
{
40644078
if(allow_regular_prints)
40654079
{

include/llama.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <stdint.h>
1313
#include <stdio.h>
1414
#include <stdbool.h>
15+
#include <set>
1516

1617
#ifdef LLAMA_SHARED
1718
# if defined(_WIN32) && !defined(__MINGW32__)
@@ -941,6 +942,8 @@ extern "C" {
941942
LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
942943
LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
943944

945+
LLAMA_API std::set<int> llama_vocab_get_eogs(const struct llama_vocab * vocab);
946+
944947
LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
945948
LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
946949

koboldcpp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
dry_seq_break_max = 128
5353

5454
# global vars
55-
KcppVersion = "1.92"
55+
KcppVersion = "1.92.1"
5656
showdebug = True
5757
kcpp_instance = None #global running instance
5858
global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False}

src/llama-graph.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1287,6 +1287,10 @@ ggml_tensor * llm_graph_context::build_attn(
12871287

12881288
if (wo) {
12891289
cur = build_lora_mm(wo, cur);
1290+
if (arch == LLM_ARCH_GLM4) {
1291+
// GLM4 seems to have numerical issues with half-precision accumulators
1292+
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
1293+
}
12901294
}
12911295

12921296
if (wo_b) {
@@ -1367,10 +1371,6 @@ ggml_tensor * llm_graph_context::build_attn(
13671371

13681372
if (wo) {
13691373
cur = build_lora_mm(wo, cur);
1370-
if (arch == LLM_ARCH_GLM4) {
1371-
// GLM4 seems to have numerical issues with half-precision accumulators
1372-
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
1373-
}
13741374
}
13751375

13761376
if (wo_b) {

src/llama-vocab.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1538,6 +1538,7 @@ struct llama_vocab::impl {
15381538
bool is_user_defined(llama_token id) const;
15391539
bool is_unused (llama_token id) const;
15401540
bool is_eog (llama_token id) const;
1541+
std::set<int> get_eogs() const;
15411542

15421543
uint8_t token_to_byte(llama_token id) const;
15431544

@@ -2396,6 +2397,10 @@ bool llama_vocab::impl::is_eog(llama_token id) const {
23962397
return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
23972398
}
23982399

2400+
std::set<int> llama_vocab::impl::get_eogs() const {
2401+
return special_eog_ids;
2402+
}
2403+
23992404
uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
24002405
GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
24012406
GGML_ASSERT(is_byte(id));
@@ -3121,6 +3126,10 @@ bool llama_vocab::is_eog(llama_token id) const {
31213126
return pimpl->is_eog(id);
31223127
}
31233128

3129+
std::set<int> llama_vocab::get_eogs() const {
3130+
return pimpl->get_eogs();
3131+
}
3132+
31243133
uint8_t llama_vocab::token_to_byte(llama_token id) const {
31253134
return pimpl->token_to_byte(id);
31263135
}
@@ -3431,6 +3440,11 @@ llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
34313440
return vocab->token_eot();
34323441
}
34333442

3443+
std::set<int> llama_vocab_get_eogs(const struct llama_vocab * vocab)
3444+
{
3445+
return vocab->get_eogs();
3446+
}
3447+
34343448
// deprecated
34353449
llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
34363450
return vocab->token_bos();

src/llama-vocab.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ struct llama_vocab {
4040
bool is_user_defined(llama_token id) const;
4141
bool is_unused (llama_token id) const;
4242
bool is_eog (llama_token id) const;
43+
std::set<int> get_eogs() const;
4344

4445
uint8_t token_to_byte(llama_token id) const;
4546
llama_token byte_to_token(uint8_t ch) const;

0 commit comments

Comments (0)