speculative: use slot.cache_tokens.insert

ngxson · ngxson · commit bef122e1a532 · 2025-05-08T13:22:06.000+02:00
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -3484,9 +3484,7 @@ struct server_context {
                 slot.n_draft_accepted += ids.size() - 1;
 
                 slot.cache_tokens.push_back(id);
-                for (auto & t : ids) {
-                    slot.cache_tokens.push_back(t);
-                }
+                slot.cache_tokens.insert(ids);
 
                 llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
 
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
@@ -1124,7 +1124,7 @@ struct server_tokens {
     }
 
     // for compatibility with context shift and prompt truncation
-    void insert(llama_tokens & inp_tokens) {
+    void insert(const llama_tokens & inp_tokens) {
         GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
         tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
     }

Original file line number	Diff line number	Diff line change
`@@ -1124,7 +1124,7 @@ struct server_tokens {`
`1124`	`1124`	`}`
`1125`	`1125`
`1126`	`1126`	`// for compatibility with context shift and prompt truncation`
`1127`		`- void insert(llama_tokens & inp_tokens) {`
	`1127`	`+ void insert(const llama_tokens & inp_tokens) {`
`1128`	`1128`	`GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled`
`1129`	`1129`	`tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());`
`1130`	`1130`	`}`