getnamo
diff --git a/‎Llama.uplugin‎
Lines changed: 1 addition & 1 deletion b/‎Llama.uplugin‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Source/LlamaCore/Private/Internal/LlamaInternal.cpp‎
Lines changed: 7 additions & 7 deletions b/‎Source/LlamaCore/Private/Internal/LlamaInternal.cpp‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎ThirdParty/LlamaCpp/Include/common/common.h‎
Lines changed: 1 addition & 1 deletion b/‎ThirdParty/LlamaCpp/Include/common/common.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ThirdParty/LlamaCpp/Include/ggml.h‎
Lines changed: 24 additions & 0 deletions b/‎ThirdParty/LlamaCpp/Include/ggml.h‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎ThirdParty/LlamaCpp/Include/llama.h‎
Lines changed: 87 additions & 22 deletions b/‎ThirdParty/LlamaCpp/Include/llama.h‎
Lines changed: 87 additions & 22 deletions
diff --git a/‎ThirdParty/LlamaCpp/Lib/Win64/common.lib‎
5.46 KB b/‎ThirdParty/LlamaCpp/Lib/Win64/common.lib‎
5.46 KB
diff --git a/‎ThirdParty/LlamaCpp/Lib/Win64/ggml-base.lib‎
614 Bytes b/‎ThirdParty/LlamaCpp/Lib/Win64/ggml-base.lib‎
614 Bytes
diff --git a/‎ThirdParty/LlamaCpp/Lib/Win64/llama.lib‎
3.11 KB b/‎ThirdParty/LlamaCpp/Lib/Win64/llama.lib‎
3.11 KB
@@ -1,7 +1,7 @@
 {
 	"FileVersion": 3,
 	"Version": 1,
-	"VersionName": "0.9.2",
+	"VersionName": "0.9.3",
 	"FriendlyName": "Llama",
 	"Description": "Llama.cpp plugin for large language model (LLM) inference.",
 	"Category": "LLM",
 
@@ -273,7 +273,7 @@ int32 FLlamaInternal::UsedContext()
 {
     if (Context)
     {
-        return llama_get_kv_cache_used_cells(Context);
+        return llama_kv_self_used_cells(Context);
     }
     else
     {
@@ -313,7 +313,7 @@ void FLlamaInternal::ResetContextHistory(bool bKeepSystemsPrompt)
     ContextHistory.clear();
     Messages.clear();
 
-    llama_kv_cache_clear(Context);
+    llama_kv_self_clear(Context);
     FilledContextCharLength = 0;
 }
 
@@ -322,7 +322,7 @@ void FLlamaInternal::RollbackContextHistoryByTokens(int32 NTokensToErase)
     // clear the last n_regen tokens from the KV cache and update n_past
-    int32 TokensUsed = llama_get_kv_cache_used_cells(Context); //FilledContextCharLength
+    int32 TokensUsed = llama_kv_self_used_cells(Context); //FilledContextCharLength
 
-    llama_kv_cache_seq_rm(Context, 0, TokensUsed - NTokensToErase, -1);
+    llama_kv_self_seq_rm(Context, 0, TokensUsed - NTokensToErase, -1);
 
     //FilledContextCharLength -= NTokensToErase;
 
@@ -442,7 +442,7 @@ int32 FLlamaInternal::ProcessPrompt(const std::string& Prompt, EChatTemplateRole
 
     //Grab vocab
     const llama_vocab* Vocab = llama_model_get_vocab(LlamaModel);
-    const bool IsFirst = llama_get_kv_cache_used_cells(Context) == 0;
+    const bool IsFirst = llama_kv_self_used_cells(Context) == 0;
 
     // tokenize the prompt
     const int NPromptTokens = -llama_tokenize(Vocab, Prompt.c_str(), Prompt.size(), NULL, 0, IsFirst, true);
@@ -461,7 +461,7 @@ int32 FLlamaInternal::ProcessPrompt(const std::string& Prompt, EChatTemplateRole
 
         //check sizing before running prompt decode
         int NContext = llama_n_ctx(Context);
-        int NContextUsed = llama_get_kv_cache_used_cells(Context);
+        int NContextUsed = llama_kv_self_used_cells(Context);
 
         if (NContextUsed + NPromptTokens > NContext)
         {
@@ -506,7 +506,7 @@ int32 FLlamaInternal::ProcessPrompt(const std::string& Prompt, EChatTemplateRole
 
             // Check context before running decode
             int NContext = llama_n_ctx(Context);
-            int NContextUsed = llama_get_kv_cache_used_cells(Context);
+            int NContextUsed = llama_kv_self_used_cells(Context);
 
             if (NContextUsed + BatchTokens.size() > NContext)
             {
@@ -563,7 +563,7 @@ std::string FLlamaInternal::Generate(const std::string& Prompt, bool bAppendToMe
 
     // check if we have enough space in the context to evaluate this batch - might need to be inside loop
     int NContext = llama_n_ctx(Context);
-    int NContextUsed = llama_get_kv_cache_used_cells(Context);
+    int NContextUsed = llama_kv_self_used_cells(Context);
     bool bEOGExit = false;
 
     while (bGenerationActive) //processing can be aborted by flipping the boolean
 
@@ -36,7 +36,7 @@ using llama_tokens = std::vector<llama_token>;
 
 // build info
 int LLAMA_BUILD_NUMBER = 0;
-const char* LLAMA_COMMIT = "f08f4b3187b691bb08a8884ed39ebaa94e956707";
+const char* LLAMA_COMMIT = "ef19c71769681a0b3dde6bc90911728376e5d236";
 const char* LLAMA_COMPILER = "";
 const char* LLAMA_BUILD_TARGET = "Vulkan - Unreal";
 
 
@@ -454,6 +454,7 @@ extern "C" {
         GGML_OP_RMS_NORM,
         GGML_OP_RMS_NORM_BACK,
         GGML_OP_GROUP_NORM,
+        GGML_OP_L2_NORM,
 
         GGML_OP_MUL_MAT,
         GGML_OP_MUL_MAT_ID,
@@ -502,6 +503,7 @@ extern "C" {
         GGML_OP_ADD_REL_POS,
         GGML_OP_RWKV_WKV6,
         GGML_OP_GATED_LINEAR_ATTN,
+        GGML_OP_RWKV_WKV7,
 
         GGML_OP_UNARY,
 
@@ -1095,6 +1097,18 @@ extern "C" {
             int                   n_groups,
             float                 eps);
 
+    // l2 normalize along rows
+    // used in rwkv v7
+    GGML_API struct ggml_tensor * ggml_l2_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 eps);
+
+    GGML_API struct ggml_tensor * ggml_l2_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 eps);
+
     // a - x
     // b - dy
     GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -1890,6 +1904,16 @@ extern "C" {
             struct ggml_tensor  * state,
             float scale);
 
+    GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * r,
+            struct ggml_tensor  * w,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * state);
+
     // custom operators
 
     typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
 
@@ -60,6 +60,7 @@ extern "C" {
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
+    struct llama_kv_cache;
 
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
@@ -106,6 +107,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
         LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
+        LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
     };
 
     enum llama_rope_type {
@@ -469,7 +471,8 @@ extern "C" {
     DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
 
     LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API    struct llama_kv_cache * llama_get_kv_self (      struct llama_context * ctx);
+    LLAMA_API  enum llama_pooling_type   llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
 
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
@@ -586,7 +589,7 @@ extern "C" {
     // KV cache
     //
 
-    // TODO: remove llama_kv_cache_view_* API
+    // TODO: start using struct llama_kv_cache
 
     // Information associated with an individual cell in the KV cache view.
     struct llama_kv_cache_view_cell {
@@ -641,21 +644,27 @@ extern "C" {
 
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "use llama_kv_self_n_tokens instead");
 
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
+            "use llama_kv_self_used_cells instead");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_cache_clear(
+    LLAMA_API void llama_kv_self_clear(
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0,  p1]
     // p1 < 0     : [p0, inf)
-    LLAMA_API bool llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_self_seq_rm(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
@@ -665,25 +674,25 @@ extern "C" {
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_cp(
+    LLAMA_API void llama_kv_self_seq_cp(
             struct llama_context * ctx,
                     llama_seq_id   seq_id_src,
                     llama_seq_id   seq_id_dst,
                        llama_pos   p0,
                        llama_pos   p1);
 
     // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_kv_cache_seq_keep(
+    LLAMA_API void llama_kv_self_seq_keep(
             struct llama_context * ctx,
                     llama_seq_id   seq_id);
 
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_add(
+    LLAMA_API void llama_kv_self_seq_add(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
@@ -693,35 +702,87 @@ extern "C" {
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_div(
+    LLAMA_API void llama_kv_self_seq_div(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
                        llama_pos   p1,
                              int   d);
 
     // Returns the largest position present in the KV cache for the specified sequence
-    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
             struct llama_context * ctx,
-                    llama_seq_id   seq_id);
-
-    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
-    //       how to avoid this?
+                     llama_seq_id   seq_id);
 
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
-    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
+    //   - explicitly with llama_kv_self_update()
+    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+
+    // Check if the context supports KV cache shifting
+    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
 
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
+    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx),
+            "use llama_kv_self_clear instead");
+
+    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1),
+            "use llama_kv_self_seq_rm instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id_src,
+                    llama_seq_id   seq_id_dst,
+                       llama_pos   p0,
+                       llama_pos   p1),
+            "use llama_kv_self_seq_cp instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id),
+            "use llama_kv_self_seq_keep instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                       llama_pos   delta),
+            "use llama_kv_self_seq_add instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                             int   d),
+            "use llama_kv_self_seq_div instead");
+
+    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id),
+            "use llama_kv_self_seq_pos_max instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
+            "use llama_kv_self_defrag instead");
+
+    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
+            "use llama_kv_self_can_shift instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
+            "use llama_kv_self_update instead");
 
-    // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
 
     //
     // State / sessions
@@ -885,6 +946,10 @@ extern "C" {
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
 
+    // Set whether the model is in warmup mode or not
+    // If true, all model tensors are activated during llama_decode() to load and cache their weights.
+    LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`{`
`2`	`2`	`"FileVersion": 3,`
`3`	`3`	`"Version": 1,`
`4`		`- "VersionName": "0.9.2",`
	`4`	`+ "VersionName": "0.9.3",`
`5`	`5`	`"FriendlyName": "Llama",`
`6`	`6`	`"Description": "Llama.cpp plugin for large language model (LLM) inference.",`
`7`	`7`	`"Category": "LLM",`