
Commit 50f2425: WIP
Parent: 01f466c

22 files changed (+378, -527 lines)

LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ public void GlobalCleanup()
     {
         if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
         {
-            Executor.Context.NativeHandle.KvCacheClear();
+            Executor.Context.NativeHandle.MemoryClear();
         }
     }

LLama.Examples/Examples/LlavaInteractiveModeExecute.cs

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ public static async Task Run()
     // When the prompt contains images we clear KV_CACHE to restart conversation
     // See:
     // https://github.com/ggerganov/llama.cpp/discussions/3620
-    ex.Context.NativeHandle.KvCacheRemove( LLamaSeqId.Zero, -1, -1 );
+    ex.Context.NativeHandle.MemorySequenceRemove( LLamaSeqId.Zero, -1, -1 );

     int index = 0;
     foreach (var path in imagePathsWithCurlyBraces)
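
Note: throughout this commit the KvCache* wrappers are renamed to Memory*, tracking llama.cpp's move from the llama_kv_self_* functions to the llama_memory_* API. A minimal sketch of the two reset patterns touched in these files, written as hypothetical helpers (not part of this commit), assuming the -1, -1 bounds keep llama.cpp's convention that a negative position means "open ended":

using LLama;
using LLama.Native;

static class MemoryResetSketch
{
    // Hypothetical helper. Drops every cached token of a single sequence,
    // leaving other sequences intact: p0 = -1 means "from the start",
    // p1 = -1 means "to the end of the sequence".
    public static void ResetConversation(LLamaContext context, LLamaSeqId seq)
        => context.NativeHandle.MemorySequenceRemove(seq, -1, -1);

    // Drops the cached state of all sequences at once, as the benchmark
    // cleanup above and ChatSession.LoadSession below do.
    public static void ResetEverything(LLamaContext context)
        => context.NativeHandle.MemoryClear();
}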

LLama/Abstractions/IContextParams.cs

Lines changed: 14 additions & 2 deletions
@@ -109,8 +109,7 @@ public interface IContextParams
     bool FlashAttention { get; }

     /// <summary>
-    /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt; 0 to disable (default)
-    /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to <see langword="null"/> or &lt; 0 to disable (default)
+    /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt;= 0 to disable (default)
     /// </summary>
     float? DefragThreshold { get; }

@@ -123,4 +122,17 @@ public interface IContextParams
     /// Attention type to use for embeddings
     /// </summary>
     LLamaAttentionType AttentionType { get; }
+
+    /// <summary>
+    /// Offload host tensor operations to device
+    /// </summary>
+    bool? OpOffload { get; }
+
+    /// <summary>
+    /// Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+    /// </summary>
+    /// <remarks>Setting to false when n_seq_max > 1 can cause bad performance in some cases
+    /// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+    /// </remarks>
+    bool? SwaFull { get; }
 }
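
The two new nullable properties surface llama.cpp's swa_full and op_offload context flags; leaving them null keeps the llama.cpp default (see the IContextParamsExtensions change below, which only copies a value when HasValue is true). A hedged sketch of setting them, assuming ModelParams (the stock IContextParams implementation) gains the new properties:

using LLama;
using LLama.Common;

// Sketch only: assumes ModelParams implements the new IContextParams members.
var parameters = new ModelParams("model.gguf")
{
    // Keep the full-size sliding-window-attention cache; per the doc comment
    // above, false with n_seq_max > 1 can hurt performance in some cases.
    SwaFull = true,
    // Leave host tensor-op offloading at the llama.cpp default.
    OpOffload = null,
};

using var weights = LLamaWeights.LoadFromFile(parameters);
using var context = weights.CreateContext(parameters);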

LLama/Batched/Conversation.cs

Lines changed: 6 additions & 6 deletions
@@ -84,7 +84,7 @@ public void Dispose()
     _disposed = true;

     // Remove this conversation from the KV cache
-    Executor.Context.NativeHandle.KvCacheRemove(ConversationId, -1, -1);
+    Executor.Context.NativeHandle.MemorySequenceRemove(ConversationId, -1, -1);

     // Prevent finalizer from running
     GC.SuppressFinalize(this);
@@ -129,7 +129,7 @@ public Conversation Fork()
     _forked = true;

     // Assign tokens to the new sequence
-    Executor.Context.NativeHandle.KvCacheSequenceCopy(ConversationId, c.ConversationId, 0, _end);
+    Executor.Context.NativeHandle.MemorySequenceCopy(ConversationId, c.ConversationId, 0, _end);

     return c;
 }
@@ -406,7 +406,7 @@ internal KvAccessor(Conversation conversation)
     /// <param name="end">End position (exclusive)</param>
     public void Remove(LLamaPos start, LLamaPos end)
     {
-        _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
+        _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
     }

     /// <summary>
@@ -420,7 +420,7 @@ public void Remove(LLamaPos start, int count)
         return;

     var end = start.Value + count;
-    _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
+    _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
 }
 #endregion

@@ -435,7 +435,7 @@ public void Remove(LLamaPos start, int count)
     /// <param name="delta">Amount to add on to each token position</param>
     public void Add(LLamaPos start, LLamaPos end, int delta)
     {
-        _conversation.Executor.Context.NativeHandle.KvCacheSequenceAdd(_conversation.ConversationId, start, end, delta);
+        _conversation.Executor.Context.NativeHandle.MemorySequenceAdd(_conversation.ConversationId, start, end, delta);
     }
 #endregion

@@ -452,7 +452,7 @@ public void Divide(LLamaPos start, LLamaPos end, int divisor)
     if (divisor <= 0)
         throw new ArgumentOutOfRangeException(nameof(divisor));

-    _conversation.Executor.Context.NativeHandle.KvCacheSequenceDivide(_conversation.ConversationId, start, end, divisor);
+    _conversation.Executor.Context.NativeHandle.MemorySequenceDivide(_conversation.ConversationId, start, end, divisor);
 }
 #endregion
}
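
The KvAccessor methods above are the batched-executor route to the same Memory* calls. A rough sketch of a manual context shift through them, assuming Conversation.Modify still hands an (end, KvAccessor) callback to the caller and expects the new end position back (hedged; check the Batched executor docs for the exact delegate shape):

using LLama.Batched;
using LLama.Native;

static void ShiftLeft(Conversation conversation, int keep, int discard)
{
    conversation.Modify((end, kv) =>
    {
        // Drop `discard` tokens immediately after the kept prefix...
        kv.Remove(keep, discard);

        // ...then slide the surviving suffix back so positions stay contiguous.
        kv.Add(keep + discard, end, -discard);

        return end.Value - discard;
    });
}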

LLama/ChatSession.cs

Lines changed: 1 addition & 1 deletion
@@ -199,7 +199,7 @@ public void LoadSession(SessionState state, bool loadTransforms = true)
     }
     if (state.ContextState is null)
     {
-        Executor.Context.NativeHandle.KvCacheClear();
+        Executor.Context.NativeHandle.MemoryClear();
     }
     else
     {

LLama/Extensions/IContextParamsExtensions.cs

Lines changed: 5 additions & 0 deletions
@@ -55,6 +55,11 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo

     result.n_threads = Threads(@params.Threads);
     result.n_threads_batch = Threads(@params.BatchThreads);
+
+    if (@params.SwaFull.HasValue)
+        result.swa_full = @params.SwaFull.Value;
+    if (@params.OpOffload.HasValue)
+        result.op_offload = @params.OpOffload.Value;
 }

 private static int Threads(int? value)

LLama/LLamaExecutorBase.cs

Lines changed: 7 additions & 6 deletions
@@ -128,15 +128,16 @@ public StatefulExecutorBase WithSessionFile(string filename)
     }
     if (File.Exists(filename))
     {
-        _logger?.LogInformation($"[LLamaExecutor] Attempting to load saved session from {filename}");
+        _logger?.LogInformation("[LLamaExecutor] Attempting to load saved session from {0}", filename);
+
         var session_tokens = new LLamaToken[Context.ContextSize];
         if (!NativeApi.llama_state_load_file(Context.NativeHandle, _pathSession, session_tokens, (ulong)Context.ContextSize, out var n_token_count_out))
         {
             _logger?.LogError($"[LLamaExecutor] Failed to load session file {filename}");
             throw new RuntimeError($"Failed to load session file {_pathSession}");
         }
         _session_tokens = session_tokens.Take((int)n_token_count_out).ToList();
-        _logger?.LogInformation($"[LLamaExecutor] Loaded a session with prompt size of {session_tokens.Length} tokens");
+        _logger?.LogInformation("[LLamaExecutor] Loaded a session with prompt size of {0} tokens", session_tokens.Length);
     }
     else
     {
@@ -190,11 +191,11 @@ protected virtual void HandleRunOutOfContext(int tokensToKeep)
     // if we run out of context:
     // - take the tokensToKeep first tokens from the original prompt (via n_past)
     // - take half of the last (n_ctx - tokensToKeep) tokens and recompute the logits in batches
-    int n_left = _pastTokensCount - tokensToKeep;
-    int n_discard = n_left / 2;
+    var n_left = _pastTokensCount - tokensToKeep;
+    var n_discard = n_left / 2;

-    NativeApi.llama_kv_self_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard);
-    NativeApi.llama_kv_self_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard);
+    Context.NativeHandle.MemorySequenceRemove(LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard);
+    Context.NativeHandle.MemorySequenceAdd(LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard);

     _pastTokensCount -= n_discard;
     // stop saving session if we run out of context
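
The rewritten HandleRunOutOfContext keeps the first tokensToKeep tokens, evicts the older half of the remainder, and shifts the surviving suffix left. The arithmetic in isolation, as a hypothetical helper (not part of this commit):

// Plans the shift: returns the removal range and the position delta to apply
// to the tokens that survive.
static (int removeStart, int removeEnd, int delta) PlanContextShift(int pastTokens, int tokensToKeep)
{
    var n_left = pastTokens - tokensToKeep; // tokens eligible for eviction
    var n_discard = n_left / 2;             // evict the older half of them
    return (tokensToKeep, tokensToKeep + n_discard, -n_discard);
}

For example, PlanContextShift(4096, 512) gives n_left = 3584 and n_discard = 1792: positions [512, 2304) are removed, positions [2304, 4096) are shifted by -1792, and 2304 tokens remain cached.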

LLama/LLamaReranker.cs

Lines changed: 4 additions & 4 deletions
@@ -114,7 +114,7 @@ public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOn
         batch.Add(tokens[i], i, LLamaSeqId.Zero, true);

     // clear previous kv_cache values
-    Context.NativeHandle.KvCacheClear();
+    Context.NativeHandle.MemoryClear();

     // Check if we should cancel the work, just before doing anything expensive (encode/decode)
     cancellationToken.ThrowIfCancellationRequested();
@@ -144,7 +144,7 @@ public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOn

     var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0];

-    Context.NativeHandle.KvCacheClear();
+    Context.NativeHandle.MemoryClear();

     return (normalize ? Sigmoid(score) : score, tokens.Length);
 }
@@ -155,7 +155,7 @@ private async Task<IReadOnlyList<float>> CalcRelevanceScores(LLamaBatch batch, b
     var seqNum = logicCap.Value + 1;
     List<float> scores = new List<float>(seqNum);
     // clear previous kv_cache values
-    Context.NativeHandle.KvCacheClear();
+    Context.NativeHandle.MemoryClear();

     // Check if we should cancel the work, just before doing anything expensive (encode/decode)
     cancellationToken.ThrowIfCancellationRequested();
@@ -189,7 +189,7 @@ private async Task<IReadOnlyList<float>> CalcRelevanceScores(LLamaBatch batch, b
         scores.Add(normalize ? Sigmoid(score) : score);
     }

-    Context.NativeHandle.KvCacheClear();
+    Context.NativeHandle.MemoryClear();

     return scores;
 }

LLama/LLamaStatelessExecutor.cs

Lines changed: 2 additions & 2 deletions
@@ -158,8 +158,8 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
     var n_left = n_past - tokensKeep;
     var n_discard = n_left / 2;

-    NativeApi.llama_kv_self_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard);
-    NativeApi.llama_kv_self_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard);
+    Context.NativeHandle.MemorySequenceRemove(LLamaSeqId.Zero, tokensKeep, tokensKeep + n_discard);
+    Context.NativeHandle.MemorySequenceAdd(LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard);

     n_past -= n_discard;
 }

LLama/Native/DecodeResult.cs

Lines changed: 3 additions & 3 deletions
@@ -1,14 +1,14 @@
-namespace LLama.Native;
+namespace LLama.Native;

 /// <summary>
 /// Return codes from llama_decode
 /// </summary>
 public enum DecodeResult
 {
     /// <summary>
-    /// An unspecified error
+    /// Input batch was invalid
     /// </summary>
-    Error = -1,
+    InvalidInputBatch = -1,

     /// <summary>
     /// Ok.
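
With this rename, -1 from llama_decode now specifically signals a rejected input batch rather than an unspecified error. A hedged sketch of branching on it (assuming the SafeLLamaContextHandle.Decode(LLamaBatch) wrapper returns DecodeResult, as in current LLamaSharp):

using System;
using LLama.Native;

static void CheckedDecode(SafeLLamaContextHandle ctx, LLamaBatch batch)
{
    switch (ctx.Decode(batch))
    {
        case DecodeResult.Ok:
            break;
        case DecodeResult.InvalidInputBatch:
            // Previously DecodeResult.Error; -1 now means the batch itself
            // was malformed (e.g. bad positions or sequence ids).
            throw new ArgumentException("llama_decode rejected the batch", nameof(batch));
        case var other:
            throw new InvalidOperationException($"llama_decode failed: {other}");
    }
}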
