
Commit 8ff2e89

Merge pull request #1225 from martindevans/update_july_2025: Update july 2025
2 parents: 1ebe906 + 4341e83

36 files changed: +506 -828 lines

LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs

Lines changed: 1 addition & 1 deletion

@@ -119,7 +119,7 @@ public void GlobalCleanup()
     {
         if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
         {
-            Executor.Context.NativeHandle.KvCacheClear();
+            Executor.Context.NativeHandle.MemoryClear();
         }
     }
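
Note: this commit tracks llama.cpp's rename of the KV-cache functions to a new "memory" API. A minimal usage sketch of the renamed call (the setup lines and model path are illustrative placeholders, not part of this commit; only MemoryClear comes from the diff above):

using LLama;
using LLama.Common;

// Illustrative setup: load any GGUF model (path is a placeholder).
var parameters = new ModelParams("model.gguf");
using var weights = LLamaWeights.LoadFromFile(parameters);
using var context = weights.CreateContext(parameters);

// Formerly KvCacheClear(): wipe every cached token in all sequences,
// resetting the context's state before it is reused.
context.NativeHandle.MemoryClear();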

LLama.Examples/Examples/BatchedExecutorSimple.cs

Lines changed: 2 additions & 2 deletions

@@ -97,8 +97,8 @@ await AnsiConsole.Live(table).StartAsync(async ctx =>
 
     // A generic error, this is fatal and the batch can no longer be used. This should never occur and generally indicates
     // a bug in LLamaSharp, llama.cpp or a hardware error.
-    if (decodeResult == DecodeResult.Error)
-        throw new Exception("Unknown error occurred while inferring.");
+    if (decodeResult != DecodeResult.Ok)
+        throw new Exception($"Error occurred while inferring: {decodeResult}");
 
     // After inference all of the conversations must be sampled before running inference again.
     foreach (var conversationData in conversations)
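
Checking against DecodeResult.Ok makes the guard catch every failure value the native layer can return, not just the generic Error. A minimal sketch of the broadened pattern (the executor and its Infer call are assumptions based on the surrounding example, not shown in this hunk):

// Run one batched inference step and fail loudly on anything non-Ok.
var decodeResult = await executor.Infer();
if (decodeResult != DecodeResult.Ok)
    throw new Exception($"Error occurred while inferring: {decodeResult}");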

LLama.Examples/Examples/LlavaInteractiveModeExecute.cs

Lines changed: 1 addition & 1 deletion

@@ -79,7 +79,7 @@ public static async Task Run()
     // When the prompt contains images we clear KV_CACHE to restart conversation
     // See:
     // https://github.com/ggerganov/llama.cpp/discussions/3620
-    ex.Context.NativeHandle.KvCacheRemove( LLamaSeqId.Zero, -1, -1 );
+    ex.Context.NativeHandle.MemorySequenceRemove( LLamaSeqId.Zero, -1, -1 );
 
     int index = 0;
     foreach (var path in imagePathsWithCurlyBraces)

LLama.Unittest/LLavaWeightsTests.cs

Lines changed: 0 additions & 54 deletions
This file was deleted.

LLama.Web/Common/ModelOptions.cs

Lines changed: 9 additions & 0 deletions

@@ -110,6 +110,15 @@ public class ModelOptions
     /// <inheritdoc />
     public bool VocabOnly { get; set; }
 
+    /// <inheritdoc />
+    public bool? OpOffload { get; set; }
+
+    /// <inheritdoc />
+    public bool? SwaFull { get; set; }
+
+    /// <inheritdoc />
+    public bool? KVUnified { get; set; }
+
     /// <inheritdoc />
     public float? DefragThreshold { get; set; }

LLama/Abstractions/IContextParams.cs

Lines changed: 22 additions & 2 deletions

@@ -109,8 +109,7 @@ public interface IContextParams
     bool FlashAttention { get; }
 
     /// <summary>
-    /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt; 0 to disable (default)
-    /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to <see langword="null"/> or &lt; 0 to disable (default)
+    /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt;= 0 to disable (default)
     /// </summary>
     float? DefragThreshold { get; }
 
@@ -123,4 +122,25 @@ public interface IContextParams
     /// Attention type to use for embeddings
     /// </summary>
     LLamaAttentionType AttentionType { get; }
+
+    /// <summary>
+    /// Offload host tensor operations to device
+    /// </summary>
+    bool? OpOffload { get; }
+
+    /// <summary>
+    /// use a unified buffer across the input sequences when computing the attention.
+    /// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+    /// <br />
+    /// ref: <a href="https://github.com/ggml-org/llama.cpp/pull/14363">https://github.com/ggml-org/llama.cpp/pull/14363</a>
+    /// </summary>
+    bool? KVUnified { get; }
+
+    /// <summary>
+    /// Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+    /// </summary>
+    /// <remarks>Setting to false when n_seq_max > 1 can cause bad performance in some cases
+    /// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+    /// </remarks>
+    bool? SwaFull { get; }
 }
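
ModelParams and ModelOptions in this commit surface the same three nullable flags, so they can be set per context. A hedged sketch of opting in (property names from these diffs; the model path is a placeholder and the values are examples, not recommendations):

var parameters = new ModelParams("model.gguf")
{
    OpOffload = true,  // offload host tensor operations to the device
    KVUnified = false, // per-sequence KV buffers; worth trying when n_seq_max > 1
    SwaFull = true,    // full-size sliding-window-attention cache
};
// Any flag left null falls back to llama.cpp's built-in default.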

LLama/Batched/Conversation.cs

Lines changed: 6 additions & 6 deletions

@@ -84,7 +84,7 @@ public void Dispose()
         _disposed = true;
 
         // Remove this conversation from the KV cache
-        Executor.Context.NativeHandle.KvCacheRemove(ConversationId, -1, -1);
+        Executor.Context.NativeHandle.MemorySequenceRemove(ConversationId, -1, -1);
 
         // Prevent finalizer from running
         GC.SuppressFinalize(this);
@@ -129,7 +129,7 @@ public Conversation Fork()
         _forked = true;
 
         // Assign tokens to the new sequence
-        Executor.Context.NativeHandle.KvCacheSequenceCopy(ConversationId, c.ConversationId, 0, _end);
+        Executor.Context.NativeHandle.MemorySequenceCopy(ConversationId, c.ConversationId, 0, _end);
 
         return c;
     }
@@ -406,7 +406,7 @@ internal KvAccessor(Conversation conversation)
         /// <param name="end">End position (exclusive)</param>
         public void Remove(LLamaPos start, LLamaPos end)
         {
-            _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
+            _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
         }
 
         /// <summary>
@@ -420,7 +420,7 @@ public void Remove(LLamaPos start, int count)
                 return;
 
             var end = start.Value + count;
-            _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
+            _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
         }
         #endregion
 
@@ -435,7 +435,7 @@ public void Remove(LLamaPos start, int count)
         /// <param name="delta">Amount to add on to each token position</param>
         public void Add(LLamaPos start, LLamaPos end, int delta)
         {
-            _conversation.Executor.Context.NativeHandle.KvCacheSequenceAdd(_conversation.ConversationId, start, end, delta);
+            _conversation.Executor.Context.NativeHandle.MemorySequenceAdd(_conversation.ConversationId, start, end, delta);
         }
         #endregion
 
@@ -452,7 +452,7 @@ public void Divide(LLamaPos start, LLamaPos end, int divisor)
             if (divisor <= 0)
                 throw new ArgumentOutOfRangeException(nameof(divisor));
 
-            _conversation.Executor.Context.NativeHandle.KvCacheSequenceDivide(_conversation.ConversationId, start, end, divisor);
+            _conversation.Executor.Context.NativeHandle.MemorySequenceDivide(_conversation.ConversationId, start, end, divisor);
         }
         #endregion
     }
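
The KvAccessor methods above are the hooks for context shifting. A hedged sketch of discarding the oldest tokens of a conversation and shifting the remainder down (the Remove and Add signatures come from this diff; Conversation.Modify and its callback shape are an assumption about the surrounding Batched API):

conversation.Modify((end, kv) =>
{
    const int discard = 16;

    // Forget the oldest positions (MemorySequenceRemove under the hood).
    kv.Remove(start: 0, count: discard);

    // Shift the remaining tokens down so positions stay contiguous
    // (MemorySequenceAdd with a negative delta).
    kv.Add(start: discard, end: end, delta: -discard);

    // Return the new end position of the sequence.
    return end.Value - discard;
});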

LLama/ChatSession.cs

Lines changed: 1 addition & 1 deletion

@@ -199,7 +199,7 @@ public void LoadSession(SessionState state, bool loadTransforms = true)
         }
         if (state.ContextState is null)
         {
-            Executor.Context.NativeHandle.KvCacheClear();
+            Executor.Context.NativeHandle.MemoryClear();
         }
         else
         {

LLama/Common/ModelParams.cs

Lines changed: 9 additions & 0 deletions

@@ -112,6 +112,15 @@ public record ModelParams
     /// <inheritdoc />
     public bool VocabOnly { get; set; }
 
+    /// <inheritdoc />
+    public bool? OpOffload { get; set; }
+
+    /// <inheritdoc />
+    public bool? SwaFull { get; set; }
+
+    /// <inheritdoc />
+    public bool? KVUnified { get; set; }
+
     /// <summary>
     /// `Encoding` cannot be directly JSON serialized, instead store the name as a string which can
     /// </summary>

LLama/Extensions/IContextParamsExtensions.cs

Lines changed: 7 additions & 0 deletions

@@ -55,6 +55,13 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
 
        result.n_threads = Threads(@params.Threads);
        result.n_threads_batch = Threads(@params.BatchThreads);
+
+       if (@params.SwaFull.HasValue)
+           result.swa_full = @params.SwaFull.Value;
+       if (@params.OpOffload.HasValue)
+           result.op_offload = @params.OpOffload.Value;
+       if (@params.KVUnified.HasValue)
+           result.kv_unified = @params.KVUnified.Value;
    }
 
    private static int Threads(int? value)
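
The HasValue guards ensure a null property never overwrites the defaults the native side already placed in the struct. A minimal sketch of the effect (names from the diffs in this commit; the model path is a placeholder):

var parameters = new ModelParams("model.gguf");
parameters.ToLlamaContextParams(out var native);
// OpOffload, SwaFull and KVUnified were left null, so native.op_offload,
// native.swa_full and native.kv_unified keep llama.cpp's defaults.
// Setting e.g. parameters.OpOffload = false would override op_offload.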
