diff --git a/LLama.Examples/Program.cs b/LLama.Examples/Program.cs
index 63114120d..f8ef7d5aa 100644
--- a/LLama.Examples/Program.cs
+++ b/LLama.Examples/Program.cs
@@ -1,6 +1,5 @@
-using LLama.Native;
+using LLama.Native;
using Spectre.Console;
-using System.Runtime.InteropServices;
AnsiConsole.MarkupLineInterpolated(
$"""
@@ -18,7 +17,7 @@ __ __ ____ __
""");
// Configure logging. Change this to `true` to see log messages from llama.cpp
-var showLLamaCppLogs = false;
+var showLLamaCppLogs = true;
NativeLibraryConfig
.All
.WithLogCallback((level, message) =>
@@ -31,8 +30,7 @@ __ __ ____ __
NativeLibraryConfig
.All
.WithCuda()
- //.WithAutoDownload() // An experimental feature
- .DryRun(out var loadedllamaLibrary, out var loadedLLavaLibrary);
+ .WithVulkan();
// Calling this method forces loading to occur now.
NativeApi.llama_empty_call();
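
For context, a minimal sketch of how an application might drive the reworked backend selection shown above; the console logger body is illustrative, and the call order (configure first, then force loading) follows the example program:

    using System;
    using LLama.Native;

    // Configure native library selection before any other LLamaSharp call.
    NativeLibraryConfig
        .All
        .WithCuda()     // prefer CUDA when a compatible device is present
        .WithVulkan()   // otherwise try the Vulkan backend
        .WithLogCallback((level, message) => Console.Write($"[{level}]: {message}"));

    // Force the native libraries to load now rather than on first use.
    NativeApi.llama_empty_call();
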
diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
index 7f9ae1e4d..6efd44f7b 100644
--- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
+++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -31,11 +31,9 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)
var @params = new ModelParams(config.ModelPath)
{
- ContextSize = config.ContextSize ?? 2048,
+ ContextSize = config.ContextSize,
GpuLayerCount = config.GpuLayerCount ?? 20,
Embeddings = true,
- MainGpu = config.MainGpu,
- SplitMode = config.SplitMode,
PoolingType = LLamaPoolingType.Mean,
};
_weights = LLamaWeights.LoadFromFile(@params);
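
With this change the Kernel Memory embedding generator no longer substitutes a 2048-token default, so a null ContextSize flows through to ModelParams (which typically means the model's own context length is used). A hedged usage sketch; the model path and the LLamaSharpConfig constructor shape are assumptions:

    using LLamaSharp.KernelMemory;

    var config = new LLamaSharpConfig(@"path/to/model.gguf")   // hypothetical path
    {
        // Leave ContextSize null to accept the model's default context length.
        GpuLayerCount = 20,
    };

    var embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(config);
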
diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
index adfc89317..3fc96db9a 100644
--- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs
+++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -33,8 +33,6 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config)
{
ContextSize = config.ContextSize ?? 2048,
GpuLayerCount = config.GpuLayerCount ?? 20,
- MainGpu = config.MainGpu,
- SplitMode = config.SplitMode
};
_weights = LLamaWeights.LoadFromFile(parameters);
_context = _weights.CreateContext(parameters);
diff --git a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
index 91161b72c..5c7b4213d 100644
--- a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
+++ b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
@@ -1,21 +1,15 @@
-using LLama.Common;
using LLamaSharp.KernelMemory;
-using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
using Xunit.Abstractions;
namespace LLama.Unittest.KernelMemory
{
- public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable
+ public class LLamaSharpTextEmbeddingGeneratorTests
+ : ITextTokenizerTests, IDisposable
{
private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator;
- public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+ public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper)
+ : base(testOutputHelper)
{
_embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig);
diff --git a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
index 02001f8cf..d21d7f959 100644
--- a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
+++ b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
@@ -1,25 +1,15 @@
-using LLama.Common;
using LLamaSharp.KernelMemory;
-using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Diagnostics;
-using System.Linq;
-using System.Reflection.Emit;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
using Xunit.Abstractions;
-using Xunit.Sdk;
-using static System.Net.Mime.MediaTypeNames;
namespace LLama.Unittest.KernelMemory
{
- public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable
+ public class LlamaSharpTextGeneratorTests
+ : ITextTokenizerTests, IDisposable
{
private readonly LlamaSharpTextGenerator _textGenerator;
- public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+ public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper)
+ : base(testOutputHelper)
{
_textGenerator = new LlamaSharpTextGenerator(_lsConfig);
diff --git a/LLama.Unittest/SamplingTests.cs b/LLama.Unittest/SamplingTests.cs
index f322bc250..bae7e3dea 100644
--- a/LLama.Unittest/SamplingTests.cs
+++ b/LLama.Unittest/SamplingTests.cs
@@ -167,11 +167,7 @@ private static SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandle co
var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());
chain.AddPenalties(
- vocabSize: context.VocabCount,
- eos: context.ModelHandle.Tokens.EOS,
- newline: context.ModelHandle.Tokens.Newline ?? 0,
- penaltyCount: 60, repeat: 1, freq: 0, presence: 0,
- penalizeNewline: false, ignoreEOS: false
+ penaltyCount: 60, repeat: 1, freq: 0, presence: 0
);
if (logit_bias != null) { chain.AddLogitBias(context.VocabCount, logit_bias); }
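
The simplified signature shown standalone: the vocabulary size, EOS/newline tokens and the penalize-newline/ignore-EOS flags are no longer passed. A minimal sketch with illustrative parameter values:

    using LLama.Native;

    // Build a sampler chain with the new 4-parameter penalties API.
    var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());
    chain.AddPenalties(penaltyCount: 60, repeat: 1.1f, freq: 0.0f, presence: 0.0f);
    chain.AddTopK(40);
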
diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs
index 4e002c93f..a67a11a96 100644
--- a/LLama.Web/Common/ModelOptions.cs
+++ b/LLama.Web/Common/ModelOptions.cs
@@ -24,7 +24,7 @@ public class ModelOptions
public int MainGpu { get; set; } = 0;
///
- public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+ public GPUSplitMode? SplitMode { get; set; }
///
public int GpuLayerCount { get; set; } = 20;
@@ -59,6 +59,9 @@ public class ModelOptions
///
public TensorSplitsCollection TensorSplits { get; set; } = new();
+ ///
+ public bool CheckTensors { get; }
+
///
public List MetadataOverrides { get; } = new();
diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs
index 7dc28f671..cbbacafe5 100644
--- a/LLama/Abstractions/IModelParams.cs
+++ b/LLama/Abstractions/IModelParams.cs
@@ -36,7 +36,7 @@ public interface IModelParams
///
/// How to split the model across multiple GPUs
///
- GPUSplitMode SplitMode { get; }
+ GPUSplitMode? SplitMode { get; }
///
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
@@ -68,6 +68,11 @@ public interface IModelParams
///
bool VocabOnly { get; }
+ ///
+ /// Validate model tensor data before loading
+ ///
+ bool CheckTensors { get; }
+
///
/// Override specific metadata items in the model
///
diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
index b276ed73a..7e4b1a967 100644
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -19,7 +19,7 @@ public record ModelParams
public int MainGpu { get; set; } = 0;
///
- public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+ public GPUSplitMode? SplitMode { get; set; }
///
public int GpuLayerCount { get; set; } = 20;
@@ -54,6 +54,9 @@ public record ModelParams
///
public TensorSplitsCollection TensorSplits { get; set; } = new();
+ ///
+ public bool CheckTensors { get; }
+
///
public List MetadataOverrides { get; set; } = new();
diff --git a/LLama/Extensions/IModelParamsExtensions.cs b/LLama/Extensions/IModelParamsExtensions.cs
index 523ec737a..588564e33 100644
--- a/LLama/Extensions/IModelParamsExtensions.cs
+++ b/LLama/Extensions/IModelParamsExtensions.cs
@@ -1,4 +1,4 @@
-using System.IO;
+using System.IO;
using System;
using System.Text;
using LLama.Abstractions;
@@ -31,11 +31,14 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam
result = LLamaModelParams.Default();
result.main_gpu = @params.MainGpu;
- result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
+ if (@params.SplitMode.HasValue)
+ result.split_mode = @params.SplitMode.Value;
+
result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;
result.vocab_only = @params.VocabOnly;
+ result.check_tensors = @params.CheckTensors;
unsafe
{
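
On the consumer side, SplitMode is now optional: when left null the llama.cpp default is kept, and the new CheckTensors flag is forwarded to llama_model_params. A minimal sketch assuming a local GGUF file:

    using LLama;
    using LLama.Common;
    using LLama.Native;

    var parameters = new ModelParams(@"path/to/model.gguf")   // hypothetical path
    {
        GpuLayerCount = 20,
        MainGpu = 0,
        // Only set SplitMode when you want to override the llama.cpp default.
        SplitMode = GPUSplitMode.Layer,
    };

    using var weights = LLamaWeights.LoadFromFile(parameters);
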
diff --git a/LLama/Extensions/LLamaExecutorExtensions.cs b/LLama/Extensions/LLamaExecutorExtensions.cs
index 19c8d33df..e38ccf98d 100644
--- a/LLama/Extensions/LLamaExecutorExtensions.cs
+++ b/LLama/Extensions/LLamaExecutorExtensions.cs
@@ -147,7 +147,7 @@ private string CreatePrompt(IList messages)
PreventEOS = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PreventEOS), out bool eos) is true ? eos : s_defaultPipeline.PreventEOS,
PenalizeNewline = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenalizeNewline), out bool pnl) is true ? pnl : s_defaultPipeline.PenalizeNewline,
RepeatPenalty = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenalty), out float rp) is true ? rp : s_defaultPipeline.RepeatPenalty,
- RepeatPenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.RepeatPenaltyCount,
+ PenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.PenaltyCount,
Grammar = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Grammar), out Grammar? g) is true ? g : s_defaultPipeline.Grammar,
MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep,
MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? mp : s_defaultPipeline.MinP,
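
Callers that configure sampling through Microsoft.Extensions.AI chat options should use the new PenaltyCount key; the old RepeatPenaltyCount name is no longer read. A sketch of passing the value through AdditionalProperties, assuming the Microsoft.Extensions.AI ChatOptions type:

    using LLama.Sampling;
    using Microsoft.Extensions.AI;

    var options = new ChatOptions
    {
        AdditionalProperties = new AdditionalPropertiesDictionary
        {
            // Matched by name against DefaultSamplingPipeline properties.
            [nameof(DefaultSamplingPipeline.PenaltyCount)] = 64,
            [nameof(DefaultSamplingPipeline.RepeatPenalty)] = 1.1f,
        },
    };
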
diff --git a/LLama/LLamaQuantizer.cs b/LLama/LLamaQuantizer.cs
index 23f0f8b4e..9e90b732e 100644
--- a/LLama/LLamaQuantizer.cs
+++ b/LLama/LLamaQuantizer.cs
@@ -106,9 +106,6 @@ private static bool ValidateFtype(LLamaFtype ftype)
case LLamaFtype.MOSTLY_IQ3_S:
case LLamaFtype.MOSTLY_IQ3_M:
- case LLamaFtype.MOSTLY_Q4_0_4_4:
- case LLamaFtype.MOSTLY_Q4_0_4_8:
- case LLamaFtype.MOSTLY_Q4_0_8_8:
return true;
case LLamaFtype.GUESSED:
diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets
index 6466a1204..76292aaf5 100644
--- a/LLama/LLamaSharp.Runtime.targets
+++ b/LLama/LLamaSharp.Runtime.targets
@@ -4,14 +4,24 @@
-
+
PreserveNewest
runtimes/win-x64/native/noavx/llama.dll
-
+
PreserveNewest
runtimes/win-x64/native/noavx/ggml.dll
+
+ PreserveNewest
+ runtimes/win-x64/native/noavx/ggml-base.dll
+
+
+ PreserveNewest
+ runtimes/win-x64/native/noavx/ggml-cpu.dll
+
+
+
PreserveNewest
runtimes/win-x64/native/avx/llama.dll
@@ -20,55 +30,124 @@
PreserveNewest
runtimes/win-x64/native/avx/ggml.dll
+
+ PreserveNewest
+ runtimes/win-x64/native/avx/ggml-base.dll
+
+
+ PreserveNewest
+ runtimes/win-x64/native/avx/ggml-cpu.dll
+
+
+
PreserveNewest
runtimes/win-x64/native/avx2/llama.dll
+
+ PreserveNewest
+ runtimes/win-x64/native/avx2/ggml-base.dll
+
PreserveNewest
runtimes/win-x64/native/avx2/ggml.dll
+
+ PreserveNewest
+ runtimes/win-x64/native/avx2/ggml-cpu.dll
+
+
+
PreserveNewest
runtimes/win-x64/native/avx512/llama.dll
+
+ PreserveNewest
+ runtimes/win-x64/native/avx512/ggml-base.dll
+
PreserveNewest
runtimes/win-x64/native/avx512/ggml.dll
+
+ PreserveNewest
+ runtimes/win-x64/native/avx512/ggml-cpu.dll
+
+
+
PreserveNewest
runtimes/win-x64/native/cuda11/llama.dll
+
+ PreserveNewest
+ runtimes/win-x64/native/cuda11/ggml-base.dll
+
PreserveNewest
runtimes/win-x64/native/cuda11/ggml.dll
+
+ PreserveNewest
+ runtimes/win-x64/native/cuda11/ggml-cuda.dll
+
+
+
PreserveNewest
runtimes/win-x64/native/cuda12/llama.dll
+
+ PreserveNewest
+ runtimes/win-x64/native/cuda12/ggml-base.dll
+
PreserveNewest
runtimes/win-x64/native/cuda12/ggml.dll
+
+ PreserveNewest
+ runtimes/win-x64/native/cuda12/ggml-cuda.dll
+
+
+
PreserveNewest
runtimes/win-x64/native/vulkan/llama.dll
+
+ PreserveNewest
+ runtimes/win-x64/native/vulkan/ggml-base.dll
+
PreserveNewest
runtimes/win-x64/native/vulkan/ggml.dll
+
+ PreserveNewest
+ runtimes/win-x64/native/vulkan/ggml-vulkan.dll
+
+
-
+
PreserveNewest
runtimes/linux-x64/native/noavx/libllama.so
-
+
PreserveNewest
runtimes/linux-x64/native/noavx/libggml.so
+
+ PreserveNewest
+ runtimes/linux-x64/native/noavx/libggml-base.so
+
+
+ PreserveNewest
+ runtimes/linux-x64/native/noavx/libggml-cpu.so
+
+
+
PreserveNewest
runtimes/linux-x64/native/avx/libllama.so
@@ -77,6 +156,17 @@
PreserveNewest
runtimes/linux-x64/native/avx/libggml.so
+
+ PreserveNewest
+ runtimes/linux-x64/native/avx/libggml-base.so
+
+
+ PreserveNewest
+ runtimes/linux-x64/native/avx/libggml-cpu.so
+
+
+
+
PreserveNewest
runtimes/linux-x64/native/avx2/libllama.so
@@ -85,6 +175,15 @@
PreserveNewest
runtimes/linux-x64/native/avx2/libggml.so
+
+ PreserveNewest
+ runtimes/linux-x64/native/avx2/libggml-base.so
+
+
+ PreserveNewest
+ runtimes/linux-x64/native/avx2/libggml-cpu.so
+
+
PreserveNewest
runtimes/linux-x64/native/avx512/libllama.so
@@ -93,6 +192,15 @@
PreserveNewest
runtimes/linux-x64/native/avx512/libggml.so
+
+ PreserveNewest
+ runtimes/linux-x64/native/avx512/libggml-base.so
+
+
+ PreserveNewest
+ runtimes/linux-x64/native/avx512/libggml-cpu.so
+
+
PreserveNewest
runtimes/linux-x64/native/cuda11/libllama.so
@@ -101,6 +209,16 @@
PreserveNewest
runtimes/linux-x64/native/cuda11/libggml.so
+
+ PreserveNewest
+ runtimes/linux-x64/native/cuda11/libggml-base.so
+
+
+ PreserveNewest
+ runtimes/linux-x64/native/cuda11/libggml-cuda.so
+
+
+
PreserveNewest
runtimes/linux-x64/native/cuda12/libllama.so
@@ -109,6 +227,16 @@
PreserveNewest
runtimes/linux-x64/native/cuda12/libggml.so
+
+ PreserveNewest
+ runtimes/linux-x64/native/cuda12/libggml-base.so
+
+
+ PreserveNewest
+ runtimes/linux-x64/native/cuda12/libggml-cuda.so
+
+
+
PreserveNewest
runtimes/linux-x64/native/vulkan/libllama.so
@@ -117,7 +245,32 @@
PreserveNewest
runtimes/linux-x64/native/vulkan/libggml.so
+
+ PreserveNewest
+ runtimes/linux-x64/native/vulkan/libggml-base.so
+
+
+ PreserveNewest
+ runtimes/linux-x64/native/vulkan/libggml-vulkan.so
+
+
+
+ PreserveNewest
+ runtimes/osx-arm64/native/libggml-base.dylib
+
+
+ PreserveNewest
+ runtimes/osx-arm64/native/libggml-cpu.dylib
+
+
+ PreserveNewest
+ runtimes/osx-arm64/native/libggml-metal.dylib
+
+
+ PreserveNewest
+ runtimes/osx-arm64/native/libggml-blas.dylib
+
PreserveNewest
runtimes/osx-arm64/native/libggml.dylib
@@ -134,7 +287,19 @@
PreserveNewest
runtimes/osx-arm64/native/ggml-metal.metal
-
+
+
+ PreserveNewest
+ runtimes/osx-x64/native/libggml-base.dylib
+
+
+ PreserveNewest
+ runtimes/osx-x64/native/libggml-cpu.dylib
+
+
+ PreserveNewest
+ runtimes/osx-x64/native/libggml-blas.dylib
+
PreserveNewest
runtimes/osx-x64/native/libggml.dylib
@@ -148,6 +313,18 @@
runtimes/osx-x64/native/libllava_shared.dylib
+
+ PreserveNewest
+ runtimes/osx-x64/native/rosetta2/libggml-base.dylib
+
+
+ PreserveNewest
+ runtimes/osx-x64/native/rosetta2/libggml-cpu.dylib
+
+
+ PreserveNewest
+ runtimes/osx-x64/native/rosetta2/libggml-blas.dylib
+
PreserveNewest
runtimes/osx-x64/native/rosetta2/libggml.dylib
@@ -161,7 +338,7 @@
runtimes/osx-x64/native/rosetta2/libllava_shared.dylib
-
+
PreserveNewest
runtimes/win-x64/native/noavx/llava_shared.dll
@@ -190,7 +367,7 @@
runtimes/win-x64/native/vulkan/llava_shared.dll
-
+
PreserveNewest
runtimes/linux-x64/native/noavx/libllava_shared.so
diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj
index 735fa81a5..b2d81711e 100644
--- a/LLama/LLamaSharp.csproj
+++ b/LLama/LLamaSharp.csproj
@@ -56,7 +56,7 @@
- 958367bf530d943a90
+ 0827b2c1da-v6
diff --git a/LLama/LLamaStatelessExecutor.cs b/LLama/LLamaStatelessExecutor.cs
index a4b9920dc..393bebc3f 100644
--- a/LLama/LLamaStatelessExecutor.cs
+++ b/LLama/LLamaStatelessExecutor.cs
@@ -23,14 +23,14 @@ public class StatelessExecutor
private readonly ILogger? _logger;
private readonly LLamaBatch _batch;
- // LLava Section
+ ///
public bool IsMultiModal => false;
///
- public LLavaWeights? ClipModel { get; }
+ public LLavaWeights? ClipModel => default;
///
- public List<byte[]> Images { get; set; }
+ public List<byte[]> Images { get; }
///
/// The context used by the executor when running the inference.
@@ -68,7 +68,7 @@ public async IAsyncEnumerable InferAsync(string prompt, IInferenceParams
Context = context;
// Reset the sampling pipeline (if there is one)
- inferenceParams?.SamplingPipeline?.Reset();
+ inferenceParams?.SamplingPipeline.Reset();
// Sanity check inference params
inferenceParams ??= new InferenceParams();
@@ -134,8 +134,8 @@ public async IAsyncEnumerable InferAsync(string prompt, IInferenceParams
var n_left = n_past - tokensKeep;
var n_discard = n_left / 2;
- NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, tokensKeep , tokensKeep + n_discard);
- NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, tokensKeep + n_discard, n_past, -n_discard);
+ NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard);
+ NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard);
n_past -= n_discard;
}
diff --git a/LLama/Native/GPUSplitMode.cs b/LLama/Native/GPUSplitMode.cs
index 54fa095c1..27ee7ae49 100644
--- a/LLama/Native/GPUSplitMode.cs
+++ b/LLama/Native/GPUSplitMode.cs
@@ -17,7 +17,7 @@ public enum GPUSplitMode
Layer = 1,
///
- /// split rows across GPUs
+ /// split layers and KV across GPUs, use tensor parallelism if supported
///
Row = 2,
}
\ No newline at end of file
diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs
index 6970a4728..705f8032e 100644
--- a/LLama/Native/LLamaFtype.cs
+++ b/LLama/Native/LLamaFtype.cs
@@ -177,20 +177,20 @@ public enum LLamaFtype
///
MOSTLY_BF16 = 32,
- ///
- /// except 1d tensors
- ///
- MOSTLY_Q4_0_4_4 = 33,
+ /////
+ ///// except 1d tensors (no longer supported by llama.cpp)
+ /////
+ //MOSTLY_Q4_0_4_4 = 33,
- ///
- /// except 1d tensors
- ///
- MOSTLY_Q4_0_4_8 = 34,
+ /////
+ ///// except 1d tensors (no longer supported by llama.cpp)
+ /////
+ //MOSTLY_Q4_0_4_8 = 34,
- ///
- /// except 1d tensors
- ///
- MOSTLY_Q4_0_8_8 = 35,
+ /////
+ ///// except 1d tensors (no longer supported by llama.cpp)
+ /////
+ //MOSTLY_Q4_0_8_8 = 35,
///
/// except 1d tensors
diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs
index c0437d9db..e16e3263e 100644
--- a/LLama/Native/LLamaModelParams.cs
+++ b/LLama/Native/LLamaModelParams.cs
@@ -8,6 +8,12 @@ namespace LLama.Native
[StructLayout(LayoutKind.Sequential)]
public unsafe struct LLamaModelParams
{
+ ///
+ /// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
+ /// todo: add support for llama_model_params.devices
+ ///
+ private IntPtr devices;
+
///
/// // number of layers to store in VRAM
///
@@ -19,19 +25,19 @@ public unsafe struct LLamaModelParams
public GPUSplitMode split_mode;
///
- /// the GPU that is used for scratch and small tensors
+ /// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
///
public int main_gpu;
///
/// how to split layers across multiple GPUs (size: )
///
- public float* tensor_split;
-
- ///
- /// comma separated list of RPC servers to use for offloading
+ public float* tensor_split;
+
+ ///
+ /// comma separated list of RPC servers to use for offloading
///
- public byte* rpc_servers;
+ public byte* rpc_servers;
///
/// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
diff --git a/LLama/Native/LLamaRopeType.cs b/LLama/Native/LLamaRopeType.cs
index ebad9e77b..3f1188112 100644
--- a/LLama/Native/LLamaRopeType.cs
+++ b/LLama/Native/LLamaRopeType.cs
@@ -9,4 +9,6 @@ public enum LLamaRopeType
None = -1,
Norm = 0,
NEOX = 2,//GGML_ROPE_TYPE_NEOX,
+ //todo:LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
+ //todo:LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
}
\ No newline at end of file
diff --git a/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs b/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs
index 6f5ad35fe..36ab0c0c8 100644
--- a/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs
+++ b/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs
@@ -10,8 +10,6 @@ public class DefaultNativeLibrarySelectingPolicy: INativeLibrarySelectingPolicy
///
public IEnumerable Apply(NativeLibraryConfig.Description description, SystemInfo systemInfo, NativeLogConfig.LLamaLogCallback? logCallback)
{
- List results = new();
-
// Show the configuration we're working with
Log(description.ToString(), LLamaLogLevel.Info, logCallback);
@@ -24,12 +22,12 @@ public IEnumerable Apply(NativeLibraryConfig.Description descrip
{
if (description.UseCuda)
{
- yield return new NativeLibraryWithCuda(systemInfo.CudaMajorVersion, description.Library, description.SkipCheck);
+ yield return new NativeLibraryWithCuda(systemInfo.CudaMajorVersion, description.Library, description.AvxLevel, description.SkipCheck);
}
if (description.UseVulkan)
{
- yield return new NativeLibraryWithVulkan(systemInfo.VulkanVersion, description.Library, description.SkipCheck);
+ yield return new NativeLibraryWithVulkan(systemInfo.VulkanVersion, description.Library, description.AvxLevel, description.SkipCheck);
}
if((!description.UseCuda || !description.UseVulkan) || description.AllowFallback)
@@ -56,7 +54,7 @@ public IEnumerable Apply(NativeLibraryConfig.Description descrip
if(systemInfo.OSPlatform == OSPlatform.OSX || description.AllowFallback)
{
- yield return new NativeLibraryWithMacOrFallback(description.Library, description.SkipCheck);
+ yield return new NativeLibraryWithMacOrFallback(description.Library);
}
}
}
diff --git a/LLama/Native/Load/NativeLibraryConfig.cs b/LLama/Native/Load/NativeLibraryConfig.cs
index 02e47b695..2bfa0554b 100644
--- a/LLama/Native/Load/NativeLibraryConfig.cs
+++ b/LLama/Native/Load/NativeLibraryConfig.cs
@@ -178,7 +178,7 @@ internal Description CheckAndGatherDescription()
_avxLevel,
_allowFallback,
_skipCheck,
- _searchDirectories.Concat(new[] { "./" }).ToArray()
+ _searchDirectories.Concat([ "./" ]).ToArray()
);
}
@@ -186,7 +186,7 @@ internal static string AvxLevelToString(AvxLevel level)
{
return level switch
{
- AvxLevel.None => string.Empty,
+ AvxLevel.None => "noavx",
AvxLevel.Avx => "avx",
AvxLevel.Avx2 => "avx2",
AvxLevel.Avx512 => "avx512",
diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs
index d5b014ce0..13e68be4d 100644
--- a/LLama/Native/Load/NativeLibraryUtils.cs
+++ b/LLama/Native/Load/NativeLibraryUtils.cs
@@ -45,33 +45,86 @@ internal static IntPtr TryLoadLibrary(NativeLibraryConfig config, out INativeLib
{
Log($"Got relative library path '{path}' from local with {library.Metadata}, trying to load it...", LLamaLogLevel.Debug, config.LogCallback);
- // If we are on Linux / OSX, we need to manually load the GGML dependency
- if (systemInfo.OSPlatform == OSPlatform.Linux || systemInfo.OSPlatform == OSPlatform.OSX)
+ // After the llama.cpp binaries have been split up (PR #10256), we need to load the dependencies manually.
+ // It can't be done automatically on Windows, because the dependencies can be in different folders (for example, ggml-cuda.dll from the cuda12 folder, and ggml-cpu.dll from the avx2 folder)
+ // It can't be done automatically on Linux, because Linux uses the environment variable "LD_LIBRARY_PATH" to automatically load dependencies, and LD_LIBRARY_PATH can only be
+ // set before running LLamaSharp, but we only know which folders to search in when running LLamaSharp (decided by the NativeLibrary).
+
+ // Get the directory of the current runtime
+ string? currentRuntimeDirectory = Path.GetDirectoryName(path);
+
+ // If we failed to get the directory of the current runtime, log it and continue on to the next library
+ if (currentRuntimeDirectory == null)
{
- // Get the directory of the library
- string? libraryDirectory = Path.GetDirectoryName(path);
-
- if (libraryDirectory != null)
+ Log($"Failed to get the directory of the current runtime from path '{path}'", LLamaLogLevel.Error, config.LogCallback);
+ continue;
+ }
+
+ // List which will hold all paths to dependencies to load
+ var dependencyPaths = new List<string>();
+
+ // We should always load ggml-base from the current runtime directory
+ dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-base{ext}"));
+
+ // If the library has metadata, we can check if we need to load additional dependencies
+ if (library.Metadata != null)
+ {
+ if (systemInfo.OSPlatform == OSPlatform.OSX)
{
- // Construct the dependency (libggml) path
- string dependencyPath = Path.Combine(libraryDirectory, $"{libPrefix}ggml{ext}");
-
- // Try to load the dependency
- var dependencyResult = TryLoad(dependencyPath, description.SearchDirectories, config.LogCallback);
+ // On OSX, we should load the CPU backend from the current directory
+
+ // ggml-cpu
+ dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cpu{ext}"));
+
+ // ggml-metal (only supported on osx-arm64)
+ if (os == "osx-arm64")
+ dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-metal{ext}"));
+
+ // ggml-blas (osx-x64, osx-x64-rosetta2 and osx-arm64 all have blas)
+ dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-blas{ext}"));
+ }
+ else
+ {
+ // On other platforms (Windows, Linux), we need to load the CPU backend from the specified AVX level directory
+ // We are using the AVX level supplied by NativeLibraryConfig, which automatically detects the highest supported AVX level for us
+
+ // ggml-cpu
+ dependencyPaths.Add(Path.Combine(
+ $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}",
+ $"{libPrefix}ggml-cpu{ext}"
+ ));
+
+ // ggml-cuda
+ if (library.Metadata.UseCuda)
+ dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}"));
+
+ // ggml-vulkan
+ if (library.Metadata.UseVulkan)
+ dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-vulkan{ext}"));
+ }
+ }
+
+ // And finally, we can add ggml
+ dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml{ext}"));
+
+ // Now, we will loop through our dependencyPaths and try to load them one by one
+ foreach (var dependencyPath in dependencyPaths)
+ {
+ // Try to load the dependency
+ var dependencyResult = TryLoad(dependencyPath, description.SearchDirectories, config.LogCallback);
- // If we successfully loaded the library, log it
- if (dependencyResult != IntPtr.Zero)
- {
- Log($"Successfully loaded dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback);
- }
- else
- {
- Log($"Failed loading dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback);
- }
+ // If we successfully loaded the library, log it
+ if (dependencyResult != IntPtr.Zero)
+ {
+ Log($"Successfully loaded dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback);
+ }
+ else
+ {
+ Log($"Failed loading dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback);
}
}
- // Try to load the library
+ // Try to load the main library
var result = TryLoad(path, description.SearchDirectories, config.LogCallback);
// If we successfully loaded the library, return the handle
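
To make the new load order concrete, here is a small stand-alone sketch (not library code) of the dependency list that would be built for a hypothetical Windows x64 configuration using the cuda12 runtime with an avx2 CPU backend, following the naming used above:

    using System;
    using System.Collections.Generic;
    using System.IO;

    // Assumptions for this sketch: win-x64, CUDA 12 selected, AVX2 detected as the best CPU level.
    const string os = "win-x64";
    const string ext = ".dll";
    const string libPrefix = "";
    var currentRuntimeDirectory = $"runtimes/{os}/native/cuda12";

    var dependencyPaths = new List<string>
    {
        // ggml-base always comes from the selected runtime directory
        Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-base{ext}"),
        // the CPU backend comes from the AVX-level directory, which may differ from the runtime directory
        Path.Combine($"runtimes/{os}/native/avx2", $"{libPrefix}ggml-cpu{ext}"),
        // the GPU backend lives next to the main library
        Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}"),
        // and finally ggml itself, loaded last before llama
        Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml{ext}"),
    };

    foreach (var dependencyPath in dependencyPaths)
        Console.WriteLine(dependencyPath);
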
diff --git a/LLama/Native/Load/NativeLibraryWithCuda.cs b/LLama/Native/Load/NativeLibraryWithCuda.cs
index 12da095dc..36dc4ca81 100644
--- a/LLama/Native/Load/NativeLibraryWithCuda.cs
+++ b/LLama/Native/Load/NativeLibraryWithCuda.cs
@@ -28,11 +28,13 @@ public NativeLibraryMetadata? Metadata
///
///
///
+ ///
///
- public NativeLibraryWithCuda(int majorCudaVersion, NativeLibraryName libraryName, bool skipCheck)
+ public NativeLibraryWithCuda(int majorCudaVersion, NativeLibraryName libraryName, AvxLevel avxLevel, bool skipCheck)
{
_majorCudaVersion = majorCudaVersion;
_libraryName = libraryName;
+ _avxLevel = avxLevel;
_skipCheck = skipCheck;
}
diff --git a/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs b/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs
index 6bcd55049..59754be03 100644
--- a/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs
+++ b/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs
@@ -1,5 +1,5 @@
-using LLama.Abstractions;
using System.Collections.Generic;
+using LLama.Abstractions;
namespace LLama.Native
{
@@ -7,39 +7,30 @@ namespace LLama.Native
///
/// A native library compiled on Mac, or fallbacks from all other libraries in the selection.
///
- public class NativeLibraryWithMacOrFallback : INativeLibrary
+ public class NativeLibraryWithMacOrFallback
+ : INativeLibrary
{
- private NativeLibraryName _libraryName;
- private bool _skipCheck;
+ private readonly NativeLibraryName _libraryName;
///
- public NativeLibraryMetadata? Metadata
- {
- get
- {
- return new NativeLibraryMetadata(_libraryName, false, false, AvxLevel.None);
- }
- }
+ public NativeLibraryMetadata Metadata => new(_libraryName, false, false, AvxLevel.None);
///
///
///
///
- ///
- public NativeLibraryWithMacOrFallback(NativeLibraryName libraryName, bool skipCheck)
+ public NativeLibraryWithMacOrFallback(NativeLibraryName libraryName)
{
_libraryName = libraryName;
- _skipCheck = skipCheck;
}
///
public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaLogCallback? logCallback)
{
- var path = GetPath(systemInfo, AvxLevel.None, logCallback);
- return path is null ?[] : [path];
+ yield return GetPath(systemInfo);
}
- private string? GetPath(SystemInfo systemInfo, AvxLevel avxLevel, NativeLogConfig.LLamaLogCallback? logCallback)
+ private string GetPath(SystemInfo systemInfo)
{
NativeLibraryUtils.GetPlatformPathParts(systemInfo.OSPlatform, out var os, out var fileExtension, out var libPrefix);
string relativePath;
@@ -50,11 +41,7 @@ public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaL
}
else
{
- var avxStr = NativeLibraryConfig.AvxLevelToString(AvxLevel.None);
- if (!string.IsNullOrEmpty(avxStr))
- avxStr += "/";
-
- relativePath = $"runtimes/{os}/native/{avxStr}{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}";
+ relativePath = $"runtimes/{os}/native/{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}";
}
return relativePath;
diff --git a/LLama/Native/Load/NativeLibraryWithVulkan.cs b/LLama/Native/Load/NativeLibraryWithVulkan.cs
index fe4eef01e..c3fe94de3 100644
--- a/LLama/Native/Load/NativeLibraryWithVulkan.cs
+++ b/LLama/Native/Load/NativeLibraryWithVulkan.cs
@@ -28,11 +28,13 @@ public NativeLibraryMetadata? Metadata
///
///
///
+ ///
///
- public NativeLibraryWithVulkan(string? vulkanVersion, NativeLibraryName libraryName, bool skipCheck)
+ public NativeLibraryWithVulkan(string? vulkanVersion, NativeLibraryName libraryName, AvxLevel avxLevel, bool skipCheck)
{
_vulkanVersion = vulkanVersion;
_libraryName = libraryName;
+ _avxLevel = avxLevel;
_skipCheck = skipCheck;
}
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index 041cc0dd5..0d6bc1984 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -172,6 +172,15 @@ public static unsafe int llama_chat_apply_template(SafeLlamaModelHandle? model,
static extern int internal_llama_chat_apply_template(IntPtr model, byte* tmpl, LLamaChatMessage* chat, nuint n_msg, [MarshalAs(UnmanagedType.U1)] bool add_ass, byte* buf, int length);
}
+ ///
+ /// Get list of built-in chat templates
+ ///
+ ///
+ ///
+ ///
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ public static extern unsafe int llama_chat_builtin_templates(char** output, nuint len);
+
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.U1)]
public static extern bool llama_add_bos_token(SafeLlamaModelHandle model);
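
A hedged sketch of calling the new export, assuming it follows the usual llama.cpp convention of returning the total number of built-in templates and filling at most len entries of the output buffer:

    using System;
    using System.Runtime.InteropServices;
    using LLama.Native;

    unsafe
    {
        // First call with an empty buffer to discover how many templates exist.
        var count = NativeApi.llama_chat_builtin_templates(null, 0);

        var names = stackalloc char*[count];
        NativeApi.llama_chat_builtin_templates(names, (nuint)count);

        for (var i = 0; i < count; i++)
            Console.WriteLine(Marshal.PtrToStringUTF8((IntPtr)names[i]));
    }
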
diff --git a/LLama/Native/RopeScalingType.cs b/LLama/Native/RopeScalingType.cs
index 8d4552b80..61ae82942 100644
--- a/LLama/Native/RopeScalingType.cs
+++ b/LLama/Native/RopeScalingType.cs
@@ -1,4 +1,4 @@
-namespace LLama.Native
+namespace LLama.Native
{
///
/// RoPE scaling type.
@@ -26,5 +26,10 @@ public enum RopeScalingType
/// YaRN scaling: https://arxiv.org/pdf/2309.00071.pdf
///
Yarn = 2,
+
+ ///
+ /// LongRope scaling
+ ///
+ LongRope = 3,
}
}
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 450f4998a..19187ded9 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -333,6 +333,14 @@ static SafeLLamaContextHandle()
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern void llama_kv_cache_update(SafeLLamaContextHandle ctx);
+ ///
+ /// Check if the context supports KV cache shifting
+ ///
+ ///
+ ///
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ private static extern bool llama_kv_cache_can_shift(SafeLLamaContextHandle ctx);
+
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern LLamaPerfContextTimings llama_perf_context(SafeLLamaContextHandle ctx);
@@ -566,7 +574,7 @@ public void Synchronize()
/// internally for later use by the decoder cross-attention layers.
///
///
- /// 0 = success, < 0 = error
+ /// 0 = success, < 0 = error (the KV cache state is restored to the state before this call)
public DecodeResult Encode(LLamaBatch batch)
{
if (batch.TokenCount == 0)
@@ -583,7 +591,7 @@ public DecodeResult Encode(LLamaBatch batch)
/// Positive return values does not mean a fatal error, but rather a warning:
/// - 0: success
/// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- /// - < 0: error
+ /// - < 0: error (the KV cache state is restored to the state before this call)
///
public DecodeResult Decode(LLamaBatch batch)
{
@@ -746,6 +754,11 @@ public void ResetTimings()
#endregion
#region KV Cache Management
+ ///
+ /// Check if the context supports KV cache shifting
+ ///
+ public bool KvCacheCanShift => llama_kv_cache_can_shift(this);
+
///
/// Apply KV cache updates (such as K-shifts, defragmentation, etc.)
///
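
A minimal sketch of how the new property could guard the kind of context shift StatelessExecutor performs above; the handle and token counts are caller-supplied placeholders:

    using LLama.Native;

    internal static class KvCacheShiftExample
    {
        public static void ShiftHalfOfCache(SafeLLamaContextHandle context, int tokensKeep, int nPast)
        {
            // Some backends/models cannot shift the KV cache; skip the shift in that case.
            if (!context.KvCacheCanShift)
                return;

            var nLeft = nPast - tokensKeep;
            var nDiscard = nLeft / 2;

            NativeApi.llama_kv_cache_seq_rm(context, LLamaSeqId.Zero, tokensKeep, tokensKeep + nDiscard);
            NativeApi.llama_kv_cache_seq_add(context, LLamaSeqId.Zero, tokensKeep + nDiscard, nPast, -nDiscard);
        }
    }
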
diff --git a/LLama/Native/SafeLLamaSamplerHandle.cs b/LLama/Native/SafeLLamaSamplerHandle.cs
index ef6a7ae30..9099c2f32 100644
--- a/LLama/Native/SafeLLamaSamplerHandle.cs
+++ b/LLama/Native/SafeLLamaSamplerHandle.cs
@@ -1,5 +1,5 @@
using System;
-using System.Runtime.CompilerServices;
+using System.Collections.Generic;
using System.Text;
namespace LLama.Native;
@@ -410,40 +410,94 @@ public void AddGrammar(SafeLlamaModelHandle model, string grammar, string root)
}
///
- /// Create a sampler that applies various repetition penalties
+ /// Create a sampler that applies various repetition penalties.
+ ///
+ /// Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
///
- /// Vocab size
- /// EOS token (if this model has one)
- /// Newline token
/// How many tokens of history to consider when calculating penalties
/// Repetition penalty
/// Frequency penalty
/// Presence penalty
- /// Whether or not to penalize the newline token
- /// Whether or not to ignore EOS token
///
- public void AddPenalties(
- int vocabSize, LLamaToken? eos, LLamaToken newline, int penaltyCount, float repeat, float freq, float presence, bool penalizeNewline, bool ignoreEOS
- )
+ public void AddPenalties(int penaltyCount, float repeat, float freq, float presence)
{
- llama_sampler_chain_add(this, llama_sampler_init_penalties(vocabSize, eos ?? LLamaToken.InvalidToken, newline, penaltyCount, repeat, freq, presence, penalizeNewline, ignoreEOS));
+ llama_sampler_chain_add(
+ this,
+ llama_sampler_init_penalties(
+ penaltyCount,
+ repeat,
+ freq,
+ presence
+ )
+ );
// ReSharper disable InconsistentNaming
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
static extern IntPtr llama_sampler_init_penalties(
- int n_vocab, // llama_n_vocab()
- LLamaToken special_eos_id, // llama_token_eos()
- LLamaToken linefeed_id, // llama_token_nl()
- int penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
- float penalty_repeat, // 1.0 = disabled
- float penalty_freq, // 0.0 = disabled
- float penalty_present, // 0.0 = disabled
- bool penalize_nl, // consider newlines as a repeatable token
- bool ignore_eos // ignore the end-of-sequence token
+ int penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
+ float penalty_repeat, // 1.0 = disabled
+ float penalty_freq, // 0.0 = disabled
+ float penalty_present // 0.0 = disabled
);
// ReSharper restore InconsistentNaming
}
+ ///
+ /// DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677.
+ /// Porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
+ ///
+ /// The model this sampler will be used with
+ ///
+ /// penalty multiplier, 0.0 = disabled
+ /// exponential base
+ /// repeated sequences longer than this are penalized
+ /// how many tokens to scan for repetitions (0 = entire context)
+ public void AddDry(SafeLlamaModelHandle model, ReadOnlySpan<string> sequenceBreakers, float multiplier = 0.8f, float @base = 1.75f, int allowedLength = 2, int penaltyLastN = 0)
+ {
+ unsafe
+ {
+ // Convert strings, fix memory in place, build array of pointers
+ var handles = new List<MemoryHandle>();
+ var breakers = stackalloc byte*[sequenceBreakers.Length];
+ for (var i = 0; i < sequenceBreakers.Length; i++)
+ {
+ var chars = Encoding.Default.GetBytes(sequenceBreakers[i]);
+ handles.Add(chars.AsMemory().Pin());
+
+ breakers[i] = (byte*)handles[i].Pointer;
+ }
+
+ llama_sampler_chain_add(
+ this,
+ llama_sampler_init_dry(
+ model,
+ multiplier,
+ @base,
+ allowedLength,
+ penaltyLastN,
+ breakers,
+ (nuint)sequenceBreakers.Length
+ )
+ );
+
+ // Clear up all the handles fixing the memory in place
+ for (var i = 0; i < handles.Count; i++)
+ handles[i].Dispose();
+ }
+
+ // ReSharper disable InconsistentNaming
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ static extern unsafe IntPtr llama_sampler_init_dry(
+ SafeLlamaModelHandle model,
+ float dry_multiplier,
+ float dry_base,
+ int dry_allowed_length,
+ int dry_penalty_last_n,
+ byte** seq_breakers,
+ nuint num_breakers
+ );
+ }
+
///
/// Create a sampler that applies a bias directly to the logits
///
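
A sketch combining the new DRY sampler with the simplified penalties call; the sequence-breaker strings below are the common defaults from the DRY proposal and are an assumption, not something this diff fixes:

    using LLama.Native;

    internal static class DrySamplerExample
    {
        public static SafeLLamaSamplerChainHandle CreateChain(SafeLlamaModelHandle model)
        {
            var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());

            chain.AddPenalties(penaltyCount: 64, repeat: 1.1f, freq: 0.0f, presence: 0.0f);
            // Sequence breakers are assumed defaults; tune them for your use case.
            chain.AddDry(model, new[] { "\n", ":", "\"", "*" });
            chain.AddTopK(40);

            return chain;
        }
    }
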
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index 718b81809..303ae3352 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -441,9 +441,6 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern LLamaTokenAttr llama_token_get_attr(SafeLlamaModelHandle model, LLamaToken token);
- //[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- //private static extern GGMLTensor llama_get_model_tensor(SafeLlamaModelHandle model, string name);
-
///
/// Returns true if the model contains an encoder that requires llama_encode() call
///
diff --git a/LLama/Sampling/DefaultSamplingPipeline.cs b/LLama/Sampling/DefaultSamplingPipeline.cs
index 3d166f0c6..76404bc95 100644
--- a/LLama/Sampling/DefaultSamplingPipeline.cs
+++ b/LLama/Sampling/DefaultSamplingPipeline.cs
@@ -20,44 +20,6 @@ public sealed class DefaultSamplingPipeline
///
public float RepeatPenalty { get; init; } = 1;
- ///
- /// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create
- /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text
- /// so far, decreasing the model's likelihood to repeat the same line verbatim.
- ///
- [Obsolete($"Use {nameof(FrequencyPenalty)} instead.")]
- public float AlphaFrequency
- {
- get => _frequencyPenalty;
- init
- {
- if (value < -2)
- throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaFrequency)} must be greater than -2");
- if (value > 2)
- throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaFrequency)} must be less than 2");
- _frequencyPenalty = value;
- }
- }
-
- ///
- /// Presence penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create
- /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the
- /// text so far, increasing the model's likelihood to talk about new topics.
- ///
- [Obsolete($"Use {nameof(PresencePenalty)} instead.")]
- public float AlphaPresence
- {
- get => _presencePenalty;
- init
- {
- if (value < -2)
- throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaPresence)} must be greater than -2");
- if (value > 2)
- throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaPresence)} must be less than 2");
- _presencePenalty = value;
- }
- }
-
///
/// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create
/// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text
@@ -97,21 +59,15 @@ public float PresencePenalty
private readonly float _presencePenalty;
///
- /// How many tokens should be considered for penalizing repetition
+ /// How many tokens should be considered for penalties
///
- public int RepeatPenaltyCount { get; init; } = 64;
+ public int PenaltyCount { get; init; } = 64;
///
/// Whether the newline token should be protected from being modified by penalty
///
public bool PenalizeNewline { get; init; } = false;
- ///
- /// Whether the EOS token should be protected from being modified by penalty
- ///
- [Obsolete($"This doesn't do what the name implies. If you're sure you want to use it, use {nameof(PreventEOS)}.")]
- public bool PenalizeEOS { get; init; } = false;
-
///
/// Whether the EOS token should be suppressed. Setting this to 'true' prevents EOS from being sampled
///
@@ -158,7 +114,7 @@ public float PresencePenalty
public uint Seed { get; set; } = GetRandomSeed();
- private static Random RandomSeedGenerator = new();
+ private static readonly Random RandomSeedGenerator = new();
private static uint GetRandomSeed()
{
lock (RandomSeedGenerator)
@@ -196,13 +152,7 @@ protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandl
if (Grammar != null)
chain.AddGrammar(context.ModelHandle, Grammar.Gbnf, Grammar.Root);
- chain.AddPenalties(
- context.VocabCount,
- context.ModelHandle.Tokens.EOS, context.ModelHandle.Tokens.Newline ?? 0,
- RepeatPenaltyCount, RepeatPenalty,
- FrequencyPenalty, PresencePenalty,
- PenalizeNewline, PreventEOS
- );
+ chain.AddPenalties(PenaltyCount, RepeatPenalty, FrequencyPenalty, PresencePenalty);
chain.AddTopK(TopK);
chain.AddTypical(TypicalP, MinKeep);
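
At the higher level, the pipeline property is now PenaltyCount; a minimal configuration sketch with illustrative values:

    using LLama.Sampling;

    var pipeline = new DefaultSamplingPipeline
    {
        RepeatPenalty = 1.1f,
        FrequencyPenalty = 0.1f,
        PresencePenalty = 0.1f,
        PenaltyCount = 64,     // formerly RepeatPenaltyCount
        PreventEOS = false,
    };
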
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
index 0203aad2b..debc99506 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
@@ -18,46 +18,77 @@
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
+
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec
index 7b4f959f4..6abd16ccc 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec
@@ -7,18 +7,27 @@
llama.cpp Authors
false
MIT
+ icon512.png
https://github.com/SciSharp/LLamaSharp
LLamaSharp.Backend.Cuda11.Linux contains the Linux binaries for LLamaSharp with Cuda11 support.
Copyright 2023 The llama.cpp Authors. All rights reserved.
LLamaSharp LLama LLM GPT AI ChatBot SciSharp
+
+
+
+
-
+
+
+
+
+
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec
index 34bc6781d..a412e2e6f 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec
@@ -7,18 +7,27 @@
llama.cpp Authors
false
MIT
+ icon512.png
https://github.com/SciSharp/LLamaSharp
LLamaSharp.Backend.Cuda11.Windows contains the Windows binaries for LLamaSharp with Cuda11 support.
Copyright 2023 The llama.cpp Authors. All rights reserved.
LLamaSharp LLama LLM GPT AI ChatBot SciSharp
+
+
+
+
-
+
+
+
+
+
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec
index 1beeeaafc..5ac473914 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec
@@ -22,6 +22,7 @@
+
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec
index 8834ae413..687283221 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec
@@ -7,19 +7,27 @@
llama.cpp Authors
false
MIT
+ icon512.png
https://github.com/SciSharp/LLamaSharp
LLamaSharp.Backend.Cuda12.Linux contains the Linux binaries for LLamaSharp with Cuda12 support.
Copyright 2023 The llama.cpp Authors. All rights reserved.
LLamaSharp LLama LLM GPT AI ChatBot SciSharp
+
+
+
+
-
-
+
+
+
+
+
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec
index 3d37accec..1fd01edb9 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec
@@ -7,19 +7,27 @@
llama.cpp Authors
false
MIT
+ icon512.png
https://github.com/SciSharp/LLamaSharp
LLamaSharp.Backend.Cuda12.Windows contains the Windows binaries for LLamaSharp with Cuda12 support.
Copyright 2023 The llama.cpp Authors. All rights reserved.
LLamaSharp LLama LLM GPT AI ChatBot SciSharp
+
+
+
+
-
-
+
+
+
+
+
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec
index 725764097..3f2202db4 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec
@@ -7,18 +7,27 @@
llama.cpp Authors
false
MIT
+ icon512.png
https://github.com/SciSharp/LLamaSharp
LLamaSharp.Backend.Vulkan.Linux contains the Linux binaries for LLamaSharp with Vulkan support.
Copyright 2023 The llama.cpp Authors. All rights reserved.
LLamaSharp LLama LLM GPT AI ChatBot SciSharp
+
+
+
+
-
+
+
+
+
+
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec
index 5c5b83f94..3f7487bcd 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec
@@ -7,18 +7,28 @@
llama.cpp Authors
false
MIT
+ icon512.png
https://github.com/SciSharp/LLamaSharp
LLamaSharp.Backend.Vulkan.Windows contains the Windows binaries for LLamaSharp with Vulkan support.
Copyright 2023 The llama.cpp Authors. All rights reserved.
LLamaSharp LLama LLM GPT AI ChatBot SciSharp
+
+
+
+
-
+
+
+
+
+
+
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec
index b4f26ec97..c972ad0fc 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec
@@ -22,6 +22,7 @@
+
-
+
\ No newline at end of file