From ac29c34e9bfc630944b28b81605375dbe5836050 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Tue, 3 Dec 2024 16:42:39 +0000 Subject: [PATCH 01/22] code changes for december update (not working yet) --- LLama.Examples/Program.cs | 6 +- LLama/Extensions/IModelParamsExtensions.cs | 3 +- LLama/LLamaSharp.csproj | 2 +- LLama/Native/GPUSplitMode.cs | 2 +- LLama/Native/LLamaModelParams.cs | 8 ++- LLama/Native/SafeLLamaContextHandle.cs | 17 ++++- LLama/Native/SafeLLamaSamplerHandle.cs | 73 +++++++++++++++++++++- LLama/Sampling/DefaultSamplingPipeline.cs | 2 +- 8 files changed, 101 insertions(+), 12 deletions(-) diff --git a/LLama.Examples/Program.cs b/LLama.Examples/Program.cs index 63114120d..ec4e20b03 100644 --- a/LLama.Examples/Program.cs +++ b/LLama.Examples/Program.cs @@ -1,4 +1,4 @@ -using LLama.Native; +using LLama.Native; using Spectre.Console; using System.Runtime.InteropServices; @@ -30,8 +30,8 @@ __ __ ____ __ // Configure native library to use. This must be done before any other llama.cpp methods are called! NativeLibraryConfig .All - .WithCuda() - //.WithAutoDownload() // An experimental feature + .WithCuda(false) + .WithVulkan(false) .DryRun(out var loadedllamaLibrary, out var loadedLLavaLibrary); // Calling this method forces loading to occur now. diff --git a/LLama/Extensions/IModelParamsExtensions.cs b/LLama/Extensions/IModelParamsExtensions.cs index 523ec737a..d704b2e8c 100644 --- a/LLama/Extensions/IModelParamsExtensions.cs +++ b/LLama/Extensions/IModelParamsExtensions.cs @@ -1,4 +1,4 @@ -using System.IO; +using System.IO; using System; using System.Text; using LLama.Abstractions; @@ -30,6 +30,7 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam result = LLamaModelParams.Default(); + result.devices = IntPtr.Zero; result.main_gpu = @params.MainGpu; result.split_mode = @params.SplitMode; result.n_gpu_layers = @params.GpuLayerCount < 0 ? 
int.MaxValue : @params.GpuLayerCount; diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 735fa81a5..189c3d94a 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -56,7 +56,7 @@ - 958367bf530d943a90 + c9b00a70b080d diff --git a/LLama/Native/GPUSplitMode.cs b/LLama/Native/GPUSplitMode.cs index 54fa095c1..27ee7ae49 100644 --- a/LLama/Native/GPUSplitMode.cs +++ b/LLama/Native/GPUSplitMode.cs @@ -17,7 +17,7 @@ public enum GPUSplitMode Layer = 1, /// - /// split rows across GPUs + /// split layers and KV across GPUs, use tensor parallelism if supported /// Row = 2, } \ No newline at end of file diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs index c0437d9db..e3394892e 100644 --- a/LLama/Native/LLamaModelParams.cs +++ b/LLama/Native/LLamaModelParams.cs @@ -8,6 +8,12 @@ namespace LLama.Native [StructLayout(LayoutKind.Sequential)] public unsafe struct LLamaModelParams { + /// + /// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) + /// + //ggml_backend_dev_t* devices; + public IntPtr devices; + /// /// // number of layers to store in VRAM /// @@ -19,7 +25,7 @@ public unsafe struct LLamaModelParams public GPUSplitMode split_mode; /// - /// the GPU that is used for scratch and small tensors + /// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE /// public int main_gpu; diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index 450f4998a..19187ded9 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -333,6 +333,14 @@ static SafeLLamaContextHandle() [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern void llama_kv_cache_update(SafeLLamaContextHandle ctx); + /// + /// Check if the context supports KV cache shifting + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern bool llama_kv_cache_can_shift(SafeLLamaContextHandle ctx); + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern LLamaPerfContextTimings llama_perf_context(SafeLLamaContextHandle ctx); @@ -566,7 +574,7 @@ public void Synchronize() /// internally for later use by the decoder cross-attention layers. /// /// - /// 0 = success
< 0 = error
+ /// 0 = success
< 0 = error (the KV cache state is restored to the state before this call)
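A side note on the llama_kv_cache_can_shift binding declared a little earlier in this file's diff (and surfaced further down as a KvCacheCanShift property): a caller might use it to guard context shifting. This is only a sketch, assuming an existing SafeLLamaContextHandle named context:

    // Only attempt a KV cache shift when the backend reports support for it.
    if (context.KvCacheCanShift)
    {
        // shift / discard old tokens, e.g. via llama_kv_cache_seq_rm + llama_kv_cache_seq_add
    }
    else
    {
        // fall back to truncating or fully re-evaluating the prompt
    }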
public DecodeResult Encode(LLamaBatch batch) { if (batch.TokenCount == 0) @@ -583,7 +591,7 @@ public DecodeResult Encode(LLamaBatch batch) /// Positive return values does not mean a fatal error, but rather a warning:
/// - 0: success
/// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increasing the context)
- /// - < 0: error
+ /// - < 0: error (the KV cache state is restored to the state before this call)
///
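Given the return values documented above for the Decode method that follows, a minimal handling sketch (context and batch are assumed to already exist; the numeric values from the comment are used rather than enum member names):

    var result = context.Decode(batch);
    if ((int)result == 1)
    {
        // No KV slot found: retry with a smaller batch or a larger context.
    }
    else if ((int)result < 0)
    {
        // Hard error; per the note above, the KV cache was restored to its previous state.
    }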
public DecodeResult Decode(LLamaBatch batch) { @@ -746,6 +754,11 @@ public void ResetTimings() #endregion #region KV Cache Management + /// + /// Check if the context supports KV cache shifting + /// + public bool KvCacheCanShift => llama_kv_cache_can_shift(this); + /// /// Apply KV cache updates (such as K-shifts, defragmentation, etc.) /// diff --git a/LLama/Native/SafeLLamaSamplerHandle.cs b/LLama/Native/SafeLLamaSamplerHandle.cs index ef6a7ae30..3518d182e 100644 --- a/LLama/Native/SafeLLamaSamplerHandle.cs +++ b/LLama/Native/SafeLLamaSamplerHandle.cs @@ -1,5 +1,5 @@ using System; -using System.Runtime.CompilerServices; +using System.Collections.Generic; using System.Text; namespace LLama.Native; @@ -426,7 +426,20 @@ public void AddPenalties( int vocabSize, LLamaToken? eos, LLamaToken newline, int penaltyCount, float repeat, float freq, float presence, bool penalizeNewline, bool ignoreEOS ) { - llama_sampler_chain_add(this, llama_sampler_init_penalties(vocabSize, eos ?? LLamaToken.InvalidToken, newline, penaltyCount, repeat, freq, presence, penalizeNewline, ignoreEOS)); + llama_sampler_chain_add( + this, + llama_sampler_init_penalties( + vocabSize, + eos ?? LLamaToken.InvalidToken, + newline, + penaltyCount, + repeat, + freq, + presence, + penalizeNewline, + ignoreEOS + ) + ); // ReSharper disable InconsistentNaming [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] @@ -444,6 +457,62 @@ bool ignore_eos // ignore the end-of-sequence token // ReSharper restore InconsistentNaming } + /// + /// DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677. + /// Porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 + /// + /// The model this sampler will be used with + /// + /// penalty multiplier, 0.0 = disabled + /// exponential base + /// repeated sequences longer than this are penalized + /// how many tokens to scan for repetitions (0 = entire context) + public void AddDry(SafeLlamaModelHandle model, ReadOnlySpan sequenceBreakers, float multiplier = 0.8f, float @base = 1.75f, int allowedLength = 2, int penaltyLastN = 0) + { + unsafe + { + // Convert strings, fix memory in place, build array of pointers + var handles = new List(); + var breakers = stackalloc byte*[sequenceBreakers.Length]; + for (var i = 0; i < sequenceBreakers.Length; i++) + { + var chars = Encoding.Default.GetBytes(sequenceBreakers[i]); + handles.Add(chars.AsMemory().Pin()); + + breakers[i] = (byte*)handles[i].Pointer; + } + + llama_sampler_chain_add( + this, + llama_sampler_init_dry( + model, + multiplier, + @base, + allowedLength, + penaltyLastN, + breakers, + (nuint)sequenceBreakers.Length + ) + ); + + // Clear up all the handles fixing the memory in place + for (var i = 0; i < handles.Count; i++) + handles[i].Dispose(); + } + + // ReSharper disable InconsistentNaming + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + static extern unsafe IntPtr llama_sampler_init_dry( + SafeLlamaModelHandle model, + float dry_multiplier, + float dry_base, + int dry_allowed_length, + int dry_penalty_last_n, + byte** seq_breakers, + nuint num_breakers + ); + } + /// /// Create a sampler that applies a bias directly to the logits /// diff --git a/LLama/Sampling/DefaultSamplingPipeline.cs b/LLama/Sampling/DefaultSamplingPipeline.cs index 3d166f0c6..2e3395db7 100644 --- a/LLama/Sampling/DefaultSamplingPipeline.cs +++ b/LLama/Sampling/DefaultSamplingPipeline.cs @@ -158,7 +158,7 @@ 
public float PresencePenalty public uint Seed { get; set; } = GetRandomSeed(); - private static Random RandomSeedGenerator = new(); + private static readonly Random RandomSeedGenerator = new(); private static uint GetRandomSeed() { lock (RandomSeedGenerator) From e4f4feda1cfb99ac3601165022143ccac07fc493 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Fri, 20 Dec 2024 01:08:43 +0000 Subject: [PATCH 02/22] Changes to support up to https://github.com/ggerganov/llama.cpp/commit/d408bb9268a988c5a60a5746d3a6430386e7604d --- LLama/Native/LLamaFtype.cs | 24 +++++++++---------- LLama/Native/LLamaRopeType.cs | 2 ++ LLama/Native/NativeApi.cs | 9 +++++++ LLama/Native/RopeScalingType.cs | 7 +++++- LLama/Native/SafeLLamaSamplerHandle.cs | 33 +++++++------------------- LLama/Native/SafeLlamaModelHandle.cs | 3 --- 6 files changed, 38 insertions(+), 40 deletions(-) diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs index 6970a4728..705f8032e 100644 --- a/LLama/Native/LLamaFtype.cs +++ b/LLama/Native/LLamaFtype.cs @@ -177,20 +177,20 @@ public enum LLamaFtype /// MOSTLY_BF16 = 32, - /// - /// except 1d tensors - /// - MOSTLY_Q4_0_4_4 = 33, + ///// + ///// except 1d tensors (no longer supported by llama.cpp) + ///// + //MOSTLY_Q4_0_4_4 = 33, - /// - /// except 1d tensors - /// - MOSTLY_Q4_0_4_8 = 34, + ///// + ///// except 1d tensors (no longer supported by llama.cpp) + ///// + //MOSTLY_Q4_0_4_8 = 34, - /// - /// except 1d tensors - /// - MOSTLY_Q4_0_8_8 = 35, + ///// + ///// except 1d tensors (no longer supported by llama.cpp) + ///// + //MOSTLY_Q4_0_8_8 = 35, /// /// except 1d tensors diff --git a/LLama/Native/LLamaRopeType.cs b/LLama/Native/LLamaRopeType.cs index ebad9e77b..3f1188112 100644 --- a/LLama/Native/LLamaRopeType.cs +++ b/LLama/Native/LLamaRopeType.cs @@ -9,4 +9,6 @@ public enum LLamaRopeType None = -1, Norm = 0, NEOX = 2,//GGML_ROPE_TYPE_NEOX, + //todo:LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE, + //todo:LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION, } \ No newline at end of file diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 041cc0dd5..0d6bc1984 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -172,6 +172,15 @@ public static unsafe int llama_chat_apply_template(SafeLlamaModelHandle? model, static extern int internal_llama_chat_apply_template(IntPtr model, byte* tmpl, LLamaChatMessage* chat, nuint n_msg, [MarshalAs(UnmanagedType.U1)] bool add_ass, byte* buf, int length); } + /// + /// Get list of built-in chat templates + /// + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern unsafe int llama_chat_builtin_templates(char** output, nuint len); + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] [return: MarshalAs(UnmanagedType.U1)] public static extern bool llama_add_bos_token(SafeLlamaModelHandle model); diff --git a/LLama/Native/RopeScalingType.cs b/LLama/Native/RopeScalingType.cs index 8d4552b80..61ae82942 100644 --- a/LLama/Native/RopeScalingType.cs +++ b/LLama/Native/RopeScalingType.cs @@ -1,4 +1,4 @@ -namespace LLama.Native +namespace LLama.Native { /// /// RoPE scaling type. 
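For reference, the AddDry sampler introduced in the previous commit could be wired into a chain roughly as follows. This is an illustrative sketch only: the sequence breakers are common choices rather than values taken from this PR, and context is assumed to be an existing SafeLLamaContextHandle:

    var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());
    chain.AddDry(
        context.ModelHandle,
        new[] { "\n", ":", "\"", "*" },   // sequence breakers (assumed, not from this PR)
        multiplier: 0.8f,                 // documented default; 0.0 disables the sampler
        @base: 1.75f,
        allowedLength: 2,
        penaltyLastN: 0                   // 0 = scan the entire context
    );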
@@ -26,5 +26,10 @@ public enum RopeScalingType /// YaRN scaling: https://arxiv.org/pdf/2309.00071.pdf /// Yarn = 2, + + /// + /// LongRope scaling + /// + LongRope = 3, } } diff --git a/LLama/Native/SafeLLamaSamplerHandle.cs b/LLama/Native/SafeLLamaSamplerHandle.cs index 3518d182e..9099c2f32 100644 --- a/LLama/Native/SafeLLamaSamplerHandle.cs +++ b/LLama/Native/SafeLLamaSamplerHandle.cs @@ -410,49 +410,34 @@ public void AddGrammar(SafeLlamaModelHandle model, string grammar, string root) } /// - /// Create a sampler that applies various repetition penalties + /// Create a sampler that applies various repetition penalties. + /// + /// Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first. /// - /// Vocab size - /// EOS token (if this model has one) - /// Newline token /// How many tokens of history to consider when calculating penalties /// Repetition penalty /// Frequency penalty /// Presence penalty - /// Whether or not to penalize the newline token - /// Whether or not to ignore EOS token /// - public void AddPenalties( - int vocabSize, LLamaToken? eos, LLamaToken newline, int penaltyCount, float repeat, float freq, float presence, bool penalizeNewline, bool ignoreEOS - ) + public void AddPenalties(int penaltyCount, float repeat, float freq, float presence) { llama_sampler_chain_add( this, llama_sampler_init_penalties( - vocabSize, - eos ?? LLamaToken.InvalidToken, - newline, penaltyCount, repeat, freq, - presence, - penalizeNewline, - ignoreEOS + presence ) ); // ReSharper disable InconsistentNaming [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] static extern IntPtr llama_sampler_init_penalties( - int n_vocab, // llama_n_vocab() - LLamaToken special_eos_id, // llama_token_eos() - LLamaToken linefeed_id, // llama_token_nl() - int penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat, // 1.0 = disabled - float penalty_freq, // 0.0 = disabled - float penalty_present, // 0.0 = disabled - bool penalize_nl, // consider newlines as a repeatable token - bool ignore_eos // ignore the end-of-sequence token + int penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat, // 1.0 = disabled + float penalty_freq, // 0.0 = disabled + float penalty_present // 0.0 = disabled ); // ReSharper restore InconsistentNaming } diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 718b81809..303ae3352 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -441,9 +441,6 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern LLamaTokenAttr llama_token_get_attr(SafeLlamaModelHandle model, LLamaToken token); - //[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - //private static extern GGMLTensor llama_get_model_tensor(SafeLlamaModelHandle model, string name); - /// /// Returns true if the model contains an encoder that requires llama_encode() call /// From c90ddd989a47eb0c36f5e49c8d9f415b7681056d Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Fri, 27 Dec 2024 21:08:35 +0000 Subject: [PATCH 03/22] Updated to latest llama.cpp binaries, this works on Windows CPU but needs more changes for other backends --- LLama.Examples/Program.cs | 6 +- 
LLama.Web/Common/ModelOptions.cs | 5 +- LLama/Abstractions/IModelParams.cs | 7 ++- LLama/Common/ModelParams.cs | 5 +- LLama/Extensions/IModelParamsExtensions.cs | 6 +- LLama/Extensions/LLamaExecutorExtensions.cs | 2 +- LLama/LLamaQuantizer.cs | 3 - LLama/LLamaSharp.Runtime.targets | 36 ++++++++++++ LLama/LLamaSharp.csproj | 2 +- LLama/LLamaStatelessExecutor.cs | 12 ++-- LLama/Native/LLamaModelParams.cs | 14 ++--- LLama/Native/Load/NativeLibraryConfig.cs | 4 +- LLama/Sampling/DefaultSamplingPipeline.cs | 56 +------------------ .../build/LLamaSharp.Backend.Cpu.nuspec | 15 +++-- 14 files changed, 87 insertions(+), 86 deletions(-) diff --git a/LLama.Examples/Program.cs b/LLama.Examples/Program.cs index ec4e20b03..d4c3bae15 100644 --- a/LLama.Examples/Program.cs +++ b/LLama.Examples/Program.cs @@ -1,6 +1,5 @@ using LLama.Native; using Spectre.Console; -using System.Runtime.InteropServices; AnsiConsole.MarkupLineInterpolated( $""" @@ -30,9 +29,8 @@ __ __ ____ __ // Configure native library to use. This must be done before any other llama.cpp methods are called! NativeLibraryConfig .All - .WithCuda(false) - .WithVulkan(false) - .DryRun(out var loadedllamaLibrary, out var loadedLLavaLibrary); + .WithCuda() + .WithVulkan(); // Calling this method forces loading to occur now. NativeApi.llama_empty_call(); diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs index 4e002c93f..a67a11a96 100644 --- a/LLama.Web/Common/ModelOptions.cs +++ b/LLama.Web/Common/ModelOptions.cs @@ -24,7 +24,7 @@ public class ModelOptions public int MainGpu { get; set; } = 0; /// - public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None; + public GPUSplitMode? SplitMode { get; set; } /// public int GpuLayerCount { get; set; } = 20; @@ -59,6 +59,9 @@ public class ModelOptions /// public TensorSplitsCollection TensorSplits { get; set; } = new(); + /// + public bool CheckTensors { get; } + /// public List MetadataOverrides { get; } = new(); diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs index 7dc28f671..cbbacafe5 100644 --- a/LLama/Abstractions/IModelParams.cs +++ b/LLama/Abstractions/IModelParams.cs @@ -36,7 +36,7 @@ public interface IModelParams /// /// How to split the model across multiple GPUs /// - GPUSplitMode SplitMode { get; } + GPUSplitMode? SplitMode { get; } /// /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) @@ -68,6 +68,11 @@ public interface IModelParams /// bool VocabOnly { get; } + /// + /// Validate model tensor data before loading + /// + bool CheckTensors { get; } + /// /// Override specific metadata items in the model /// diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs index b276ed73a..7e4b1a967 100644 --- a/LLama/Common/ModelParams.cs +++ b/LLama/Common/ModelParams.cs @@ -19,7 +19,7 @@ public record ModelParams public int MainGpu { get; set; } = 0; /// - public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None; + public GPUSplitMode? 
SplitMode { get; set; } /// public int GpuLayerCount { get; set; } = 20; @@ -54,6 +54,9 @@ public record ModelParams /// public TensorSplitsCollection TensorSplits { get; set; } = new(); + /// + public bool CheckTensors { get; } + /// public List MetadataOverrides { get; set; } = new(); diff --git a/LLama/Extensions/IModelParamsExtensions.cs b/LLama/Extensions/IModelParamsExtensions.cs index d704b2e8c..588564e33 100644 --- a/LLama/Extensions/IModelParamsExtensions.cs +++ b/LLama/Extensions/IModelParamsExtensions.cs @@ -30,13 +30,15 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam result = LLamaModelParams.Default(); - result.devices = IntPtr.Zero; result.main_gpu = @params.MainGpu; - result.split_mode = @params.SplitMode; result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount; + if (@params.SplitMode.HasValue) + result.split_mode = @params.SplitMode.Value; + result.use_mlock = @params.UseMemoryLock; result.use_mmap = @params.UseMemorymap; result.vocab_only = @params.VocabOnly; + result.check_tensors = @params.CheckTensors; unsafe { diff --git a/LLama/Extensions/LLamaExecutorExtensions.cs b/LLama/Extensions/LLamaExecutorExtensions.cs index 19c8d33df..e38ccf98d 100644 --- a/LLama/Extensions/LLamaExecutorExtensions.cs +++ b/LLama/Extensions/LLamaExecutorExtensions.cs @@ -147,7 +147,7 @@ private string CreatePrompt(IList messages) PreventEOS = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PreventEOS), out bool eos) is true ? eos : s_defaultPipeline.PreventEOS, PenalizeNewline = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenalizeNewline), out bool pnl) is true ? pnl : s_defaultPipeline.PenalizeNewline, RepeatPenalty = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenalty), out float rp) is true ? rp : s_defaultPipeline.RepeatPenalty, - RepeatPenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.RepeatPenaltyCount, + PenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.PenaltyCount, Grammar = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Grammar), out Grammar? g) is true ? g : s_defaultPipeline.Grammar, MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep, MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? 
mp : s_defaultPipeline.MinP, diff --git a/LLama/LLamaQuantizer.cs b/LLama/LLamaQuantizer.cs index 23f0f8b4e..9e90b732e 100644 --- a/LLama/LLamaQuantizer.cs +++ b/LLama/LLamaQuantizer.cs @@ -106,9 +106,6 @@ private static bool ValidateFtype(LLamaFtype ftype) case LLamaFtype.MOSTLY_IQ3_S: case LLamaFtype.MOSTLY_IQ3_M: - case LLamaFtype.MOSTLY_Q4_0_4_4: - case LLamaFtype.MOSTLY_Q4_0_4_8: - case LLamaFtype.MOSTLY_Q4_0_8_8: return true; case LLamaFtype.GUESSED: diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index 6466a1204..fd3b3061e 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -12,6 +12,15 @@ PreserveNewest runtimes/win-x64/native/noavx/ggml.dll + + PreserveNewest + runtimes/win-x64/native/noavx/ggml-base.dll + + + PreserveNewest + runtimes/win-x64/native/noavx/ggml-cpu.dll + + PreserveNewest runtimes/win-x64/native/avx/llama.dll @@ -20,22 +29,49 @@ PreserveNewest runtimes/win-x64/native/avx/ggml.dll + + PreserveNewest + runtimes/win-x64/native/avx/ggml-base.dll + + + PreserveNewest + runtimes/win-x64/native/avx/ggml-cpu.dll + + PreserveNewest runtimes/win-x64/native/avx2/llama.dll + + PreserveNewest + runtimes/win-x64/native/avx2/ggml-base.dll + PreserveNewest runtimes/win-x64/native/avx2/ggml.dll + + PreserveNewest + runtimes/win-x64/native/avx2/ggml-cpu.dll + + PreserveNewest runtimes/win-x64/native/avx512/llama.dll + + PreserveNewest + runtimes/win-x64/native/avx512/ggml-base.dll + PreserveNewest runtimes/win-x64/native/avx512/ggml.dll + + PreserveNewest + runtimes/win-x64/native/avx512/ggml-cpu.dll + + PreserveNewest runtimes/win-x64/native/cuda11/llama.dll diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 189c3d94a..0ce3714ca 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -56,7 +56,7 @@ - c9b00a70b080d + d79d8f39b4da6 diff --git a/LLama/LLamaStatelessExecutor.cs b/LLama/LLamaStatelessExecutor.cs index a4b9920dc..393bebc3f 100644 --- a/LLama/LLamaStatelessExecutor.cs +++ b/LLama/LLamaStatelessExecutor.cs @@ -23,14 +23,14 @@ public class StatelessExecutor private readonly ILogger? _logger; private readonly LLamaBatch _batch; - // LLava Section + /// public bool IsMultiModal => false; /// - public LLavaWeights? ClipModel { get; } + public LLavaWeights? ClipModel => default; /// - public List Images { get; set; } + public List Images { get; } /// /// The context used by the executor when running the inference. 
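With SplitMode now optional (leaving it null keeps llama.cpp's own default), typical model setup barely changes; a hedged sketch using the types touched above, with a placeholder model path:

    var parameters = new ModelParams("path/to/model.gguf")   // placeholder path
    {
        GpuLayerCount = 20,
        SplitMode = GPUSplitMode.Layer,   // or omit to let llama.cpp decide
    };
    using var weights = LLamaWeights.LoadFromFile(parameters);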
@@ -68,7 +68,7 @@ public async IAsyncEnumerable InferAsync(string prompt, IInferenceParams Context = context; // Reset the sampling pipeline (if there is one) - inferenceParams?.SamplingPipeline?.Reset(); + inferenceParams?.SamplingPipeline.Reset(); // Sanity check inference params inferenceParams ??= new InferenceParams(); @@ -134,8 +134,8 @@ public async IAsyncEnumerable InferAsync(string prompt, IInferenceParams var n_left = n_past - tokensKeep; var n_discard = n_left / 2; - NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, tokensKeep , tokensKeep + n_discard); - NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, tokensKeep + n_discard, n_past, -n_discard); + NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard); + NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard); n_past -= n_discard; } diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs index e3394892e..e16e3263e 100644 --- a/LLama/Native/LLamaModelParams.cs +++ b/LLama/Native/LLamaModelParams.cs @@ -10,9 +10,9 @@ public unsafe struct LLamaModelParams { /// /// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) + /// todo: add support for llama_model_params.devices /// - //ggml_backend_dev_t* devices; - public IntPtr devices; + private IntPtr devices; /// /// // number of layers to store in VRAM @@ -32,12 +32,12 @@ public unsafe struct LLamaModelParams /// /// how to split layers across multiple GPUs (size: ) /// - public float* tensor_split; - - /// - /// comma separated list of RPC servers to use for offloading + public float* tensor_split; + + /// + /// comma separated list of RPC servers to use for offloading /// - public byte* rpc_servers; + public byte* rpc_servers; /// /// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback diff --git a/LLama/Native/Load/NativeLibraryConfig.cs b/LLama/Native/Load/NativeLibraryConfig.cs index 02e47b695..2bfa0554b 100644 --- a/LLama/Native/Load/NativeLibraryConfig.cs +++ b/LLama/Native/Load/NativeLibraryConfig.cs @@ -178,7 +178,7 @@ internal Description CheckAndGatherDescription() _avxLevel, _allowFallback, _skipCheck, - _searchDirectories.Concat(new[] { "./" }).ToArray() + _searchDirectories.Concat([ "./" ]).ToArray() ); } @@ -186,7 +186,7 @@ internal static string AvxLevelToString(AvxLevel level) { return level switch { - AvxLevel.None => string.Empty, + AvxLevel.None => "noavx", AvxLevel.Avx => "avx", AvxLevel.Avx2 => "avx2", AvxLevel.Avx512 => "avx512", diff --git a/LLama/Sampling/DefaultSamplingPipeline.cs b/LLama/Sampling/DefaultSamplingPipeline.cs index 2e3395db7..76404bc95 100644 --- a/LLama/Sampling/DefaultSamplingPipeline.cs +++ b/LLama/Sampling/DefaultSamplingPipeline.cs @@ -20,44 +20,6 @@ public sealed class DefaultSamplingPipeline /// public float RepeatPenalty { get; init; } = 1; - /// - /// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create
- /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text - /// so far, decreasing the model's likelihood to repeat the same line verbatim. - ///
- [Obsolete($"Use {nameof(FrequencyPenalty)} instead.")] - public float AlphaFrequency - { - get => _frequencyPenalty; - init - { - if (value < -2) - throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaFrequency)} must be greater than -2"); - if (value > 2) - throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaFrequency)} must be less than 2"); - _frequencyPenalty = value; - } - } - - /// - /// Presence penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create
- /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the - /// text so far, increasing the model's likelihood to talk about new topics. - ///
- [Obsolete($"Use {nameof(PresencePenalty)} instead.")] - public float AlphaPresence - { - get => _presencePenalty; - init - { - if (value < -2) - throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaPresence)} must be greater than -2"); - if (value > 2) - throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaPresence)} must be less than 2"); - _presencePenalty = value; - } - } - /// /// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create
/// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text @@ -97,21 +59,15 @@ public float PresencePenalty private readonly float _presencePenalty; /// - /// How many tokens should be considered for penalizing repetition + /// How many tokens should be considered for penalties /// - public int RepeatPenaltyCount { get; init; } = 64; + public int PenaltyCount { get; init; } = 64; /// /// Whether the newline token should be protected from being modified by penalty /// public bool PenalizeNewline { get; init; } = false; - /// - /// Whether the EOS token should be protected from being modified by penalty - /// - [Obsolete($"This doesn't do what the name implies. If you're sure you want to use it, use {nameof(PreventEOS)}.")] - public bool PenalizeEOS { get; init; } = false; - /// /// Whether the EOS token should be suppressed. Setting this to 'true' prevents EOS from being sampled /// @@ -196,13 +152,7 @@ protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandl if (Grammar != null) chain.AddGrammar(context.ModelHandle, Grammar.Gbnf, Grammar.Root); - chain.AddPenalties( - context.VocabCount, - context.ModelHandle.Tokens.EOS, context.ModelHandle.Tokens.Newline ?? 0, - RepeatPenaltyCount, RepeatPenalty, - FrequencyPenalty, PresencePenalty, - PenalizeNewline, PreventEOS - ); + chain.AddPenalties(PenaltyCount, RepeatPenalty, FrequencyPenalty, PresencePenalty); chain.AddTopK(TopK); chain.AddTypical(TypicalP, MinKeep); diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec index 0203aad2b..ab8d11c04 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec @@ -18,13 +18,20 @@ - + + - + + + - + + + - + + + From c27cfde13d44a22a3a2022a2bfd19fdf62b8d889 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Sat, 4 Jan 2025 00:46:22 +0000 Subject: [PATCH 04/22] Updated to latest deps, fixed kernel memory failing to load --- .../LLamaSharpTextEmbeddingGenerator.cs | 4 +--- LLama.KernelMemory/LlamaSharpTextGenerator.cs | 2 -- .../LLamaSharpTextEmbeddingGeneratorTests.cs | 14 ++++---------- .../LlamaSharpTextGeneratorTests.cs | 18 ++++-------------- LLama.Unittest/SamplingTests.cs | 6 +----- LLama/LLamaSharp.csproj | 2 +- 6 files changed, 11 insertions(+), 35 deletions(-) diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs index 7f9ae1e4d..6efd44f7b 100644 --- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs +++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs @@ -31,11 +31,9 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config) var @params = new ModelParams(config.ModelPath) { - ContextSize = config.ContextSize ?? 2048, + ContextSize = config.ContextSize, GpuLayerCount = config.GpuLayerCount ?? 20, Embeddings = true, - MainGpu = config.MainGpu, - SplitMode = config.SplitMode, PoolingType = LLamaPoolingType.Mean, }; _weights = LLamaWeights.LoadFromFile(@params); diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs index adfc89317..3fc96db9a 100644 --- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs +++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs @@ -33,8 +33,6 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config) { ContextSize = config.ContextSize ?? 2048, GpuLayerCount = config.GpuLayerCount ?? 
20, - MainGpu = config.MainGpu, - SplitMode = config.SplitMode }; _weights = LLamaWeights.LoadFromFile(parameters); _context = _weights.CreateContext(parameters); diff --git a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs index 91161b72c..5c7b4213d 100644 --- a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs +++ b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs @@ -1,21 +1,15 @@ -using LLama.Common; using LLamaSharp.KernelMemory; -using Microsoft.KernelMemory.AI; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Text.RegularExpressions; -using System.Threading.Tasks; using Xunit.Abstractions; namespace LLama.Unittest.KernelMemory { - public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable + public class LLamaSharpTextEmbeddingGeneratorTests + : ITextTokenizerTests, IDisposable { private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator; - public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper) + public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) + : base(testOutputHelper) { _embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig); diff --git a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs index 02001f8cf..d21d7f959 100644 --- a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs +++ b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs @@ -1,25 +1,15 @@ -using LLama.Common; using LLamaSharp.KernelMemory; -using Microsoft.KernelMemory.AI; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Linq; -using System.Reflection.Emit; -using System.Text; -using System.Text.RegularExpressions; -using System.Threading.Tasks; using Xunit.Abstractions; -using Xunit.Sdk; -using static System.Net.Mime.MediaTypeNames; namespace LLama.Unittest.KernelMemory { - public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable + public class LlamaSharpTextGeneratorTests + : ITextTokenizerTests, IDisposable { private readonly LlamaSharpTextGenerator _textGenerator; - public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper) + public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) + : base(testOutputHelper) { _textGenerator = new LlamaSharpTextGenerator(_lsConfig); diff --git a/LLama.Unittest/SamplingTests.cs b/LLama.Unittest/SamplingTests.cs index f322bc250..bae7e3dea 100644 --- a/LLama.Unittest/SamplingTests.cs +++ b/LLama.Unittest/SamplingTests.cs @@ -167,11 +167,7 @@ private static SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandle co var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default()); chain.AddPenalties( - vocabSize: context.VocabCount, - eos: context.ModelHandle.Tokens.EOS, - newline: context.ModelHandle.Tokens.Newline ?? 
0, - penaltyCount: 60, repeat: 1, freq: 0, presence: 0, - penalizeNewline: false, ignoreEOS: false + penaltyCount: 60, repeat: 1, freq: 0, presence: 0 ); if (logit_bias != null) { chain.AddLogitBias(context.VocabCount, logit_bias); } diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 0ce3714ca..0e51077e4 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -56,7 +56,7 @@ - d79d8f39b4da6 + 0827b2c1da From a5c97594c9f4f1ba3f685e5a0435b7daec3056df Mon Sep 17 00:00:00 2001 From: SignalRT Date: Sat, 4 Jan 2025 15:28:54 +0100 Subject: [PATCH 05/22] Copy missing Mac flibraries libggml-base and libggml-cpu --- LLama/LLamaSharp.Runtime.targets | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index fd3b3061e..e58b9e89f 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -154,6 +154,14 @@ runtimes/linux-x64/native/vulkan/libggml.so + + PreserveNewest + runtimes/osx-arm64/native/libggml-base.dylib + + + PreserveNewest + runtimes/osx-arm64/native/libggml-cpu.dylib + PreserveNewest runtimes/osx-arm64/native/libggml.dylib @@ -170,7 +178,15 @@ PreserveNewest runtimes/osx-arm64/native/ggml-metal.metal - + + + PreserveNewest + runtimes/osx-x64/native/libggml-base.dylib + + + PreserveNewest + runtimes/osx-x64/native/libggml-cpu.dylib + PreserveNewest runtimes/osx-x64/native/libggml.dylib @@ -184,6 +200,14 @@ runtimes/osx-x64/native/libllava_shared.dylib + + PreserveNewest + runtimes/osx-x64/native/rosetta2/libggml-base.dylib + + + PreserveNewest + runtimes/osx-x64/native/rosetta2/libggml-cpu.dylib + PreserveNewest runtimes/osx-x64/native/rosetta2/libggml.dylib From 34198f901a341286374a7d25fc71956166883c45 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Sat, 4 Jan 2025 21:18:59 +0000 Subject: [PATCH 06/22] Removed any mention of AVX in MacOS loading --- .../DefaultNativeLibrarySelectingPolicy.cs | 4 +-- .../Load/NativeLibraryWithMacOrFallback.cs | 31 ++++++------------- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs b/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs index 6f5ad35fe..497902ba1 100644 --- a/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs +++ b/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs @@ -10,8 +10,6 @@ public class DefaultNativeLibrarySelectingPolicy: INativeLibrarySelectingPolicy /// public IEnumerable Apply(NativeLibraryConfig.Description description, SystemInfo systemInfo, NativeLogConfig.LLamaLogCallback? 
logCallback) { - List results = new(); - // Show the configuration we're working with Log(description.ToString(), LLamaLogLevel.Info, logCallback); @@ -56,7 +54,7 @@ public IEnumerable Apply(NativeLibraryConfig.Description descrip if(systemInfo.OSPlatform == OSPlatform.OSX || description.AllowFallback) { - yield return new NativeLibraryWithMacOrFallback(description.Library, description.SkipCheck); + yield return new NativeLibraryWithMacOrFallback(description.Library); } } } diff --git a/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs b/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs index 6bcd55049..59754be03 100644 --- a/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs +++ b/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs @@ -1,5 +1,5 @@ -using LLama.Abstractions; using System.Collections.Generic; +using LLama.Abstractions; namespace LLama.Native { @@ -7,39 +7,30 @@ namespace LLama.Native /// /// A native library compiled on Mac, or fallbacks from all other libraries in the selection. /// - public class NativeLibraryWithMacOrFallback : INativeLibrary + public class NativeLibraryWithMacOrFallback + : INativeLibrary { - private NativeLibraryName _libraryName; - private bool _skipCheck; + private readonly NativeLibraryName _libraryName; /// - public NativeLibraryMetadata? Metadata - { - get - { - return new NativeLibraryMetadata(_libraryName, false, false, AvxLevel.None); - } - } + public NativeLibraryMetadata Metadata => new(_libraryName, false, false, AvxLevel.None); /// /// /// /// - /// - public NativeLibraryWithMacOrFallback(NativeLibraryName libraryName, bool skipCheck) + public NativeLibraryWithMacOrFallback(NativeLibraryName libraryName) { _libraryName = libraryName; - _skipCheck = skipCheck; } /// public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaLogCallback? logCallback) { - var path = GetPath(systemInfo, AvxLevel.None, logCallback); - return path is null ?[] : [path]; + yield return GetPath(systemInfo); } - private string? GetPath(SystemInfo systemInfo, AvxLevel avxLevel, NativeLogConfig.LLamaLogCallback? 
logCallback) + private string GetPath(SystemInfo systemInfo) { NativeLibraryUtils.GetPlatformPathParts(systemInfo.OSPlatform, out var os, out var fileExtension, out var libPrefix); string relativePath; @@ -50,11 +41,7 @@ public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaL } else { - var avxStr = NativeLibraryConfig.AvxLevelToString(AvxLevel.None); - if (!string.IsNullOrEmpty(avxStr)) - avxStr += "/"; - - relativePath = $"runtimes/{os}/native/{avxStr}{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}"; + relativePath = $"runtimes/{os}/native/{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}"; } return relativePath; From 3d931749774be66d58bfa451af104ae1aae79c31 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Sat, 11 Jan 2025 00:37:07 +0000 Subject: [PATCH 07/22] Added file copying for some more targets (still missing macos) --- LLama/LLamaSharp.Runtime.targets | 103 ++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-) diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index e58b9e89f..5a49ccc76 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -21,6 +21,7 @@ runtimes/win-x64/native/noavx/ggml-cpu.dll + PreserveNewest runtimes/win-x64/native/avx/llama.dll @@ -38,6 +39,7 @@ runtimes/win-x64/native/avx/ggml-cpu.dll + PreserveNewest runtimes/win-x64/native/avx2/llama.dll @@ -55,6 +57,7 @@ runtimes/win-x64/native/avx2/ggml-cpu.dll + PreserveNewest runtimes/win-x64/native/avx512/llama.dll @@ -72,30 +75,60 @@ runtimes/win-x64/native/avx512/ggml-cpu.dll + PreserveNewest runtimes/win-x64/native/cuda11/llama.dll + + PreserveNewest + runtimes/win-x64/native/cuda11/ggml-base.dll + PreserveNewest runtimes/win-x64/native/cuda11/ggml.dll + + PreserveNewest + runtimes/win-x64/native/cuda11/ggml-cuda.dll + + + PreserveNewest runtimes/win-x64/native/cuda12/llama.dll + + PreserveNewest + runtimes/win-x64/native/cuda12/ggml-base.dll + PreserveNewest - runtimes/win-x64/native/cuda12/ggml.dll + runtimes/win-x64/native/cuda11/ggml.dll + + + PreserveNewest + runtimes/win-x64/native/cuda11/ggml-cuda.dll + + PreserveNewest runtimes/win-x64/native/vulkan/llama.dll + + PreserveNewest + runtimes/win-x64/native/vulkan/ggml-base.dll + PreserveNewest runtimes/win-x64/native/vulkan/ggml.dll + + PreserveNewest + runtimes/win-x64/native/vulkan/ggml-vulkan.dll + + PreserveNewest @@ -105,6 +138,16 @@ PreserveNewest runtimes/linux-x64/native/noavx/libggml.so + + PreserveNewest + runtimes/linux-x64/native/noavx/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/noavx/libggml-cpu.so + + + PreserveNewest runtimes/linux-x64/native/avx/libllama.so @@ -113,6 +156,17 @@ PreserveNewest runtimes/linux-x64/native/avx/libggml.so + + PreserveNewest + runtimes/linux-x64/native/avx/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/avx/libggml-cpu.so + + + + PreserveNewest runtimes/linux-x64/native/avx2/libllama.so @@ -121,6 +175,15 @@ PreserveNewest runtimes/linux-x64/native/avx2/libggml.so + + PreserveNewest + runtimes/linux-x64/native/avx2/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/avx2/libggml-cpu.so + + PreserveNewest runtimes/linux-x64/native/avx512/libllama.so @@ -129,6 +192,15 @@ PreserveNewest runtimes/linux-x64/native/avx512/libggml.so + + PreserveNewest + runtimes/linux-x64/native/avx512/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/avx512/libggml-cpu.so + + PreserveNewest runtimes/linux-x64/native/cuda11/libllama.so @@ -137,6 
+209,16 @@ PreserveNewest runtimes/linux-x64/native/cuda11/libggml.so + + PreserveNewest + runtimes/linux-x64/native/cuda11/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/cuda11/libggml-cuda.so + + + PreserveNewest runtimes/linux-x64/native/cuda12/libllama.so @@ -145,6 +227,16 @@ PreserveNewest runtimes/linux-x64/native/cuda12/libggml.so + + PreserveNewest + runtimes/linux-x64/native/cuda12/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/cuda12/libggml-cuda.so + + + PreserveNewest runtimes/linux-x64/native/vulkan/libllama.so @@ -153,6 +245,15 @@ PreserveNewest runtimes/linux-x64/native/vulkan/libggml.so + + PreserveNewest + runtimes/linux-x64/native/vulkan/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/vulkan/libggml-vulkan.so + + PreserveNewest From 0647df9b1e06df73beb636e0e0807392ae33a6a7 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Sat, 11 Jan 2025 21:46:06 +0000 Subject: [PATCH 08/22] Updated to latest set of binaries --- LLama.Examples/Program.cs | 2 +- LLama/LLamaSharp.csproj | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/LLama.Examples/Program.cs b/LLama.Examples/Program.cs index d4c3bae15..f8ef7d5aa 100644 --- a/LLama.Examples/Program.cs +++ b/LLama.Examples/Program.cs @@ -17,7 +17,7 @@ __ __ ____ __ """); // Configure logging. Change this to `true` to see log messages from llama.cpp -var showLLamaCppLogs = false; +var showLLamaCppLogs = true; NativeLibraryConfig .All .WithLogCallback((level, message) => diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 0e51077e4..4d0d19eb3 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -56,7 +56,7 @@ - 0827b2c1da + 0827b2c1da-v2 From 756a88f439e022744ff7322b9dedaee4a1d2203e Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Sat, 11 Jan 2025 21:52:24 +0000 Subject: [PATCH 09/22] Fixed copy path for CUDA12 DLLs --- LLama/LLamaSharp.Runtime.targets | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index 5a49ccc76..10140cc06 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -104,11 +104,11 @@ PreserveNewest - runtimes/win-x64/native/cuda11/ggml.dll + runtimes/win-x64/native/cuda12/ggml.dll PreserveNewest - runtimes/win-x64/native/cuda11/ggml-cuda.dll + runtimes/win-x64/native/cuda12/ggml-cuda.dll From 4950e0da66ae492ccdf263fada92c90aa78896ed Mon Sep 17 00:00:00 2001 From: m0nsky Date: Fri, 17 Jan 2025 23:34:12 +0100 Subject: [PATCH 10/22] Compatibility with llama.cpp backend split (PR #10256) on all platforms --- LLama/Native/Load/NativeLibraryUtils.cs | 90 +++++++++++++++++++------ 1 file changed, 68 insertions(+), 22 deletions(-) diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs index d5b014ce0..8227e8cc1 100644 --- a/LLama/Native/Load/NativeLibraryUtils.cs +++ b/LLama/Native/Load/NativeLibraryUtils.cs @@ -43,35 +43,81 @@ internal static IntPtr TryLoadLibrary(NativeLibraryConfig config, out INativeLib foreach (var path in paths) { - Log($"Got relative library path '{path}' from local with {library.Metadata}, trying to load it...", LLamaLogLevel.Debug, config.LogCallback); + // After the llama.cpp binaries have been split up (PR #10256), we need to load the dependencies manually. 
+ // It can't be done automatically on Windows, because the dependencies can be in different folders (for example, ggml-cuda.dll from the cuda12 folder, and ggml-cpu.dll from the avx2 folder) + // It can't be done automatically on Linux, because Linux uses the environment variable "LD_LIBRARY_PATH" to automatically load dependencies, and LD_LIBRARY_PATH can only be + // set before running LLamaSharp, but we only know which folders to search in when running LLamaSharp (decided by the NativeLibrary). - // If we are on Linux / OSX, we need to manually load the GGML dependency - if (systemInfo.OSPlatform == OSPlatform.Linux || systemInfo.OSPlatform == OSPlatform.OSX) + // Get the directory of the current runtime + string? currentRuntimeDirectory = Path.GetDirectoryName(path); + + // If we failed to get the directory of the current runtime, log it and continue on to the next library + if (currentRuntimeDirectory == null) { - // Get the directory of the library - string? libraryDirectory = Path.GetDirectoryName(path); - - if (libraryDirectory != null) + Log($"Failed to get the directory of the current runtime from path '{path}'", LLamaLogLevel.Error, config.LogCallback); + continue; + } + + // List which will hold all paths to dependencies to load + var dependencyPaths = new List(); + + // We should always load ggml-base from the current runtime directory + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-base{ext}")); + + // If the library has metadata, we can check if we need to load additional dependencies + if (library.Metadata != null) + { + if (systemInfo.OSPlatform == OSPlatform.OSX) { - // Construct the dependency (libggml) path - string dependencyPath = Path.Combine(libraryDirectory, $"{libPrefix}ggml{ext}"); - - // Try to load the dependency - var dependencyResult = TryLoad(dependencyPath, description.SearchDirectories, config.LogCallback); + // // ggml-metal (uncomment if needed, requires testing) + // if (os == "osx-arm64") + // dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-metal{ext}")); + + // ggml-cpu + // On OSX, we should load the CPU backend from the current directory + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cpu{ext}")); + } + else + { + // ggml-cuda + if (library.Metadata.UseCuda) + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}")); + + // ggml-vulkan + if (library.Metadata.UseVulkan) + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-vulkan{ext}")); + + // ggml-cpu + // On other platforms (Windows, Linux), we need to load the CPU backend from the specified AVX level directory + // We are using the AVX level supplied by NativeLibraryConfig, which automatically detects the highest supported AVX level for us + dependencyPaths.Add(Path.Combine( + $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}", + $"{libPrefix}ggml-cpu{ext}" + )); + } + } + + // And finally, we can add ggml + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml{ext}")); + + // Now, we will loop through our dependencyPaths and try to load them one by one + foreach (var dependencyPath in dependencyPaths) + { + // Try to load the dependency + var dependencyResult = TryLoad(dependencyPath, description.SearchDirectories, config.LogCallback); - // If we successfully loaded the library, log it - if (dependencyResult != IntPtr.Zero) - { - Log($"Successfully loaded dependency '{dependencyPath}'", 
LLamaLogLevel.Info, config.LogCallback); - } - else - { - Log($"Failed loading dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback); - } + // If we successfully loaded the library, log it + if (dependencyResult != IntPtr.Zero) + { + Log($"Successfully loaded dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback); + } + else + { + Log($"Failed loading dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback); } } - // Try to load the library + // Try to load the main library var result = TryLoad(path, description.SearchDirectories, config.LogCallback); // If we successfully loaded the library, return the handle From 40a8c6cf94c0cfcac24911f93d04cdc0a3222d95 Mon Sep 17 00:00:00 2001 From: m0nsky Date: Fri, 17 Jan 2025 23:38:24 +0100 Subject: [PATCH 11/22] Restore original comment --- LLama/Native/Load/NativeLibraryUtils.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs index 8227e8cc1..0414e1fde 100644 --- a/LLama/Native/Load/NativeLibraryUtils.cs +++ b/LLama/Native/Load/NativeLibraryUtils.cs @@ -43,6 +43,8 @@ internal static IntPtr TryLoadLibrary(NativeLibraryConfig config, out INativeLib foreach (var path in paths) { + Log($"Got relative library path '{path}' from local with {library.Metadata}, trying to load it...", LLamaLogLevel.Debug, config.LogCallback); + // After the llama.cpp binaries have been split up (PR #10256), we need to load the dependencies manually. // It can't be done automatically on Windows, because the dependencies can be in different folders (for example, ggml-cuda.dll from the cuda12 folder, and ggml-cpu.dll from the avx2 folder) // It can't be done automatically on Linux, because Linux uses the environment variable "LD_LIBRARY_PATH" to automatically load dependencies, and LD_LIBRARY_PATH can only be From dc3dff15e9112dadd267647fc5251cf67e4cf09a Mon Sep 17 00:00:00 2001 From: m0nsky Date: Sun, 19 Jan 2025 00:43:46 +0100 Subject: [PATCH 12/22] Update the dependency loader for ggml-metal and ggml-blas --- LLama/Native/Load/NativeLibraryUtils.cs | 29 +++++++++++++++---------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs index 0414e1fde..d0d853f63 100644 --- a/LLama/Native/Load/NativeLibraryUtils.cs +++ b/LLama/Native/Load/NativeLibraryUtils.cs @@ -71,16 +71,29 @@ internal static IntPtr TryLoadLibrary(NativeLibraryConfig config, out INativeLib { if (systemInfo.OSPlatform == OSPlatform.OSX) { - // // ggml-metal (uncomment if needed, requires testing) - // if (os == "osx-arm64") - // dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-metal{ext}")); + // On OSX, we should load the CPU backend from the current directory // ggml-cpu - // On OSX, we should load the CPU backend from the current directory dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cpu{ext}")); + + // ggml-metal (only supported on osx-arm64) + if (os == "osx-arm64") + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-metal{ext}")); + + // ggml-blas (osx-x64, osx-x64-rosetta2 and osx-arm64 all have blas) + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-blas{ext}")); } else { + // On other platforms (Windows, Linux), we need to load the CPU backend from the specified AVX level directory + // We are using the AVX level supplied by NativeLibraryConfig, which automatically 
detects the highest supported AVX level for us + + // ggml-cpu + dependencyPaths.Add(Path.Combine( + $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}", + $"{libPrefix}ggml-cpu{ext}" + )); + // ggml-cuda if (library.Metadata.UseCuda) dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}")); @@ -88,14 +101,6 @@ internal static IntPtr TryLoadLibrary(NativeLibraryConfig config, out INativeLib // ggml-vulkan if (library.Metadata.UseVulkan) dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-vulkan{ext}")); - - // ggml-cpu - // On other platforms (Windows, Linux), we need to load the CPU backend from the specified AVX level directory - // We are using the AVX level supplied by NativeLibraryConfig, which automatically detects the highest supported AVX level for us - dependencyPaths.Add(Path.Combine( - $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}", - $"{libPrefix}ggml-cpu{ext}" - )); } } From 7b558ceb7d1182c7dcc834b1b2b1b262f49e9f38 Mon Sep 17 00:00:00 2001 From: m0nsky Date: Sun, 19 Jan 2025 00:47:54 +0100 Subject: [PATCH 13/22] Update the runtime targets for ggml-metal and ggml-blas --- LLama/LLamaSharp.Runtime.targets | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index 10140cc06..2523e55c1 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -263,6 +263,14 @@ PreserveNewest runtimes/osx-arm64/native/libggml-cpu.dylib + + PreserveNewest + runtimes/osx-arm64/native/libggml-metal.dylib + + + PreserveNewest + runtimes/osx-arm64/native/libggml-blas.dylib + PreserveNewest runtimes/osx-arm64/native/libggml.dylib @@ -288,6 +296,10 @@ PreserveNewest runtimes/osx-x64/native/libggml-cpu.dylib + + PreserveNewest + runtimes/osx-x64/native/libggml-blas.dylib + PreserveNewest runtimes/osx-x64/native/libggml.dylib @@ -309,6 +321,10 @@ PreserveNewest runtimes/osx-x64/native/rosetta2/libggml-cpu.dylib + + PreserveNewest + runtimes/osx-x64/native/rosetta2/libggml-blas.dylib + PreserveNewest runtimes/osx-x64/native/rosetta2/libggml.dylib From 6d0b42122e655cc9d393c10e971adb8017c1035d Mon Sep 17 00:00:00 2001 From: m0nsky Date: Sun, 19 Jan 2025 00:54:18 +0100 Subject: [PATCH 14/22] Add CPU backend (fallback) dependency for the GPU backends --- LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec | 4 ++++ LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec | 4 ++++ LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec | 4 ++++ LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec | 4 ++++ LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec | 4 ++++ LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec | 4 ++++ 6 files changed, 24 insertions(+) diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec index 7b4f959f4..db7e1c139 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec @@ -12,6 +12,10 @@ Copyright 2023 The llama.cpp Authors. All rights reserved. 
LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec index 34bc6781d..72d7c5774 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec @@ -12,6 +12,10 @@ Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec index 8834ae413..643ac1633 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec @@ -12,6 +12,10 @@ Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec index 3d37accec..8117cf04f 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec @@ -12,6 +12,10 @@ Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec index 725764097..77ae83324 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec @@ -12,6 +12,10 @@ Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec index 5c5b83f94..55ee0784c 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec @@ -12,6 +12,10 @@ Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + From 4dbdc822519790cec562fcdb90d9a8edc8cc4835 Mon Sep 17 00:00:00 2001 From: m0nsky Date: Sun, 19 Jan 2025 00:55:22 +0100 Subject: [PATCH 15/22] Fix icons for the nuget backends --- LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec | 1 + LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec | 1 + LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec | 1 + LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec | 1 + LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec | 1 + LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec | 1 + 6 files changed, 6 insertions(+) diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec index db7e1c139..c158e9459 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec @@ -7,6 +7,7 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Cuda11.Linux contains the Linux binaries for LLamaSharp with Cuda11 support. 
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec index 72d7c5774..0eaac8d04 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec @@ -7,6 +7,7 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Cuda11.Windows contains the Windows binaries for LLamaSharp with Cuda11 support. diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec index 643ac1633..87c58d8a4 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec @@ -7,6 +7,7 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Cuda12.Linux contains the Linux binaries for LLamaSharp with Cuda12 support. diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec index 8117cf04f..4ad3ec096 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec @@ -7,6 +7,7 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Cuda12.Windows contains the Windows binaries for LLamaSharp with Cuda12 support. diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec index 77ae83324..ce3d74f5b 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec @@ -7,6 +7,7 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Vulkan.Linux contains the Linux binaries for LLamaSharp with Vulkan support. diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec index 55ee0784c..f353ce105 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec @@ -7,6 +7,7 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Vulkan.Windows contains the Windows binaries for LLamaSharp with Vulkan support. 
From 556a7c153ff9e884b584f663cd95cf4ae5a193c0 Mon Sep 17 00:00:00 2001 From: m0nsky Date: Sun, 19 Jan 2025 01:03:59 +0100 Subject: [PATCH 16/22] Update nuspec files for the GPU backends --- .../runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec | 6 +++++- .../build/LLamaSharp.Backend.Cuda11.Windows.nuspec | 6 +++++- LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec | 1 + .../runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec | 7 +++++-- .../build/LLamaSharp.Backend.Cuda12.Windows.nuspec | 7 +++++-- .../runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec | 6 +++++- .../build/LLamaSharp.Backend.Vulkan.Windows.nuspec | 7 ++++++- LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec | 3 ++- 8 files changed, 34 insertions(+), 9 deletions(-) diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec index c158e9459..6abd16ccc 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec @@ -21,9 +21,13 @@ - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec index 0eaac8d04..a412e2e6f 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec @@ -21,9 +21,13 @@ - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec index 1beeeaafc..5ac473914 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec @@ -22,6 +22,7 @@ + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec index 87c58d8a4..687283221 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec @@ -21,10 +21,13 @@ - - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec index 4ad3ec096..1fd01edb9 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec @@ -21,10 +21,13 @@ - - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec index ce3d74f5b..3f2202db4 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec @@ -21,9 +21,13 @@ - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec index f353ce105..3f7487bcd 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec @@ -21,9 +21,14 @@ - + + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec index b4f26ec97..c972ad0fc 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec @@ -22,6 +22,7 @@ + - + \ No newline at end of file From f526cbed04c9ddfb171a7c5e2d354246351f3897 Mon Sep 17 00:00:00 2001 From: m0nsky Date: Sun, 19 Jan 2025 01:04:32 +0100 Subject: [PATCH 17/22] 
Update BinaryReleaseId --- LLama/LLamaSharp.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 4d0d19eb3..784f77221 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -56,7 +56,7 @@ - 0827b2c1da-v2 + 0827b2c1da-v5 From 91effe9d24f3aaa48e9bdf0c6920d97808ff0ef4 Mon Sep 17 00:00:00 2001 From: m0nsky Date: Sun, 19 Jan 2025 01:13:21 +0100 Subject: [PATCH 18/22] Update nuspec for CPU & OSX --- .../build/LLamaSharp.Backend.Cpu.nuspec | 46 ++++++++++++++----- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec index ab8d11c04..382eb2ae8 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec @@ -18,53 +18,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + + - - - - - - - - - - + From 3be20b1ecf5d3628cab9b36ced0f5d66e728862e Mon Sep 17 00:00:00 2001 From: m0nsky Date: Sun, 19 Jan 2025 21:44:06 +0100 Subject: [PATCH 19/22] Update CPU nuspec to use noavx folder --- .../build/LLamaSharp.Backend.Cpu.nuspec | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec index 382eb2ae8..debc99506 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec @@ -18,11 +18,11 @@ - - - - - + + + + + @@ -42,11 +42,11 @@ - - - - - + + + + + From 686627c691f2b2cc178b8fd8e487948b869b1e77 Mon Sep 17 00:00:00 2001 From: m0nsky Date: Sun, 19 Jan 2025 21:45:32 +0100 Subject: [PATCH 20/22] Update Runtime.targets to use noavx folder --- LLama/LLamaSharp.Runtime.targets | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index 2523e55c1..76292aaf5 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -4,19 +4,19 @@ - + PreserveNewest runtimes/win-x64/native/noavx/llama.dll - + PreserveNewest runtimes/win-x64/native/noavx/ggml.dll - + PreserveNewest runtimes/win-x64/native/noavx/ggml-base.dll - + PreserveNewest runtimes/win-x64/native/noavx/ggml-cpu.dll @@ -130,19 +130,19 @@ - + PreserveNewest runtimes/linux-x64/native/noavx/libllama.so - + PreserveNewest runtimes/linux-x64/native/noavx/libggml.so - + PreserveNewest runtimes/linux-x64/native/noavx/libggml-base.so - + PreserveNewest runtimes/linux-x64/native/noavx/libggml-cpu.so @@ -338,7 +338,7 @@ runtimes/osx-x64/native/rosetta2/libllava_shared.dylib - + PreserveNewest runtimes/win-x64/native/noavx/llava_shared.dll @@ -367,7 +367,7 @@ runtimes/win-x64/native/vulkan/llava_shared.dll - + PreserveNewest runtimes/linux-x64/native/noavx/libllava_shared.so From 1913966d2a967dbe8385f5cfef06311c74ffd43e Mon Sep 17 00:00:00 2001 From: m0nsky Date: Mon, 20 Jan 2025 00:44:51 +0100 Subject: [PATCH 21/22] Update BinaryReleaseId --- LLama/LLamaSharp.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 784f77221..b2d81711e 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -56,7 +56,7 @@ - 0827b2c1da-v5 + 0827b2c1da-v6 From 014ef7844b1b455f6f877a788828e2a2ec9bbeb8 Mon Sep 17 00:00:00 2001 From: m0nsky Date: Mon, 20 Jan 2025 01:48:57 +0100 Subject: [PATCH 22/22] CUDA & Vulkan 
native libraries now correctly store the detected or user defined AVX level --- LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs | 4 ++-- LLama/Native/Load/NativeLibraryUtils.cs | 2 +- LLama/Native/Load/NativeLibraryWithCuda.cs | 4 +++- LLama/Native/Load/NativeLibraryWithVulkan.cs | 4 +++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs b/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs index 497902ba1..36ab0c0c8 100644 --- a/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs +++ b/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs @@ -22,12 +22,12 @@ public IEnumerable Apply(NativeLibraryConfig.Description descrip { if (description.UseCuda) { - yield return new NativeLibraryWithCuda(systemInfo.CudaMajorVersion, description.Library, description.SkipCheck); + yield return new NativeLibraryWithCuda(systemInfo.CudaMajorVersion, description.Library, description.AvxLevel, description.SkipCheck); } if (description.UseVulkan) { - yield return new NativeLibraryWithVulkan(systemInfo.VulkanVersion, description.Library, description.SkipCheck); + yield return new NativeLibraryWithVulkan(systemInfo.VulkanVersion, description.Library, description.AvxLevel, description.SkipCheck); } if((!description.UseCuda || !description.UseVulkan) || description.AllowFallback) diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs index d0d853f63..13e68be4d 100644 --- a/LLama/Native/Load/NativeLibraryUtils.cs +++ b/LLama/Native/Load/NativeLibraryUtils.cs @@ -93,7 +93,7 @@ internal static IntPtr TryLoadLibrary(NativeLibraryConfig config, out INativeLib $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}", $"{libPrefix}ggml-cpu{ext}" )); - + // ggml-cuda if (library.Metadata.UseCuda) dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}")); diff --git a/LLama/Native/Load/NativeLibraryWithCuda.cs b/LLama/Native/Load/NativeLibraryWithCuda.cs index 12da095dc..36dc4ca81 100644 --- a/LLama/Native/Load/NativeLibraryWithCuda.cs +++ b/LLama/Native/Load/NativeLibraryWithCuda.cs @@ -28,11 +28,13 @@ public NativeLibraryMetadata? Metadata ///
/// /// + /// /// - public NativeLibraryWithCuda(int majorCudaVersion, NativeLibraryName libraryName, bool skipCheck) + public NativeLibraryWithCuda(int majorCudaVersion, NativeLibraryName libraryName, AvxLevel avxLevel, bool skipCheck) { _majorCudaVersion = majorCudaVersion; _libraryName = libraryName; + _avxLevel = avxLevel; _skipCheck = skipCheck; } diff --git a/LLama/Native/Load/NativeLibraryWithVulkan.cs b/LLama/Native/Load/NativeLibraryWithVulkan.cs index fe4eef01e..c3fe94de3 100644 --- a/LLama/Native/Load/NativeLibraryWithVulkan.cs +++ b/LLama/Native/Load/NativeLibraryWithVulkan.cs @@ -28,11 +28,13 @@ public NativeLibraryMetadata? Metadata ///
/// /// + /// /// - public NativeLibraryWithVulkan(string? vulkanVersion, NativeLibraryName libraryName, bool skipCheck) + public NativeLibraryWithVulkan(string? vulkanVersion, NativeLibraryName libraryName, AvxLevel avxLevel, bool skipCheck) { _vulkanVersion = vulkanVersion; _libraryName = libraryName; + _avxLevel = avxLevel; _skipCheck = skipCheck; }
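Note on PATCH 22/22: with the AVX level now stored on the CUDA and Vulkan wrappers' metadata, the loader can resolve the CPU fallback backend from the matching AVX-level directory instead of the GPU runtime directory. The C# below is a minimal, hypothetical sketch of the resulting path layout only; the "linux-x64", "lib", ".so", "avx2" and "cuda12" values are illustrative assumptions, not the library's actual loader code.

using System;
using System.IO;

// Example platform values; the real loader detects these at runtime.
var os = "linux-x64";
var libPrefix = "lib";
var ext = ".so";
// The loader derives this folder name from the AvxLevel carried in the
// library metadata (via NativeLibraryConfig.AvxLevelToString); "avx2" is
// just an example value here.
var avxDir = "avx2";

// GPU backend: loaded from the backend-specific runtime directory
// ("cuda12" is an assumed example).
var cudaBackend = Path.Combine($"runtimes/{os}/native/cuda12", $"{libPrefix}ggml-cuda{ext}");

// CPU fallback: loaded from the directory matching the detected or
// user-configured AVX level.
var cpuFallback = Path.Combine($"runtimes/{os}/native/{avxDir}", $"{libPrefix}ggml-cpu{ext}");

Console.WriteLine(cudaBackend); // runtimes/linux-x64/native/cuda12/libggml-cuda.so (on Linux)
Console.WriteLine(cpuFallback); // runtimes/linux-x64/native/avx2/libggml-cpu.so (on Linux)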