Skip to content

Commit 5f35b8e

Browse files
committed
Memory efficient context handling
1 parent 925ca06 commit 5f35b8e

File tree

5 files changed

+58
-101
lines changed

5 files changed

+58
-101
lines changed

LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ public sealed class LLamaSharpTextEmbeddingGenerator
1818
private readonly LLamaEmbedder _embedder;
1919
private readonly bool _ownsEmbedder;
2020

21+
private readonly ModelParams? @params;
22+
2123
/// <inheritdoc/>
2224
public int MaxTokens { get; }
2325

@@ -29,7 +31,7 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)
2931
{
3032
MaxTokens = (int?)config.ContextSize ?? 2048;
3133

32-
var @params = new ModelParams(config.ModelPath)
34+
@params = new ModelParams(config.ModelPath)
3335
{
3436
ContextSize = config?.ContextSize ?? 2048,
3537
GpuLayerCount = config?.GpuLayerCount ?? 20,
@@ -57,7 +59,7 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config, LLamaWeights we
5759
{
5860
MaxTokens = (int?)config.ContextSize ?? 2048;
5961

60-
var @params = new ModelParams(config.ModelPath)
62+
@params = new ModelParams(config.ModelPath)
6163
{
6264
ContextSize = config?.ContextSize ?? 2048,
6365
GpuLayerCount = config?.GpuLayerCount ?? 20,
@@ -103,8 +105,12 @@ public async Task<Embedding> GenerateEmbeddingAsync(string text, CancellationTok
103105
return new Embedding(embeddings.First());
104106
}
105107

106-
/// <inheritdoc/>
107-
public int CountTokens(string text) => _embedder.CountTokens(text);
108+
/// <summary>
109+
/// Count tokens in the input text
110+
/// </summary>
111+
/// <param name="text">input text</param>
112+
/// <returns></returns>
113+
public int CountTokens(string text) => _weights?.CountTokens(text, @params!) ?? 0;
108114

109115
/// <summary>
110116
/// Get the list of tokens for the input text
@@ -114,6 +120,6 @@ public async Task<Embedding> GenerateEmbeddingAsync(string text, CancellationTok
114120
/// <remarks>
115121
/// It throws if text is null and includes an empty stop token because addBos is left true to be consistent with the CountTokens implementation.</remarks>
116122
/// <see cref="CountTokens(string)"/>
117-
public IReadOnlyList<string> GetTokens(string text) => _embedder.GetTokens(text);
123+
public IReadOnlyList<string> GetTokens(string text) => _weights?.GetTokens(text, @params!) ?? new List<string>();
118124
}
119125
}

LLama.KernelMemory/LlamaSharpTextGenerator.cs

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ public sealed class LlamaSharpTextGenerator
1919

2020
private readonly InferenceParams? _defaultInferenceParams;
2121

22+
private readonly ModelParams? @params;
23+
2224
public int MaxTokenTotal { get; }
2325

2426
/// <summary>
@@ -27,7 +29,7 @@ public sealed class LlamaSharpTextGenerator
2729
/// <param name="config">The configuration for LLamaSharp.</param>
2830
public LlamaSharpTextGenerator(LLamaSharpConfig config)
2931
{
30-
var parameters = new ModelParams(config.ModelPath)
32+
@params = new ModelParams(config.ModelPath)
3133
{
3234
ContextSize = config?.ContextSize ?? 2048,
3335
GpuLayerCount = config?.GpuLayerCount ?? 20,
@@ -38,11 +40,11 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config)
3840
FlashAttention = true,
3941
UseMemorymap = true
4042
};
41-
_weights = LLamaWeights.LoadFromFile(parameters);
42-
_executor = new StatelessExecutor(_weights, parameters);
43+
_weights = LLamaWeights.LoadFromFile(@params);
44+
_executor = new StatelessExecutor(_weights, @params);
4345
_defaultInferenceParams = config!.DefaultInferenceParams;
4446
_ownsWeights = true;
45-
MaxTokenTotal = (int)parameters.ContextSize;
47+
MaxTokenTotal = (int)@params.ContextSize;
4648
}
4749

4850
/// <summary>
@@ -55,7 +57,7 @@ public LlamaSharpTextGenerator(LLamaWeights weights, LLamaSharpConfig config, St
5557
{
5658
InferenceParams? inferenceParams = config.DefaultInferenceParams;
5759
_weights = weights;
58-
var parameters = new ModelParams("")
60+
@params = new ModelParams("")
5961
{
6062
ContextSize = config?.ContextSize ?? 2048,
6163
GpuLayerCount = config?.GpuLayerCount ?? 20,
@@ -66,9 +68,9 @@ public LlamaSharpTextGenerator(LLamaWeights weights, LLamaSharpConfig config, St
6668
FlashAttention = true,
6769
UseMemorymap = true
6870
};
69-
_executor = executor ?? new StatelessExecutor(_weights, parameters);
71+
_executor = executor ?? new StatelessExecutor(_weights, @params);
7072
_defaultInferenceParams = inferenceParams;
71-
MaxTokenTotal = (int)parameters.ContextSize;
73+
MaxTokenTotal = (int)@params.ContextSize;
7274
}
7375

7476
/// <inheritdoc/>
@@ -122,8 +124,12 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In
122124
};
123125
}
124126

125-
/// <inheritdoc/>
126-
public int CountTokens(string text) => _executor.CountTokens(text);
127+
/// <summary>
128+
/// Count tokens in the input text
129+
/// </summary>
130+
/// <param name="text">input text</param>
131+
/// <returns></returns>
132+
public int CountTokens(string text) => _weights?.CountTokens(text, @params!) ?? 0;
127133

128134
/// <summary>
129135
/// Get the list of tokens for the input text
@@ -133,7 +139,6 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In
133139
/// <remarks>
134140
/// It throws if text is null and includes an empty stop token because addBos is left true to be consistent with the CountTokens implementation.</remarks>
135141
/// <see cref="CountTokens(string)"/>
136-
public IReadOnlyList<string> GetTokens(string text) => _executor.GetTokens(text);
137-
142+
public IReadOnlyList<string> GetTokens(string text) => _weights?.GetTokens(text, @params!) ?? new List<string>();
138143
}
139144
}

LLama/LLamaEmbedder.cs

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -154,50 +154,4 @@ public async Task<IReadOnlyList<float[]>> GetEmbeddings(string input, Cancellati
154154

155155
return (results, tokens.Length);
156156
}
157-
158-
/// <summary>
159-
///
160-
/// </summary>
161-
/// <param name="text"></param>
162-
/// <returns></returns>
163-
public int CountTokens(string text)
164-
{
165-
// Ensure the context from last time is disposed (it always should be)
166-
if (!Context.NativeHandle.IsClosed)
167-
Context.Dispose();
168-
Context = _weights.CreateContext(_params, _logger);
169-
NativeApi.llama_set_embeddings(Context.NativeHandle, true);
170-
int count = Context.Tokenize(text, special: true).Length;
171-
Context.Dispose();
172-
173-
return count;
174-
}
175-
176-
/// <summary>
177-
/// Get the list of tokens for the input text
178-
/// </summary>
179-
/// <param name="text">Input string to be tokenized</param>
180-
/// <returns>Read-only list of tokens for the input text</returns>
181-
/// <remarks>
182-
/// It throws if text is null and includes an empty stop token because addBos is left true to be consistent with the CountTokens implementation.</remarks>
183-
/// <see cref="CountTokens(string)"/>
184-
public IReadOnlyList<string> GetTokens(string text)
185-
{
186-
// Ensure the context from last time is disposed (it always should be)
187-
if (!Context.NativeHandle.IsClosed)
188-
Context.Dispose();
189-
Context = _weights.CreateContext(_params, _logger);
190-
NativeApi.llama_set_embeddings(Context.NativeHandle, true);
191-
192-
/* see relevant unit tests for important implementation notes regarding unicode */
193-
var context = Context;
194-
var numericTokens = context.Tokenize(text, special: true);
195-
var decoder = new StreamingTokenDecoder(context);
196-
var tokens = numericTokens
197-
.Select(x => { decoder.Add(x); return decoder.Read(); })
198-
.ToList();
199-
Context.Dispose();
200-
201-
return tokens;
202-
}
203157
}

LLama/LLamaStatelessExecutor.cs

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -169,44 +169,5 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
169169
throw new LLamaDecodeError(returnCode);
170170
}
171171
}
172-
173-
/// <inheritdoc/>
174-
public int CountTokens(string text)
175-
{
176-
// Ensure the context from last time is disposed (it always should be)
177-
if (!Context.NativeHandle.IsClosed)
178-
Context.Dispose();
179-
Context = _weights.CreateContext(_params, _logger);
180-
int count = Context.Tokenize(text, special: true).Length;
181-
Context.Dispose();
182-
183-
return count;
184-
}
185-
186-
/// <summary>
187-
/// Get the list of tokens for the input text
188-
/// </summary>
189-
/// <param name="text">Input string to be tokenized</param>
190-
/// <returns>Read-only list of tokens for the input text</returns>
191-
/// <remarks>
192-
/// It throws if text is null and includes an empty stop token because addBos is left true to be consistent with the CountTokens implementation.</remarks>
193-
/// <see cref="CountTokens(string)"/>
194-
public IReadOnlyList<string> GetTokens(string text)
195-
{
196-
// Ensure the context from last time is disposed (it always should be)
197-
if (!Context.NativeHandle.IsClosed)
198-
Context.Dispose();
199-
Context = _weights.CreateContext(_params, _logger);
200-
201-
/* see relevant unit tests for important implementation notes regarding unicode */
202-
var numericTokens = Context.Tokenize(text, special: true);
203-
var decoder = new StreamingTokenDecoder(Context);
204-
var tokens = numericTokens
205-
.Select(x => { decoder.Add(x); return decoder.Read(); })
206-
.ToList();
207-
Context.Dispose();
208-
209-
return tokens ?? new List<string>();
210-
}
211172
}
212173
}

LLama/LLamaWeights.cs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using System;
22
using System.Collections.Generic;
3+
using System.Linq;
34
using System.Text;
45
using System.Threading;
56
using System.Threading.Tasks;
@@ -165,5 +166,35 @@ public LLamaToken[] Tokenize(string text, bool add_bos, bool special, Encoding e
165166
{
166167
return NativeHandle.Tokenize(text, add_bos, special, encoding);
167168
}
169+
170+
/// <summary>
171+
/// Count the tokens in the input text
172+
/// </summary>
173+
/// <param name="text">input text</param>
174+
/// <param name="parameters">context parameters</param>
175+
/// <returns></returns>
176+
public int CountTokens(string text, IContextParams parameters)
177+
{
178+
using var context = CreateContext(parameters);
179+
var count = context.Tokenize(text, special: true).Length;
180+
return count;
181+
}
182+
183+
/// <summary>
184+
/// Get the list of tokens for the input text
185+
/// </summary>
186+
/// <param name="text">Input string to be tokenized</param>
187+
/// <param name="parameters">Context parameters</param>
188+
/// <returns>Read-only list of tokens for the input test</returns> becomes: /// <returns>Read-only list of tokens for the input text</returns>
189+
/// <remarks>
190+
/// It throws if text is null and includes an empty stop token because addBos is left true to be consistent with the CountTokens implementation.</remarks>
191+
/// <see cref="CountTokens(string, IContextParams)"/>
192+
public IReadOnlyList<string> GetTokens(string text, IContextParams parameters)
193+
{
194+
using var context = CreateContext(parameters);
195+
var numericTokens = context.Tokenize(text, special: true);
196+
var decoder = new StreamingTokenDecoder(context);
197+
return numericTokens.Select(x => { decoder.Add(x); return decoder.Read(); }).ToList();
198+
}
168199
}
169200
}

0 commit comments

Comments
 (0)