
Commit d83f1a0

Adds an option to apply the chat template to prompts when using StatelessExecutor (LLamaStatelessExecutor.cs).
Also updates the StatelessModeExecute.cs example to demonstrate the new functionality.
1 parent: 7782869

File tree

2 files changed: +26 −1 lines

LLama.Examples/Examples/StatelessModeExecute.cs

Lines changed: 5 additions & 1 deletion
@@ -15,7 +15,11 @@ public static async Task Run()
             GpuLayerCount = 5
         };
         using var model = await LLamaWeights.LoadFromFileAsync(parameters);
-        var ex = new StatelessExecutor(model, parameters);
+        var ex = new StatelessExecutor(model, parameters)
+        {
+            ApplyTemplate = true,
+            SystemMessage = "You are a helpful bot."
+        };

         Console.ForegroundColor = ConsoleColor.Yellow;
         Console.WriteLine("The executor has been enabled. In this example, the inference is an one-time job. That says, the previous input and response has " +
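
For reference, a minimal, self-contained sketch of the new options in use. The model path and the InferenceParams values below are placeholders for illustration, not part of this commit:

using LLama;
using LLama.Common;

var parameters = new ModelParams("path/to/model.gguf") // placeholder path
{
    GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);

var ex = new StatelessExecutor(model, parameters)
{
    ApplyTemplate = true,                    // wrap each prompt in the model's chat template
    SystemMessage = "You are a helpful bot." // added as the "system" message
};

// The prompt is passed as the "user" message before templating.
await foreach (var text in ex.InferAsync("What is 1 + 1?", new InferenceParams { MaxTokens = 64 }))
    Console.Write(text);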

LLama/LLamaStatelessExecutor.cs

Lines changed: 21 additions & 0 deletions
@@ -7,6 +7,7 @@
 using System.Threading;
 using LLama.Exceptions;
 using LLama.Native;
+using LLama.Transformers;
 using Microsoft.Extensions.Logging;

 namespace LLama
@@ -37,6 +38,17 @@ public class StatelessExecutor
     /// </summary>
     public LLamaContext Context { get; private set; }

+    /// <summary>
+    /// If true, applies the default template to the prompt, following the rules for <a href="https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template">llama_chat_apply_template</a>.
+    /// </summary>
+    public bool ApplyTemplate { get; init; }
+
+    /// <summary>
+    /// The system message to use with the prompt. Only used when <see cref="ApplyTemplate" /> is true.
+    /// </summary>
+    public string? SystemMessage { get; init; }
+
+
     /// <summary>
     /// Create a new stateless executor which will use the given model
     /// </summary>
@@ -79,6 +91,15 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
         var decoder = new StreamingTokenDecoder(Context);
         var antiprocessor = new AntipromptProcessor(inferenceParams.AntiPrompts);

+        if (ApplyTemplate)
+        {
+            var template = new LLamaTemplate(_weights.NativeHandle) { AddAssistant = true };
+            if (SystemMessage != null) template.Add("system", SystemMessage);
+
+            template.Add("user", prompt);
+            prompt = PromptTemplateTransformer.ToModelPrompt(template);
+        }
+
         // Tokenize the prompt
         var tokens = Context.Tokenize(prompt, special: true).ToList();
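
Read in isolation, the templating step added to InferAsync amounts to the standalone helper below. This is a sketch using only the types visible in the diff (LLamaTemplate, PromptTemplateTransformer); `weights` stands in for the executor's private `_weights` field, and the model path is a placeholder:

using LLama;
using LLama.Common;
using LLama.Native;
using LLama.Transformers;

var weights = await LLamaWeights.LoadFromFileAsync(new ModelParams("path/to/model.gguf")); // placeholder path
Console.WriteLine(BuildTemplatedPrompt(weights, "You are a helpful bot.", "What is 1 + 1?"));

static string BuildTemplatedPrompt(LLamaWeights weights, string? systemMessage, string userPrompt)
{
    // AddAssistant = true appends the assistant turn marker, so generation
    // continues as the assistant's reply.
    var template = new LLamaTemplate(weights.NativeHandle) { AddAssistant = true };

    if (systemMessage != null)
        template.Add("system", systemMessage);

    template.Add("user", userPrompt);

    // Renders the accumulated messages using the model's built-in chat template.
    return PromptTemplateTransformer.ToModelPrompt(template);
}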
