using Microsoft.Extensions.Logging;
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.KernelMemory.MemoryStorage;
using Microsoft.ML.Tokenizers;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Connectors.OpenAI;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;

namespace KernelMemory.Extensions.QueryPipeline;

public class OpenAIRagQueryExecutorConfiguration
{
    /// <summary>
    /// Name of the OpenAI model, used to create the correct tokenizer.
    /// If you keep the default value, the tokenizer of a standard
    /// gpt-3.5 model is used.
    /// </summary>
    public string ModelName { get; set; } = "gpt-3.5-turbo";

    /// <summary>
    /// Model id configured in Semantic Kernel, used to select the
    /// correct model. If it is null, the default model is used.
    /// </summary>
    public string? ModelId { get; set; }

    /// <summary>
    /// Maximum number of tokens available for the question plus the
    /// facts injected in the request; if not specified, 3000 tokens
    /// will be used.
    /// </summary>
    public int MaxTokens { get; set; } = 3000;

    /// <summary>
    /// Temperature of the request.
    /// </summary>
    public double Temperature { get; set; }

    /// <summary>
    /// If the model returns no citations, we can remove the answer.
    /// </summary>
    public bool RemoveAnswerIfNoCitations { get; set; } = false;
}
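
/// <summary>
/// Illustrative sketch, not part of the original code: shows how the options above
/// combine. The "gpt4" ModelId is a placeholder and must match a chat completion
/// service configured in the Semantic Kernel instance.
/// </summary>
internal static class OpenAIRagQueryExecutorConfigurationExample
{
    internal static OpenAIRagQueryExecutorConfiguration CreateSample() => new()
    {
        ModelName = "gpt-4",               //drives tokenizer selection for the fact budget
        ModelId = "gpt4",                  //Semantic Kernel model id (placeholder)
        MaxTokens = 4000,                  //token budget for question + facts
        Temperature = 0,                   //deterministic answers
        RemoveAnswerIfNoCitations = true,  //drop answers the model cannot ground
    };
}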

/// <summary>
/// Executes the query part: it starts with a predefined prompt, adds all the
/// retrieved memories to the prompt as facts, then uses the LLM to answer the
/// user query with those facts (grounding).
/// </summary>
public class OpenaiRagQueryExecutor : BasicQueryHandler
{
    public override string Name => "OpenaiRagQueryExecutor";

    private readonly Kernel _kernel;
    private readonly OpenAIRagQueryExecutorConfiguration _config;
    private readonly Tokenizer _tokenizer;
    private readonly ILogger<OpenaiRagQueryExecutor> _log;

    public OpenaiRagQueryExecutor(
        Kernel kernel,
        OpenAIRagQueryExecutorConfiguration? config = null,
        ILogger<OpenaiRagQueryExecutor>? log = null)
    {
        _kernel = kernel;
        _config = config ?? new OpenAIRagQueryExecutorConfiguration();
        _tokenizer = Tiktoken.CreateTiktokenForModel(_config.ModelName);
        _log = log ?? DefaultLogger<OpenaiRagQueryExecutor>.Instance;
    }

    protected override async Task OnHandleAsync(
        UserQuestion userQuestion,
        CancellationToken cancellationToken)
    {
        var memoryRecords = await userQuestion.GetMemoryOrdered();
        if (memoryRecords.Count == 0)
        {
            //We have no memories: we can simply return.
            return;
        }

        //This code is taken and modified from the original KernelMemory codebase.
        //Create a base of facts in a StringBuilder.
        var facts = new StringBuilder();

        //We need to stop adding facts once we reach the max number of tokens.
        var tokensAvailable = _config.MaxTokens - _tokenizer.CountTokens(userQuestion.Question);

        //TODO: Add the preamble of the prompt to the token count.

        //Some statistics to tell how many facts were available and how many we used.
        int factsAvailableCount = memoryRecords.Count;
        int factsUsedCount = 0;

        //We need to keep the list of all memory records actually used, because
        //we will need them to build citations.
        List<MemoryRecord> memoryRecordToUse = new();
        int docNumber = 0;
        foreach (var mr in memoryRecords)
        {
            var partitionText = mr.GetPartitionText();

            var size = _tokenizer.CountTokens(partitionText);
            if (size >= tokensAvailable)
            {
                //Stop after reaching the max number of tokens.
                break;
            }

            factsUsedCount++;

            //Create a special format for the fact.
            var fact = $"---\nDocument {++docNumber}:\n{partitionText}\n";

            facts.Append(fact);
            memoryRecordToUse.Add(mr);
            tokensAvailable -= size;
        }
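
        //At this point the facts buffer has the shape below (illustrative example,
        //derived from the format string used in the loop above):
        //---
        //Document 1:
        //<text of the first partition>
        //---
        //Document 2:
        //<text of the second partition>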

        if (factsAvailableCount > 0 && factsUsedCount == 0)
        {
            //We had memories available, but none of them fit in the token budget.
            //The case of no memories at all was already handled by the early return above.
            _log.LogError("Unable to inject memories in the prompt, not enough tokens available");
            return;
        }

        var watch = Stopwatch.StartNew();

        var openaiAnswer = await GenerateAnswerAsync(userQuestion.Question, facts.ToString(), cancellationToken);

        if (openaiAnswer == null)
        {
            //No answer is possible, so we let the question flow to another handler.
            return;
        }

        //Ok, now we want to add answer and citations.
        watch.Stop();
        _log.LogTrace("Answer generated in {ElapsedMilliseconds} msecs", watch.ElapsedMilliseconds);

        userQuestion.Answer = openaiAnswer.Answer;

        List<MemoryRecord> usedMemoryRecord = memoryRecordToUse
            .Where((_, i) => openaiAnswer.Documents.Contains(i))
            .ToList();

        //Now we need to clean up the citations, including only the ones used to answer the question.
        userQuestion.Citations = MemoryRecordHelper.BuildCitations(usedMemoryRecord, userQuestion.UserQueryOptions.Index, this._log);

        //Ground if needed.
        if (_config.RemoveAnswerIfNoCitations && userQuestion.Citations!.Count == 0)
        {
            //No answer is possible, because we do not have citations.
            userQuestion.Answer = null;
        }
    }

    private class GptAnswer
    {
        public string Answer { get; set; } = string.Empty;

        public HashSet<int> Documents { get; set; } = new();
    }
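
    //Illustrative example, not a real response: the tool call arguments parsed by
    //GenerateAnswerAsync into GptAnswer look roughly like
    //{ "answer": "The refund window is 30 days.", "documents": [1, 3] }
    //where the document numbers are 1-based and match the "Document N" labels in the facts.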

    /// <summary>
    /// Performs the call to the OpenAI API to get the answer to the question.
    /// It uses a tool call to coerce the model into returning not only the answer
    /// text but also the array of documents used to answer the query.
    /// </summary>
    /// <param name="question">Question of the user.</param>
    /// <param name="documents">Facts, already formatted, used to ground the answer.</param>
    /// <param name="token">Cancellation token.</param>
    /// <returns>The parsed answer, or null if the model did not perform the expected tool call.</returns>
    private async Task<GptAnswer?> GenerateAnswerAsync(
        string question,
        string documents,
        CancellationToken token)
    {
        //First step is creating the function in Semantic Kernel. The body is
        //intentionally empty: the function only describes the schema of the tool call.
        var function = KernelFunctionFactory.CreateFromMethod(
            [Description("Return the result to the user")] (
                [Description("Answer of the question")] string answer,
                [Description("Documents used to formulate the answer")] int[] documents
            ) =>
            {
            }, "return_result");
        var plugin = KernelPluginFactory.CreateFromFunctions("MyPlugin", [function]);
        var openAIFunction = plugin.GetFunctionsMetadata().First().ToOpenAIFunction();
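
        //For reference, the function above is exposed to OpenAI as a tool roughly
        //equivalent to the following (illustrative; the exact schema depends on the
        //connector version):
        //{
        //  "name": "MyPlugin-return_result",
        //  "description": "Return the result to the user",
        //  "parameters": {
        //    "type": "object",
        //    "properties": {
        //      "answer": { "type": "string", "description": "Answer of the question" },
        //      "documents": { "type": "array", "items": { "type": "integer" } }
        //    },
        //    "required": ["answer", "documents"]
        //  }
        //}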

        //Create a template for chat with settings.
        var chat = _kernel.CreateFunctionFromPrompt(new PromptTemplateConfig()
        {
            Name = "Rag",
            Description = "Answer the user question with documents.",
            Template = @"You are an AI assistant that helps users answer questions given a specific context. You will be given a context and asked a question based on that context. Your answer should be as precise as possible and should only come from the context.
Please add all documents used as citations.
Question: {{$question}}

Documents:
{{$documents}}",
            TemplateFormat = "semantic-kernel",
            InputVariables =
            [
                new() { Name = "question", Description = "Question of the user.", IsRequired = true },
                new() { Name = "documents", Description = "Documents needed to answer the query.", IsRequired = true }
            ],
            ExecutionSettings =
            {
                { "default", new OpenAIPromptExecutionSettings()
                    {
                        MaxTokens = 1000,
                        Temperature = _config.Temperature,
                        ModelId = _config.ModelId,
                        ChatSystemPrompt = "You will answer the question of the user using only the documents in the prompt",
                        ToolCallBehavior = ToolCallBehavior.RequireFunction(openAIFunction, false),
                    }
                },
            }
        });

        KernelArguments ka = new()
        {
            ["question"] = question,
            ["documents"] = documents
        };
        var result = await _kernel.InvokeAsync(chat, ka, token);
| 236 | + var result = await _kernel.InvokeAsync(chat, ka, token); |
| 237 | + |
| 238 | + var openaiMessageContent = result.GetValue<OpenAIChatMessageContent>(); |
| 239 | + if (result is FunctionResult fre) |
| 240 | + { |
| 241 | + var toolCall = openaiMessageContent.GetOpenAIFunctionToolCalls().Single(); |
| 242 | + var answer = ((JsonElement)toolCall.Arguments["answer"]).GetString()!; |
| 243 | + |
| 244 | + //-1 is because GPT is 1 based with document. |
| 245 | + var citations = ((JsonElement)toolCall.Arguments["documents"]).EnumerateArray().Select(e => e.GetInt32() - 1).ToHashSet(); |
| 246 | + return new GptAnswer() |
| 247 | + { |
| 248 | + Answer = answer, |
| 249 | + Documents = citations |
| 250 | + }; |
| 251 | + } |
| 252 | + |
| 253 | + return null; |
| 254 | + } |
| 255 | +} |
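
/// <summary>
/// Hypothetical usage sketch, not part of the original code: HandleAsync is assumed
/// to be the public entry point that BasicQueryHandler exposes around OnHandleAsync,
/// and a retrieval handler is assumed to have populated the UserQuestion with memory
/// records before this executor runs.
/// </summary>
internal static class OpenaiRagQueryExecutorUsageExample
{
    internal static async Task<string?> AskAsync(Kernel kernel, UserQuestion userQuestion)
    {
        var executor = new OpenaiRagQueryExecutor(
            kernel,
            new OpenAIRagQueryExecutorConfiguration
            {
                ModelName = "gpt-4",              //placeholder model name
                RemoveAnswerIfNoCitations = true, //drop ungrounded answers
            });

        //Assumed entry point; see the summary above.
        await executor.HandleAsync(userQuestion, CancellationToken.None);
        return userQuestion.Answer;
    }
}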