
Commit 6bfb2cc

Migrate the Completion API to the Chat Completion API (#419)
Ports the Completion API calls over to the ChatCompletion API.
1 parent 1a8a1ce commit 6bfb2cc
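In outline, every openai.Completion.create call over a flat prompt string becomes an openai.ChatCompletion.create call over a role-tagged message list. A minimal sketch of the shape of the change, assuming the pre-1.0 openai Python SDK used throughout this diff; the deployment names and prompt text here are hypothetical:

import openai

# Before: prompt string against a text-completion deployment
completion = openai.Completion.create(
    engine="my-davinci-deployment",  # hypothetical Azure OpenAI deployment
    prompt="Generate a search query for: does my plan cover cardio?",
    temperature=0.0, max_tokens=32, n=1, stop=["\n"])
print(completion.choices[0].text)

# After: structured messages against a ChatGPT deployment
chat_completion = openai.ChatCompletion.create(
    deployment_id="my-chat-deployment",  # hypothetical Azure OpenAI deployment
    model="gpt-35-turbo",
    messages=[
        {"role": "system", "content": "You turn questions into search queries."},
        {"role": "user", "content": "does my plan cover cardio?"}],
    temperature=0.0, max_tokens=32, n=1)
print(chat_completion.choices[0].message.content)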

File tree: 5 files changed, +172 additions, −115 deletions


app/backend/app.py

Lines changed: 2 additions & 2 deletions
@@ -56,13 +56,13 @@
 # Various approaches to integrate GPT and external knowledge, most applications will use a single one of these patterns
 # or some derivative, here we include several for exploration purposes
 ask_approaches = {
-    "rtr": RetrieveThenReadApproach(search_client, AZURE_OPENAI_GPT_DEPLOYMENT, KB_FIELDS_SOURCEPAGE, KB_FIELDS_CONTENT),
+    "rtr": RetrieveThenReadApproach(search_client, AZURE_OPENAI_CHATGPT_DEPLOYMENT, AZURE_OPENAI_CHATGPT_MODEL, KB_FIELDS_SOURCEPAGE, KB_FIELDS_CONTENT),
     "rrr": ReadRetrieveReadApproach(search_client, AZURE_OPENAI_GPT_DEPLOYMENT, KB_FIELDS_SOURCEPAGE, KB_FIELDS_CONTENT),
     "rda": ReadDecomposeAsk(search_client, AZURE_OPENAI_GPT_DEPLOYMENT, KB_FIELDS_SOURCEPAGE, KB_FIELDS_CONTENT)
 }
 
 chat_approaches = {
-    "rrr": ChatReadRetrieveReadApproach(search_client, AZURE_OPENAI_CHATGPT_DEPLOYMENT, AZURE_OPENAI_CHATGPT_MODEL, AZURE_OPENAI_GPT_DEPLOYMENT, KB_FIELDS_SOURCEPAGE, KB_FIELDS_CONTENT)
+    "rrr": ChatReadRetrieveReadApproach(search_client, AZURE_OPENAI_CHATGPT_DEPLOYMENT, AZURE_OPENAI_CHATGPT_MODEL, KB_FIELDS_SOURCEPAGE, KB_FIELDS_CONTENT)
 }
 
 app = Flask(__name__)
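For context, these dictionaries are consumed by the Flask routes in the same file (not part of this diff); a sketch of that dispatch, with the route body and error handling abbreviated:

from flask import request, jsonify

@app.route("/chat", methods=["POST"])
def chat():
    approach = request.json["approach"]
    impl = chat_approaches.get(approach)
    if not impl:
        return jsonify({"error": "unknown approach"}), 400
    # run() receives the chat history plus per-request overrides;
    # this commit's constructor changes feed into that call
    r = impl.run(request.json["history"], request.json.get("overrides") or {})
    return jsonify(r)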

app/backend/approaches/chatreadretrieveread.py

Lines changed: 68 additions & 88 deletions
@@ -7,6 +7,9 @@
 from approaches.approach import Approach
 from text import nonewlines
 
+from core.messagebuilder import MessageBuilder
+from core.modelhelper import get_token_limit
+
 class ChatReadRetrieveReadApproach(Approach):
     # Chat roles
     SYSTEM = "system"
@@ -34,41 +37,53 @@ class ChatReadRetrieveReadApproach(Approach):
 Generate a search query based on the conversation and the new question.
 Do not include cited source filenames and document names e.g info.txt or doc.pdf in the search query terms.
 Do not include any text inside [] or <<>> in the search query terms.
+Do not include any special characters like '+'.
 If the question is not in English, translate the question to English before generating the search query.
-Chat History:
-{chat_history}
-
-Question:
-{question}
-
-Search query:
+Search Query:
 """
-
-    def __init__(self, search_client: SearchClient, chatgpt_deployment: str, chatgpt_model: str, gpt_deployment: str, sourcepage_field: str, content_field: str):
+    query_prompt_few_shots = [
+        {'role' : USER, 'content' : 'What are my health plans?' },
+        {'role' : ASSISTANT, 'content' : 'Show available health plans' },
+        {'role' : USER, 'content' : 'does my plan cover cardio?' },
+        {'role' : ASSISTANT, 'content' : 'Health plan cardio coverage' }
+    ]
+
+    def __init__(self, search_client: SearchClient, chatgpt_deployment: str, chatgpt_model: str, sourcepage_field: str, content_field: str):
         self.search_client = search_client
         self.chatgpt_deployment = chatgpt_deployment
         self.chatgpt_model = chatgpt_model
-        self.gpt_deployment = gpt_deployment
         self.sourcepage_field = sourcepage_field
         self.content_field = content_field
+        self.chatgpt_token_limit = get_token_limit(chatgpt_model)
 
     def run(self, history: Sequence[dict[str, str]], overrides: dict[str, Any]) -> Any:
         use_semantic_captions = True if overrides.get("semantic_captions") else False
         top = overrides.get("top") or 3
        exclude_category = overrides.get("exclude_category") or None
         filter = "category ne '{}'".format(exclude_category.replace("'", "''")) if exclude_category else None
 
+        user_q = 'Generate search query for: ' + history[-1]["user"]
+
         # STEP 1: Generate an optimized keyword search query based on the chat history and the last question
-        prompt = self.query_prompt_template.format(chat_history=self.get_chat_history_as_text(history, include_last_turn=False), question=history[-1]["user"])
-        completion = openai.Completion.create(
-            engine=self.gpt_deployment,
-            prompt=prompt,
+        messages = self.get_messages_from_history(
+            self.query_prompt_template,
+            self.chatgpt_model,
+            history,
+            user_q,
+            self.query_prompt_few_shots,
+            self.chatgpt_token_limit - len(user_q)
+            )
+
+        chat_completion = openai.ChatCompletion.create(
+            deployment_id=self.chatgpt_deployment,
+            model=self.chatgpt_model,
+            messages=messages,
             temperature=0.0,
             max_tokens=32,
-            n=1,
-            stop=["\n"])
-        q = completion.choices[0].text
+            n=1)
+
+        q = chat_completion.choices[0].message.content
 
         # STEP 2: Retrieve relevant documents from the search index with the GPT optimized query
         if overrides.get("semantic_ranker"):
@@ -90,94 +105,59 @@ def run(self, history: Sequence[dict[str, str]], overrides: dict[str, Any]) -> Any:
 
         follow_up_questions_prompt = self.follow_up_questions_prompt_content if overrides.get("suggest_followup_questions") else ""
 
+        # STEP 3: Generate a contextual and content specific answer using the search results and chat history
+
         # Allow client to replace the entire prompt, or to inject into the exiting prompt using >>>
-        prompt_override = overrides.get("prompt_template")
-        messages = self.get_messages_from_history(prompt_override=prompt_override, follow_up_questions_prompt=follow_up_questions_prompt,history=history, sources=content)
+        prompt_override = overrides.get("prompt_override")
+        if prompt_override is None:
+            system_message = self.system_message_chat_conversation.format(injected_prompt="", follow_up_questions_prompt=follow_up_questions_prompt)
+        elif prompt_override.startswith(">>>"):
+            system_message = self.system_message_chat_conversation.format(injected_prompt=prompt_override[3:] + "\n", follow_up_questions_prompt=follow_up_questions_prompt)
+        else:
+            system_message = prompt_override.format(follow_up_questions_prompt=follow_up_questions_prompt)
+
+        # latest conversation
+        user_content = history[-1]["user"] + " \nSources:" + content
+
+        messages = self.get_messages_from_history(
+            system_message,
+            self.chatgpt_model,
+            history,
+            user_content,
+            max_tokens=self.chatgpt_token_limit)
 
-        # STEP 3: Generate a contextual and content specific answer using the search results and chat history
         chat_completion = openai.ChatCompletion.create(
             deployment_id=self.chatgpt_deployment,
             model=self.chatgpt_model,
             messages=messages,
             temperature=overrides.get("temperature") or 0.7,
             max_tokens=1024,
             n=1)
 
         chat_content = chat_completion.choices[0].message.content
 
         msg_to_display = '\n\n'.join([str(message) for message in messages])
 
         return {"data_points": results, "answer": chat_content, "thoughts": f"Searched for:<br>{q}<br><br>Conversations:<br>" + msg_to_display.replace('\n', '<br>')}
 
-    def get_chat_history_as_text(self, history: Sequence[dict[str, str]], include_last_turn: bool=True, approx_max_tokens: int=1000) -> str:
-        history_text = ""
-        for h in reversed(history if include_last_turn else history[:-1]):
-            history_text = """<|im_start|>user""" + "\n" + h["user"] + "\n" + """<|im_end|>""" + "\n" + """<|im_start|>assistant""" + "\n" + (h.get("bot", "") + """<|im_end|>""" if h.get("bot") else "") + "\n" + history_text
-            if len(history_text) > approx_max_tokens*4:
-                break
-        return history_text
-
-    def get_messages_from_history(self, prompt_override, follow_up_questions_prompt, history: Sequence[dict[str, str]], sources: str, approx_max_tokens: int = 1000) -> []:
-        '''
-        Generate messages needed for chat Completion api
-        '''
-        messages = []
-        token_count = 0
-        if prompt_override is None:
-            system_message = self.system_message_chat_conversation.format(injected_prompt="", follow_up_questions_prompt=follow_up_questions_prompt)
-        elif prompt_override.startswith(">>>"):
-            system_message = self.system_message_chat_conversation.format(injected_prompt=prompt_override[3:] + "\n", follow_up_questions_prompt=follow_up_questions_prompt)
-        else:
-            system_message = prompt_override.format(follow_up_questions_prompt=follow_up_questions_prompt)
+    def get_messages_from_history(self, system_prompt: str, model_id: str, history: Sequence[dict[str, str]], user_conv: str, few_shots = [], max_tokens: int = 4096) -> []:
+        message_builder = MessageBuilder(system_prompt, model_id)
+
+        # Add examples to show the chat what responses we want. It will try to mimic any responses and make sure they match the rules laid out in the system message.
+        for shot in few_shots:
+            message_builder.append_message(shot.get('role'), shot.get('content'))
+
+        user_content = user_conv
+        append_index = len(few_shots) + 1
+
+        message_builder.append_message(self.USER, user_content, index=append_index)
 
-        messages.append({"role":self.SYSTEM, "content": system_message})
-        token_count += self.num_tokens_from_messages(messages[-1], self.chatgpt_model)
-
-        # latest conversation
-        user_content = history[-1]["user"] + " \nSources:" + sources
-        messages.append({"role": self.USER, "content": user_content})
-        token_count += token_count + self.num_tokens_from_messages(messages[-1], self.chatgpt_model)
-
-        '''
-        Enqueue in reverse order
-        if limit exceeds truncate old messages
-        leaving system message behind
-        Keep track of token count for each conversation
-        If token count exceeds limit, break
-        '''
         for h in reversed(history[:-1]):
             if h.get("bot"):
-                messages.insert(1, {"role": self.ASSISTANT, "content" : h.get("bot")})
-                token_count += self.num_tokens_from_messages(messages[1], self.chatgpt_model)
-            messages.insert(1, {"role": self.USER, "content" : h.get("user")})
-            token_count += self.num_tokens_from_messages(messages[1], self.chatgpt_model)
-            if token_count > approx_max_tokens*4:
+                message_builder.append_message(self.ASSISTANT, h.get('bot'), index=append_index)
+            message_builder.append_message(self.USER, h.get('user'), index=append_index)
+            if message_builder.token_length > max_tokens:
                 break
-        return messages
-
-    def num_tokens_from_messages(self, message: dict[str,str], model: str) -> int:
-        """
-        Calculate the number of tokens required to encode a message.
-        Args:
-            message (dict): The message to encode, represented as a dictionary.
-            model (str): The name of the model to use for encoding.
-        Returns:
-            int: The total number of tokens required to encode the message.
-        Example:
-            message = {'role': 'user', 'content': 'Hello, how are you?'}
-            model = 'gpt-3.5-turbo'
-            num_tokens_from_messages(message, model)
-            output: 11
-        """
-        encoding = tiktoken.encoding_for_model(self.get_oai_chatmodel_tiktok(model))
-        num_tokens = 0
-        num_tokens += 2 # For "role" and "content" keys
-        for key, value in message.items():
-            num_tokens += len(encoding.encode(value))
-        return num_tokens
-
-    def get_oai_chatmodel_tiktok(self, aoaimodel: str):
-        if aoaimodel == "" or aoaimodel is None:
-            raise Exception("Expected AOAI chatGPT model name")
 
-        return "gpt-3.5-turbo" if aoaimodel == "gpt-35-turbo" else aoaimodel
+        messages = message_builder.messages
+        return messages
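Taken together, the migrated run() can be driven like this. A sketch only: the endpoint, index, key, and deployment names are placeholders, Azure OpenAI configuration (openai.api_type/api_base/api_key) is assumed to be set elsewhere, and history uses the 'user'/'bot' keys the approach expects:

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

search_client = SearchClient("https://<service>.search.windows.net",
                             "<index>", AzureKeyCredential("<key>"))
approach = ChatReadRetrieveReadApproach(
    search_client,
    "<chatgpt-deployment>",   # Azure OpenAI ChatGPT deployment name
    "gpt-35-turbo",           # model id, drives get_token_limit()
    "sourcepage", "content")  # placeholder index field names

history = [
    {"user": "What are my health plans?", "bot": "You have Northwind Standard and Plus."},
    {"user": "does my plan cover cardio?"}]  # newest turn, no "bot" answer yet

result = approach.run(history, {"top": 3, "semantic_ranker": True})
print(result["answer"])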
app/backend/approaches/retrievethenread.py

Lines changed: 28 additions & 25 deletions
@@ -1,10 +1,12 @@
 import openai
+
 from approaches.approach import Approach
 from azure.search.documents import SearchClient
 from azure.search.documents.models import QueryType
 from text import nonewlines
 from typing import Any
 
+from core.messagebuilder import MessageBuilder
 
 class RetrieveThenReadApproach(Approach):
     """
@@ -13,39 +15,30 @@ class RetrieveThenReadApproach(Approach):
     (answer) with that prompt.
     """
 
-    template = \
+    system_chat_template = \
         "You are an intelligent assistant helping Contoso Inc employees with their healthcare plan questions and employee handbook questions. " + \
         "Use 'you' to refer to the individual asking the questions even if they ask with 'I'. " + \
         "Answer the following question using only the data provided in the sources below. " + \
         "For tabular information return it as an html table. Do not return markdown format. " + \
         "Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. " + \
-        "If you cannot answer using the sources below, say you don't know. " + \
-        """
+        "If you cannot answer using the sources below, say you don't know. Use below example to answer"
 
-###
-Question: 'What is the deductible for the employee plan for a visit to Overlake in Bellevue?'
+    #shots/sample conversation
+    question = """
+'What is the deductible for the employee plan for a visit to Overlake in Bellevue?'
 
 Sources:
 info1.txt: deductibles depend on whether you are in-network or out-of-network. In-network deductibles are $500 for employee and $1000 for family. Out-of-network deductibles are $1000 for employee and $2000 for family.
 info2.pdf: Overlake is in-network for the employee plan.
 info3.pdf: Overlake is the name of the area that includes a park and ride near Bellevue.
 info4.pdf: In-network institutions include Overlake, Swedish and others in the region
-
-Answer:
-In-network deductibles are $500 for employee and $1000 for family [info1.txt] and Overlake is in-network for the employee plan [info2.pdf][info4.pdf].
-
-###
-Question: '{q}'?
-
-Sources:
-{retrieved}
-
-Answer:
 """
+    answer = "In-network deductibles are $500 for employee and $1000 for family [info1.txt] and Overlake is in-network for the employee plan [info2.pdf][info4.pdf]."
 
-    def __init__(self, search_client: SearchClient, openai_deployment: str, sourcepage_field: str, content_field: str):
+    def __init__(self, search_client: SearchClient, openai_deployment: str, chatgpt_model: str, sourcepage_field: str, content_field: str):
         self.search_client = search_client
         self.openai_deployment = openai_deployment
+        self.chatgpt_model = chatgpt_model
         self.sourcepage_field = sourcepage_field
         self.content_field = content_field
 
@@ -72,13 +65,23 @@ def run(self, q: str, overrides: dict[str, Any]) -> Any:
         results = [doc[self.sourcepage_field] + ": " + nonewlines(doc[self.content_field]) for doc in r]
         content = "\n".join(results)
 
-        prompt = (overrides.get("prompt_template") or self.template).format(q=q, retrieved=content)
-        completion = openai.Completion.create(
-            engine=self.openai_deployment,
-            prompt=prompt,
+        message_builder = MessageBuilder(overrides.get("prompt_template") or self.system_chat_template, self.chatgpt_model);
+
+        # add user question
+        user_content = q + "\n" + "Sources:\n {content}".format(content=content)
+        message_builder.append_message('user', user_content)
+
+        # Add shots/samples. This helps model to mimic response and make sure they match rules laid out in system message.
+        message_builder.append_message('assistant', self.answer)
+        message_builder.append_message('user', self.question)
+
+        messages = message_builder.messages
+        chat_completion = openai.ChatCompletion.create(
+            deployment_id=self.openai_deployment,
+            model=self.chatgpt_model,
+            messages=messages,
             temperature=overrides.get("temperature") or 0.3,
             max_tokens=1024,
-            n=1,
-            stop=["\n"])
-
-        return {"data_points": results, "answer": completion.choices[0].text, "thoughts": f"Question:<br>{q}<br><br>Prompt:<br>" + prompt.replace('\n', '<br>')}
+            n=1)
+
+        return {"data_points": results, "answer": chat_completion.choices[0].message.content, "thoughts": f"Question:<br>{q}<br><br>Prompt:<br>" + '\n\n'.join([str(message) for message in messages])}
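One non-obvious detail above: MessageBuilder.append_message inserts at index 1 by default, so the assistant shot is added before the user shot in order to end up after it in the final list. A quick sanity check of the resulting ordering, a sketch reusing the classes from this commit:

from core.messagebuilder import MessageBuilder
from approaches.retrievethenread import RetrieveThenReadApproach

mb = MessageBuilder(RetrieveThenReadApproach.system_chat_template, "gpt-35-turbo")
mb.append_message('user', "actual question\nSources:\n ...")     # lands at index 1
mb.append_message('assistant', RetrieveThenReadApproach.answer)  # pushes the above down
mb.append_message('user', RetrieveThenReadApproach.question)     # ends up first after system

print([m['role'] for m in mb.messages])
# ['system', 'user', 'assistant', 'user'] -> shot question, shot answer, real question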

app/backend/core/messagebuilder.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+from core.modelhelper import num_tokens_from_messages
+
+
+class MessageBuilder:
+    """
+    A class for building and managing messages in a chat conversation.
+    Attributes:
+        message (list): A list of dictionaries representing chat messages.
+        model (str): The name of the ChatGPT model.
+        token_count (int): The total number of tokens in the conversation.
+    Methods:
+        __init__(self, system_content: str, chatgpt_model: str): Initializes the MessageBuilder instance.
+        append_message(self, role: str, content: str, index: int = 1): Appends a new message to the conversation.
+    """
+
+    def __init__(self, system_content: str, chatgpt_model: str):
+        self.messages = [{'role': 'system', 'content': system_content}]
+        self.model = chatgpt_model
+        self.token_length = num_tokens_from_messages(
+            self.messages[-1], self.model)
+
+    def append_message(self, role: str, content: str, index: int = 1):
+        self.messages.insert(index, {'role': role, 'content': content})
+        self.token_length += num_tokens_from_messages(
+            self.messages[index], self.model)
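The builder keeps a running token_length so callers can trim history against a budget, which is exactly how get_messages_from_history above uses it. A minimal sketch of that pattern, with a hypothetical 100-token budget:

from core.messagebuilder import MessageBuilder

mb = MessageBuilder("You are a helpful assistant.", "gpt-35-turbo")

# The newest user turn is pinned right after the system message; older turns
# are then inserted at the same index, so the oldest messages are the first
# to be cut off when the budget runs out.
append_index = 1
mb.append_message('user', 'does my plan cover cardio?', index=append_index)

older_turns = [('user', 'What are my health plans?'),
               ('assistant', 'Show available health plans')]
for role, content in reversed(older_turns):
    if mb.token_length > 100:  # hypothetical budget
        break
    mb.append_message(role, content, index=append_index)

print(mb.token_length, [m['role'] for m in mb.messages])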

app/backend/core/modelhelper.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+import tiktoken
+
+MODELS_2_TOKEN_LIMITS = {
+    "gpt-35-turbo": 4000,
+    "gpt-3.5-turbo": 4000,
+    "gpt-35-turbo-16k": 16000,
+    "gpt-3.5-turbo-16k": 16000,
+    "gpt-4": 8100,
+    "gpt-4-32k": 32000
+}
+
+AOAI_2_OAI = {
+    "gpt-35-turbo": "gpt-3.5-turbo",
+    "gpt-35-turbo-16k": "gpt-3.5-turbo-16k"
+}
+
+
+def get_token_limit(model_id: str) -> int:
+    if model_id not in MODELS_2_TOKEN_LIMITS:
+        raise ValueError("Expected Model Gpt-35-turbo and above")
+    return MODELS_2_TOKEN_LIMITS.get(model_id)
+
+
+def num_tokens_from_messages(message: dict[str, str], model: str) -> int:
+    """
+    Calculate the number of tokens required to encode a message.
+    Args:
+        message (dict): The message to encode, represented as a dictionary.
+        model (str): The name of the model to use for encoding.
+    Returns:
+        int: The total number of tokens required to encode the message.
+    Example:
+        message = {'role': 'user', 'content': 'Hello, how are you?'}
+        model = 'gpt-3.5-turbo'
+        num_tokens_from_messages(message, model)
+        output: 11
+    """
+    encoding = tiktoken.encoding_for_model(get_oai_chatmodel_tiktok(model))
+    num_tokens = 2  # For "role" and "content" keys
+    for key, value in message.items():
+        num_tokens += len(encoding.encode(value))
+    return num_tokens
+
+
+def get_oai_chatmodel_tiktok(aoaimodel: str) -> str:
+    if aoaimodel == "" or aoaimodel is None:
+        raise ValueError("Expected AOAI chatGPT model name")
+
+    return AOAI_2_OAI.get(aoaimodel)
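Usage of the two helpers, for an Azure model alias that AOAI_2_OAI can resolve; note that, as written, get_oai_chatmodel_tiktok returns None for names outside that map (e.g. the OpenAI-native "gpt-3.5-turbo"), so callers are expected to pass the AOAI-style names:

from core.modelhelper import get_token_limit, num_tokens_from_messages

limit = get_token_limit("gpt-35-turbo")  # 4000, from MODELS_2_TOKEN_LIMITS
message = {'role': 'user', 'content': 'Hello, how are you?'}
used = num_tokens_from_messages(message, "gpt-35-turbo")
print(f"{used} tokens used, {limit - used} left in the context window")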
