Commit 9722c78

Changes for easier working with local models (#1992)
* Changes for easier working with local models
* Markdown lint
* Ollama docs
1 parent 162a36d commit 9722c78

File tree: 6 files changed (+55, -11 lines)


app/backend/approaches/approach.py

Lines changed: 5 additions & 0 deletions

@@ -91,6 +91,11 @@ class ThoughtStep:


 class Approach(ABC):
+
+    # Allows usage of non-GPT model even if no tokenizer is available for accurate token counting
+    # Useful for using local small language models, for example
+    ALLOW_NON_GPT_MODELS = True
+
     def __init__(
         self,
         search_client: SearchClient,
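The new flag is consumed by `get_token_limit` (and, in the files below, `build_messages`) from the token-counting helper the approaches already use. As a rough mental model only, with a table and minimum value that are assumptions rather than the helper's real values, the fallback behaves like this:

```python
# Illustrative sketch of the fallback idea only; not the actual implementation
# or values of the helper library's get_token_limit.
MODEL_TOKEN_LIMITS = {"gpt-35-turbo": 4000, "gpt-4": 8100}  # assumed table
MINIMUM_TOKEN_LIMIT = 4000  # assumed conservative floor for unknown models


def get_token_limit(model: str, default_to_minimum: bool = False) -> int:
    """Return the context window for a known model, or a conservative
    minimum for unknown (e.g. local) models when default_to_minimum is True."""
    if model in MODEL_TOKEN_LIMITS:
        return MODEL_TOKEN_LIMITS[model]
    if default_to_minimum:
        return MINIMUM_TOKEN_LIMIT
    raise ValueError(f"Model {model} is not recognized and no fallback is allowed")
```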

app/backend/approaches/chatreadretrieveread.py

Lines changed: 3 additions & 1 deletion

@@ -51,7 +51,7 @@ def __init__(
         self.content_field = content_field
         self.query_language = query_language
         self.query_speller = query_speller
-        self.chatgpt_token_limit = get_token_limit(chatgpt_model)
+        self.chatgpt_token_limit = get_token_limit(chatgpt_model, default_to_minimum=self.ALLOW_NON_GPT_MODELS)

     @property
     def system_message_chat_conversation(self):
@@ -133,6 +133,7 @@ async def run_until_final_call(
             past_messages=messages[:-1],
             new_user_content=user_query_request,
             max_tokens=self.chatgpt_token_limit - query_response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )

         chat_completion: ChatCompletion = await self.openai_client.chat.completions.create(
@@ -187,6 +188,7 @@ async def run_until_final_call(
             # Model does not handle lengthy system messages well. Moving sources to latest user conversation to solve follow up questions prompt.
             new_user_content=original_user_query + "\n\nSources:\n" + content,
             max_tokens=self.chatgpt_token_limit - response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )

         data_points = {"text": sources_content}

app/backend/approaches/chatreadretrievereadvision.py

Lines changed: 2 additions & 1 deletion

@@ -63,7 +63,7 @@ def __init__(
         self.query_speller = query_speller
         self.vision_endpoint = vision_endpoint
         self.vision_token_provider = vision_token_provider
-        self.chatgpt_token_limit = get_token_limit(gpt4v_model)
+        self.chatgpt_token_limit = get_token_limit(gpt4v_model, default_to_minimum=self.ALLOW_NON_GPT_MODELS)

     @property
     def system_message_chat_conversation(self):
@@ -188,6 +188,7 @@ async def run_until_final_call(
             past_messages=messages[:-1],
             new_user_content=user_content,
             max_tokens=self.chatgpt_token_limit - response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )

         data_points = {

app/backend/approaches/retrievethenread.py

Lines changed: 2 additions & 1 deletion

@@ -66,7 +66,7 @@ def __init__(
         self.content_field = content_field
         self.query_language = query_language
         self.query_speller = query_speller
-        self.chatgpt_token_limit = get_token_limit(chatgpt_model)
+        self.chatgpt_token_limit = get_token_limit(chatgpt_model, self.ALLOW_NON_GPT_MODELS)

     async def run(
         self,
@@ -121,6 +121,7 @@ async def run(
             few_shots=[{"role": "user", "content": self.question}, {"role": "assistant", "content": self.answer}],
             new_user_content=user_content,
             max_tokens=self.chatgpt_token_limit - response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )

         chat_completion = await self.openai_client.chat.completions.create(

app/backend/approaches/retrievethenreadvision.py

Lines changed: 2 additions & 1 deletion

@@ -66,7 +66,7 @@ def __init__(
         self.query_speller = query_speller
         self.vision_endpoint = vision_endpoint
         self.vision_token_provider = vision_token_provider
-        self.gpt4v_token_limit = get_token_limit(gpt4v_model)
+        self.gpt4v_token_limit = get_token_limit(gpt4v_model, self.ALLOW_NON_GPT_MODELS)

     async def run(
         self,
@@ -140,6 +140,7 @@ async def run(
             system_prompt=overrides.get("prompt_template", self.system_chat_template_gpt4v),
             new_user_content=user_content,
             max_tokens=self.gpt4v_token_limit - response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )
         chat_completion = await self.openai_client.chat.completions.create(
             model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model,
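Across all of these approaches the prompt budget handed to `build_messages` is simply the model's context window minus the tokens reserved for the response, so the fallback only needs `get_token_limit` to return some number for an unrecognized local model. A toy calculation with assumed figures:

```python
# Toy figures for illustration; real values depend on the model and on the
# helper library's chosen minimum.
chatgpt_token_limit = 4000    # assumed fallback limit for an unknown local model
response_token_limit = 1024   # assumed tokens reserved for the answer

max_tokens = chatgpt_token_limit - response_token_limit
print(max_tokens)  # 2976 tokens left for system prompt, history, and sources
```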

docs/localdev.md

Lines changed: 41 additions & 7 deletions

@@ -46,25 +46,59 @@ You may want to save costs by developing against a local LLM server, such as
 [llamafile](https://github.com/Mozilla-Ocho/llamafile/). Note that a local LLM
 will generally be slower and not as sophisticated.

-Once you've got your local LLM running and serving an OpenAI-compatible endpoint, set these environment variables:
+Once the local LLM server is running and serving an OpenAI-compatible endpoint, set these environment variables:

 ```shell
+azd env set USE_VECTORS false
 azd env set OPENAI_HOST local
 azd env set OPENAI_BASE_URL <your local endpoint>
+azd env set AZURE_OPENAI_CHATGPT_MODEL local-model-name
 ```

-For example, to point at a local llamafile server running on its default port:
+Then restart the local development server.
+You should now be able to use the "Ask" tab.
+
+⚠️ Limitations:
+
+- The "Chat" tab will only work if the local language model supports function calling.
+- Your search mode must be text only (no vectors), since the search index is only populated with OpenAI-generated embeddings, and the local OpenAI host can't generate those.
+- The conversation history will be truncated using the GPT tokenizers, which may not be the same as the local model's tokenizer, so if you have a long conversation, you may end up with token limit errors.
+
+> [!NOTE]
+> You must set `OPENAI_HOST` back to a non-local value ("azure", "azure_custom", or "openai")
+> before running `azd up` or `azd provision`, since the deployed backend can't access your local server.
+
+### Using Ollama server
+
+For example, to point at a local Ollama server running the `llama3.1:8b` model:
+
+```shell
+azd env set OPENAI_HOST local
+azd env set OPENAI_BASE_URL http://localhost:11434/v1
+azd env set AZURE_OPENAI_CHATGPT_MODEL llama3.1:8b
+azd env set USE_VECTORS false
+```
+
+If you're running the app inside a VS Code Dev Container, use this local URL instead:
+
+```shell
+azd env set OPENAI_BASE_URL http://host.docker.internal:11434/v1
+```
+
+### Using llamafile server
+
+To point at a local llamafile server running on its default port:

 ```shell
+azd env set OPENAI_HOST local
 azd env set OPENAI_BASE_URL http://localhost:8080/v1
+azd env set USE_VECTORS false
 ```

-If you're running inside a dev container, use this local URL instead:
+Llamafile does *not* require a model name to be specified.
+
+If you're running the app inside a VS Code Dev Container, use this local URL instead:

 ```shell
 azd env set OPENAI_BASE_URL http://host.docker.internal:8080/v1
 ```
-
-> [!NOTE]
-> You must set this back to a non-local value ("azure", "azure_custom", or "openai")
-> before running `azd up` or `azd provision`, since the deployed backend can't access your local server.
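Before setting the azd environment variables above, it can be worth confirming that the local server really speaks the OpenAI-compatible protocol. A quick sanity check with the official `openai` Python package, assuming an Ollama server on its default port with `llama3.1:8b` pulled (for llamafile, swap the base URL for `http://localhost:8080/v1`):

```python
# Quick sanity check against a local OpenAI-compatible endpoint.
# Assumes Ollama is running on localhost:11434 with llama3.1:8b available;
# the API key is a placeholder because local servers don't validate it.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:11434/v1", api_key="nokey")

response = client.chat.completions.create(
    model="llama3.1:8b",
    messages=[{"role": "user", "content": "Reply with one short sentence."}],
)
print(response.choices[0].message.content)
```

If this returns a completion, the same endpoint should work once `OPENAI_HOST`, `OPENAI_BASE_URL`, and `AZURE_OPENAI_CHATGPT_MODEL` are set as shown above.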
