diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py
index f1fb0a444d..ad81a0a7b9 100644
--- a/app/backend/approaches/approach.py
+++ b/app/backend/approaches/approach.py
@@ -91,6 +91,11 @@ class ThoughtStep:
 
 
 class Approach(ABC):
+
+    # Allows usage of non-GPT model even if no tokenizer is available for accurate token counting
+    # Useful for using local small language models, for example
+    ALLOW_NON_GPT_MODELS = True
+
     def __init__(
         self,
         search_client: SearchClient,
diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py
index 95ca08f0f0..b752547e71 100644
--- a/app/backend/approaches/chatreadretrieveread.py
+++ b/app/backend/approaches/chatreadretrieveread.py
@@ -51,7 +51,7 @@ def __init__(
         self.content_field = content_field
         self.query_language = query_language
         self.query_speller = query_speller
-        self.chatgpt_token_limit = get_token_limit(chatgpt_model)
+        self.chatgpt_token_limit = get_token_limit(chatgpt_model, default_to_minimum=self.ALLOW_NON_GPT_MODELS)
 
     @property
     def system_message_chat_conversation(self):
@@ -133,6 +133,7 @@ async def run_until_final_call(
             past_messages=messages[:-1],
             new_user_content=user_query_request,
             max_tokens=self.chatgpt_token_limit - query_response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )
 
         chat_completion: ChatCompletion = await self.openai_client.chat.completions.create(
@@ -187,6 +188,7 @@ async def run_until_final_call(
             # Model does not handle lengthy system messages well. Moving sources to latest user conversation to solve follow up questions prompt.
             new_user_content=original_user_query + "\n\nSources:\n" + content,
             max_tokens=self.chatgpt_token_limit - response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )
 
         data_points = {"text": sources_content}
diff --git a/app/backend/approaches/chatreadretrievereadvision.py b/app/backend/approaches/chatreadretrievereadvision.py
index df64f266f7..6b48643077 100644
--- a/app/backend/approaches/chatreadretrievereadvision.py
+++ b/app/backend/approaches/chatreadretrievereadvision.py
@@ -63,7 +63,7 @@ def __init__(
         self.query_speller = query_speller
         self.vision_endpoint = vision_endpoint
         self.vision_token_provider = vision_token_provider
-        self.chatgpt_token_limit = get_token_limit(gpt4v_model)
+        self.chatgpt_token_limit = get_token_limit(gpt4v_model, default_to_minimum=self.ALLOW_NON_GPT_MODELS)
 
     @property
     def system_message_chat_conversation(self):
@@ -188,6 +188,7 @@ async def run_until_final_call(
             past_messages=messages[:-1],
             new_user_content=user_content,
             max_tokens=self.chatgpt_token_limit - response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )
 
         data_points = {
diff --git a/app/backend/approaches/retrievethenread.py b/app/backend/approaches/retrievethenread.py
index d5b05a0fbe..5c73def39e 100644
--- a/app/backend/approaches/retrievethenread.py
+++ b/app/backend/approaches/retrievethenread.py
@@ -66,7 +66,7 @@ def __init__(
         self.content_field = content_field
         self.query_language = query_language
         self.query_speller = query_speller
-        self.chatgpt_token_limit = get_token_limit(chatgpt_model)
+        self.chatgpt_token_limit = get_token_limit(chatgpt_model, self.ALLOW_NON_GPT_MODELS)
 
     async def run(
         self,
@@ -121,6 +121,7 @@ async def run(
             few_shots=[{"role": "user", "content": self.question}, {"role": "assistant", "content": self.answer}],
             new_user_content=user_content,
             max_tokens=self.chatgpt_token_limit - response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )
 
         chat_completion = await self.openai_client.chat.completions.create(
diff --git a/app/backend/approaches/retrievethenreadvision.py b/app/backend/approaches/retrievethenreadvision.py
index 5e318ce8ea..cd0bf0d08d 100644
--- a/app/backend/approaches/retrievethenreadvision.py
+++ b/app/backend/approaches/retrievethenreadvision.py
@@ -66,7 +66,7 @@ def __init__(
         self.query_speller = query_speller
         self.vision_endpoint = vision_endpoint
         self.vision_token_provider = vision_token_provider
-        self.gpt4v_token_limit = get_token_limit(gpt4v_model)
+        self.gpt4v_token_limit = get_token_limit(gpt4v_model, self.ALLOW_NON_GPT_MODELS)
 
     async def run(
         self,
@@ -140,6 +140,7 @@ async def run(
             system_prompt=overrides.get("prompt_template", self.system_chat_template_gpt4v),
             new_user_content=user_content,
             max_tokens=self.gpt4v_token_limit - response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )
         chat_completion = await self.openai_client.chat.completions.create(
             model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model,
diff --git a/docs/localdev.md b/docs/localdev.md
index cb717b9d1b..39959db690 100644
--- a/docs/localdev.md
+++ b/docs/localdev.md
@@ -46,25 +46,59 @@ You may want to save costs by developing against a local LLM server, such as
 [llamafile](https://github.com/Mozilla-Ocho/llamafile/). Note that a local LLM
 will generally be slower and not as sophisticated.
 
-Once you've got your local LLM running and serving an OpenAI-compatible endpoint, set these environment variables:
+Once the local LLM server is running and serving an OpenAI-compatible endpoint, set these environment variables:
 
 ```shell
+azd env set USE_VECTORS false
 azd env set OPENAI_HOST local
 azd env set OPENAI_BASE_URL <your local endpoint>
+azd env set AZURE_OPENAI_CHATGPT_MODEL local-model-name
 ```
 
-For example, to point at a local llamafile server running on its default port:
+Then restart the local development server.
+You should now be able to use the "Ask" tab.
+
+⚠️ Limitations:
+
+- The "Chat" tab will only work if the local language model supports function calling.
+- Your search mode must be text only (no vectors), since the search index is only populated with OpenAI-generated embeddings, and the local OpenAI host can't generate those.
+- The conversation history will be truncated using the GPT tokenizers, which may not be the same as the local model's tokenizer, so if you have a long conversation, you may end up with token limit errors.
+
+> [!NOTE]
+> You must set `OPENAI_HOST` back to a non-local value ("azure", "azure_custom", or "openai")
+> before running `azd up` or `azd provision`, since the deployed backend can't access your local server.
+
+### Using Ollama server
+
+For example, to point at a local Ollama server running the `llama3.1:8b` model:
+
+```shell
+azd env set OPENAI_HOST local
+azd env set OPENAI_BASE_URL http://localhost:11434/v1
+azd env set AZURE_OPENAI_CHATGPT_MODEL llama3.1:8b
+azd env set USE_VECTORS false
+```
+
+If you're running the app inside a VS Code Dev Container, use this local URL instead:
+
+```shell
+azd env set OPENAI_BASE_URL http://host.docker.internal:11434/v1
+```
+
+### Using llamafile server
+
+To point at a local llamafile server running on its default port:
 
 ```shell
+azd env set OPENAI_HOST local
 azd env set OPENAI_BASE_URL http://localhost:8080/v1
+azd env set USE_VECTORS false
 ```
 
-If you're running inside a dev container, use this local URL instead:
+Llamafile does *not* require a model name to be specified.
+
+If you're running the app inside a VS Code Dev Container, use this local URL instead:
 
 ```shell
 azd env set OPENAI_BASE_URL http://host.docker.internal:8080/v1
 ```
-
-> [!NOTE]
-> You must set this back to a non-local value ("azure", "azure_custom", or "openai")
-> before running `azd up` or `azd provision`, since the deployed backend can't access your local server.
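For context on the two flags threaded through the approaches above: `get_token_limit` needs to know a model's context window, and `build_messages` needs a tokenizer for truncation, neither of which is available for an arbitrary local model name. The sketch below only illustrates the intended fallback behavior; it is not the helper library's actual implementation, and the model table and 4000-token minimum are assumed placeholder values.

```python
# Rough sketch of the fallback that ALLOW_NON_GPT_MODELS enables.
# Assumption: the real helper keeps a table of known GPT context windows
# plus a conservative minimum; the numbers below are placeholders.
MINIMUM_TOKEN_LIMIT = 4000  # assumed safe default for unknown/local models


def get_token_limit(model: str, default_to_minimum: bool = False) -> int:
    """Return the context window for a known GPT model, or fall back to a minimum."""
    known_limits = {"gpt-35-turbo": 4000, "gpt-4": 8100, "gpt-4o": 128000}  # assumed values
    if model in known_limits:
        return known_limits[model]
    if default_to_minimum:
        # Local models such as "llama3.1:8b" are not in the table; use the minimum instead of raising.
        return MINIMUM_TOKEN_LIMIT
    raise ValueError(f"Unknown model name: {model}")
```

`fallback_to_default` plays the analogous role in `build_messages`: when no model-specific tokenizer exists, truncation proceeds with a default GPT tokenizer rather than failing, which is why long conversations with a local model may still mis-count tokens, as noted in the docs change.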