Commit 9722c78

Changes for easier working with local models (#1992)
* Changes for easier working with local models
* Markdown lint
* Ollama docs
1 parent 162a36d commit 9722c78

File tree: 6 files changed (+55, -11 lines)


app/backend/approaches/approach.py

Lines changed: 5 additions & 0 deletions

@@ -91,6 +91,11 @@ class ThoughtStep:


 class Approach(ABC):
+
+    # Allows usage of non-GPT model even if no tokenizer is available for accurate token counting
+    # Useful for using local small language models, for example
+    ALLOW_NON_GPT_MODELS = True
+
     def __init__(
         self,
         search_client: SearchClient,
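The new flag is consumed by `get_token_limit` (and, in the files below, `build_messages`) from the token-counting helper the approaches already use. As a rough mental model only, with a table and minimum value that are assumptions rather than the helper's real values, the fallback behaves like this:

```python
# Illustrative sketch of the fallback idea only; not the actual implementation
# or values of the helper library's get_token_limit.
MODEL_TOKEN_LIMITS = {"gpt-35-turbo": 4000, "gpt-4": 8100}  # assumed table
MINIMUM_TOKEN_LIMIT = 4000  # assumed conservative floor for unknown models


def get_token_limit(model: str, default_to_minimum: bool = False) -> int:
    """Return the context window for a known model, or a conservative
    minimum for unknown (e.g. local) models when default_to_minimum is True."""
    if model in MODEL_TOKEN_LIMITS:
        return MODEL_TOKEN_LIMITS[model]
    if default_to_minimum:
        return MINIMUM_TOKEN_LIMIT
    raise ValueError(f"Model {model} is not recognized and no fallback is allowed")
```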

app/backend/approaches/chatreadretrieveread.py

Lines changed: 3 additions & 1 deletion

@@ -51,7 +51,7 @@ def __init__(
         self.content_field = content_field
         self.query_language = query_language
         self.query_speller = query_speller
-        self.chatgpt_token_limit = get_token_limit(chatgpt_model)
+        self.chatgpt_token_limit = get_token_limit(chatgpt_model, default_to_minimum=self.ALLOW_NON_GPT_MODELS)

     @property
     def system_message_chat_conversation(self):
@@ -133,6 +133,7 @@ async def run_until_final_call(
             past_messages=messages[:-1],
             new_user_content=user_query_request,
             max_tokens=self.chatgpt_token_limit - query_response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )

         chat_completion: ChatCompletion = await self.openai_client.chat.completions.create(
@@ -187,6 +188,7 @@ async def run_until_final_call(
             # Model does not handle lengthy system messages well. Moving sources to latest user conversation to solve follow up questions prompt.
             new_user_content=original_user_query + "\n\nSources:\n" + content,
             max_tokens=self.chatgpt_token_limit - response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )

         data_points = {"text": sources_content}

app/backend/approaches/chatreadretrievereadvision.py

Lines changed: 2 additions & 1 deletion

@@ -63,7 +63,7 @@ def __init__(
         self.query_speller = query_speller
         self.vision_endpoint = vision_endpoint
         self.vision_token_provider = vision_token_provider
-        self.chatgpt_token_limit = get_token_limit(gpt4v_model)
+        self.chatgpt_token_limit = get_token_limit(gpt4v_model, default_to_minimum=self.ALLOW_NON_GPT_MODELS)

     @property
     def system_message_chat_conversation(self):
@@ -188,6 +188,7 @@ async def run_until_final_call(
             past_messages=messages[:-1],
             new_user_content=user_content,
             max_tokens=self.chatgpt_token_limit - response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )

         data_points = {

app/backend/approaches/retrievethenread.py

Lines changed: 2 additions & 1 deletion

@@ -66,7 +66,7 @@ def __init__(
         self.content_field = content_field
         self.query_language = query_language
         self.query_speller = query_speller
-        self.chatgpt_token_limit = get_token_limit(chatgpt_model)
+        self.chatgpt_token_limit = get_token_limit(chatgpt_model, self.ALLOW_NON_GPT_MODELS)

     async def run(
         self,
@@ -121,6 +121,7 @@ async def run(
             few_shots=[{"role": "user", "content": self.question}, {"role": "assistant", "content": self.answer}],
             new_user_content=user_content,
             max_tokens=self.chatgpt_token_limit - response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )

         chat_completion = await self.openai_client.chat.completions.create(

app/backend/approaches/retrievethenreadvision.py

Lines changed: 2 additions & 1 deletion

@@ -66,7 +66,7 @@ def __init__(
         self.query_speller = query_speller
         self.vision_endpoint = vision_endpoint
         self.vision_token_provider = vision_token_provider
-        self.gpt4v_token_limit = get_token_limit(gpt4v_model)
+        self.gpt4v_token_limit = get_token_limit(gpt4v_model, self.ALLOW_NON_GPT_MODELS)

     async def run(
         self,
@@ -140,6 +140,7 @@ async def run(
             system_prompt=overrides.get("prompt_template", self.system_chat_template_gpt4v),
             new_user_content=user_content,
             max_tokens=self.gpt4v_token_limit - response_token_limit,
+            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
         )
         chat_completion = await self.openai_client.chat.completions.create(
             model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model,
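Across all of these approaches the prompt budget handed to `build_messages` is simply the model's context window minus the tokens reserved for the response, so the fallback only needs `get_token_limit` to return some number for an unrecognized local model. A toy calculation with assumed figures:

```python
# Toy figures for illustration; real values depend on the model and on the
# helper library's chosen minimum.
chatgpt_token_limit = 4000    # assumed fallback limit for an unknown local model
response_token_limit = 1024   # assumed tokens reserved for the answer

max_tokens = chatgpt_token_limit - response_token_limit
print(max_tokens)  # 2976 tokens left for system prompt, history, and sources
```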

docs/localdev.md

Lines changed: 41 additions & 7 deletions

@@ -46,25 +46,59 @@ You may want to save costs by developing against a local LLM server, such as
 [llamafile](https://github.com/Mozilla-Ocho/llamafile/). Note that a local LLM
 will generally be slower and not as sophisticated.

-Once you've got your local LLM running and serving an OpenAI-compatible endpoint, set these environment variables:
+Once the local LLM server is running and serving an OpenAI-compatible endpoint, set these environment variables:

 ```shell
+azd env set USE_VECTORS false
 azd env set OPENAI_HOST local
 azd env set OPENAI_BASE_URL <your local endpoint>
+azd env set AZURE_OPENAI_CHATGPT_MODEL local-model-name
 ```

-For example, to point at a local llamafile server running on its default port:
+Then restart the local development server.
+You should now be able to use the "Ask" tab.
+
+⚠️ Limitations:
+
+- The "Chat" tab will only work if the local language model supports function calling.
+- Your search mode must be text only (no vectors), since the search index is only populated with OpenAI-generated embeddings, and the local OpenAI host can't generate those.
+- The conversation history will be truncated using the GPT tokenizers, which may not be the same as the local model's tokenizer, so if you have a long conversation, you may end up with token limit errors.
+
+> [!NOTE]
+> You must set `OPENAI_HOST` back to a non-local value ("azure", "azure_custom", or "openai")
+> before running `azd up` or `azd provision`, since the deployed backend can't access your local server.
+
+### Using Ollama server
+
+For example, to point at a local Ollama server running the `llama3.1:8b` model:
+
+```shell
+azd env set OPENAI_HOST local
+azd env set OPENAI_BASE_URL http://localhost:11434/v1
+azd env set AZURE_OPENAI_CHATGPT_MODEL llama3.1:8b
+azd env set USE_VECTORS false
+```
+
+If you're running the app inside a VS Code Dev Container, use this local URL instead:
+
+```shell
+azd env set OPENAI_BASE_URL http://host.docker.internal:11434/v1
+```
+
+### Using llamafile server
+
+To point at a local llamafile server running on its default port:

 ```shell
+azd env set OPENAI_HOST local
 azd env set OPENAI_BASE_URL http://localhost:8080/v1
+azd env set USE_VECTORS false
 ```

-If you're running inside a dev container, use this local URL instead:
+Llamafile does *not* require a model name to be specified.
+
+If you're running the app inside a VS Code Dev Container, use this local URL instead:

 ```shell
 azd env set OPENAI_BASE_URL http://host.docker.internal:8080/v1
 ```
-
-> [!NOTE]
-> You must set this back to a non-local value ("azure", "azure_custom", or "openai")
-> before running `azd up` or `azd provision`, since the deployed backend can't access your local server.
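Before setting the azd environment variables above, it can be worth confirming that the local server really speaks the OpenAI-compatible protocol. A quick sanity check with the official `openai` Python package, assuming an Ollama server on its default port with `llama3.1:8b` pulled (for llamafile, swap the base URL for `http://localhost:8080/v1`):

```python
# Quick sanity check against a local OpenAI-compatible endpoint.
# Assumes Ollama is running on localhost:11434 with llama3.1:8b available;
# the API key is a placeholder because local servers don't validate it.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:11434/v1", api_key="nokey")

response = client.chat.completions.create(
    model="llama3.1:8b",
    messages=[{"role": "user", "content": "Reply with one short sentence."}],
)
print(response.choices[0].message.content)
```

If this returns a completion, the same endpoint should work once `OPENAI_HOST`, `OPENAI_BASE_URL`, and `AZURE_OPENAI_CHATGPT_MODEL` are set as shown above.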
