Merged
5 changes: 5 additions & 0 deletions app/backend/approaches/approach.py
@@ -91,6 +91,11 @@ class ThoughtStep:


class Approach(ABC):

# Allows usage of non-GPT models even if no tokenizer is available for accurate token counting
# Useful for local small language models, for example
ALLOW_NON_GPT_MODELS = True

def __init__(
self,
search_client: SearchClient,
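The flag added above lets an approach run against a model that the token helper doesn't recognize. As a minimal sketch of the behavior being opted into — assuming a hypothetical `get_token_limit` along these lines; the real helper in `openai_messages_token_helper` may differ — the function falls back to a conservative minimum instead of raising:

```python
# Hypothetical sketch of the default_to_minimum fallback, not the library's actual code.
MODEL_TOKEN_LIMITS = {"gpt-35-turbo": 4000, "gpt-4": 8100}  # illustrative numbers
MINIMUM_TOKEN_LIMIT = 4000  # conservative default for unknown (non-GPT) models


def get_token_limit(model: str, default_to_minimum: bool = False) -> int:
    """Return the context window for a known GPT model, or fall back for unknown models."""
    if model in MODEL_TOKEN_LIMITS:
        return MODEL_TOKEN_LIMITS[model]
    if default_to_minimum:
        # e.g. a local model such as "llama3.1:8b" that the helper doesn't know about
        return MINIMUM_TOKEN_LIMIT
    raise ValueError(f"Expected a known GPT model name, got: {model}")
```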
4 changes: 3 additions & 1 deletion app/backend/approaches/chatreadretrieveread.py
@@ -51,7 +51,7 @@ def __init__(
self.content_field = content_field
self.query_language = query_language
self.query_speller = query_speller
self.chatgpt_token_limit = get_token_limit(chatgpt_model)
self.chatgpt_token_limit = get_token_limit(chatgpt_model, default_to_minimum=self.ALLOW_NON_GPT_MODELS)

@property
def system_message_chat_conversation(self):
@@ -133,6 +133,7 @@ async def run_until_final_call(
past_messages=messages[:-1],
new_user_content=user_query_request,
max_tokens=self.chatgpt_token_limit - query_response_token_limit,
fallback_to_default=self.ALLOW_NON_GPT_MODELS,
)

chat_completion: ChatCompletion = await self.openai_client.chat.completions.create(
@@ -187,6 +188,7 @@ async def run_until_final_call(
# Model does not handle lengthy system messages well. Moving sources to latest user conversation to solve follow up questions prompt.
new_user_content=original_user_query + "\n\nSources:\n" + content,
max_tokens=self.chatgpt_token_limit - response_token_limit,
fallback_to_default=self.ALLOW_NON_GPT_MODELS,
)

data_points = {"text": sources_content}
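`build_messages` trims `past_messages` so the prompt stays within `max_tokens`, which requires counting tokens with a model-specific tokenizer. A hedged sketch of how the `fallback_to_default` flag might affect tokenizer selection for a non-GPT model (names and logic are illustrative, not the library's actual code):

```python
import tiktoken


def encoding_for_model(model: str, fallback_to_default: bool = False) -> tiktoken.Encoding:
    """Pick a tokenizer for token counting, approximating when the model is unknown."""
    try:
        return tiktoken.encoding_for_model(model)
    except KeyError:
        if fallback_to_default:
            # Counts will only be approximate for a local model such as "llama3.1:8b"
            return tiktoken.get_encoding("cl100k_base")
        raise
```

This approximation is also why the docs change below warns that long conversations against a local model may still hit token-limit errors.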
3 changes: 2 additions & 1 deletion app/backend/approaches/chatreadretrievereadvision.py
@@ -63,7 +63,7 @@ def __init__(
self.query_speller = query_speller
self.vision_endpoint = vision_endpoint
self.vision_token_provider = vision_token_provider
self.chatgpt_token_limit = get_token_limit(gpt4v_model)
self.chatgpt_token_limit = get_token_limit(gpt4v_model, default_to_minimum=self.ALLOW_NON_GPT_MODELS)

@property
def system_message_chat_conversation(self):
@@ -188,6 +188,7 @@ async def run_until_final_call(
past_messages=messages[:-1],
new_user_content=user_content,
max_tokens=self.chatgpt_token_limit - response_token_limit,
fallback_to_default=self.ALLOW_NON_GPT_MODELS,
)

data_points = {
3 changes: 2 additions & 1 deletion app/backend/approaches/retrievethenread.py
@@ -66,7 +66,7 @@ def __init__(
self.content_field = content_field
self.query_language = query_language
self.query_speller = query_speller
self.chatgpt_token_limit = get_token_limit(chatgpt_model)
self.chatgpt_token_limit = get_token_limit(chatgpt_model, self.ALLOW_NON_GPT_MODELS)

async def run(
self,
@@ -121,6 +121,7 @@ async def run(
few_shots=[{"role": "user", "content": self.question}, {"role": "assistant", "content": self.answer}],
new_user_content=user_content,
max_tokens=self.chatgpt_token_limit - response_token_limit,
fallback_to_default=self.ALLOW_NON_GPT_MODELS,
)

chat_completion = await self.openai_client.chat.completions.create(
3 changes: 2 additions & 1 deletion app/backend/approaches/retrievethenreadvision.py
@@ -66,7 +66,7 @@ def __init__(
self.query_speller = query_speller
self.vision_endpoint = vision_endpoint
self.vision_token_provider = vision_token_provider
self.gpt4v_token_limit = get_token_limit(gpt4v_model)
self.gpt4v_token_limit = get_token_limit(gpt4v_model, self.ALLOW_NON_GPT_MODELS)

async def run(
self,
@@ -140,6 +140,7 @@ async def run(
system_prompt=overrides.get("prompt_template", self.system_chat_template_gpt4v),
new_user_content=user_content,
max_tokens=self.gpt4v_token_limit - response_token_limit,
fallback_to_default=self.ALLOW_NON_GPT_MODELS,
)
chat_completion = await self.openai_client.chat.completions.create(
model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model,
48 changes: 41 additions & 7 deletions docs/localdev.md
@@ -46,25 +46,59 @@ You may want to save costs by developing against a local LLM server, such as
[llamafile](https://github.com/Mozilla-Ocho/llamafile/). Note that a local LLM
will generally be slower and not as sophisticated.

Once you've got your local LLM running and serving an OpenAI-compatible endpoint, set these environment variables:
Once the local LLM server is running and serving an OpenAI-compatible endpoint, set these environment variables:

```shell
azd env set USE_VECTORS false
azd env set OPENAI_HOST local
azd env set OPENAI_BASE_URL <your local endpoint>
azd env set AZURE_OPENAI_CHATGPT_MODEL local-model-name
```

For example, to point at a local llamafile server running on its default port:
Then restart the local development server.
You should now be able to use the "Ask" tab.
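
For example, assuming you use the repository's standard start script (adjust to however you normally launch the backend):

```shell
./app/start.sh
```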

⚠️ Limitations:

- The "Chat" tab will only work if the local language model supports function calling.
- Your search mode must be text only (no vectors), since the search index is only populated with OpenAI-generated embeddings, and the local OpenAI host can't generate those.
- The conversation history will be truncated using a GPT tokenizer, which may not match the local model's tokenizer, so long conversations may hit token limit errors.

> [!NOTE]
> You must set `OPENAI_HOST` back to a non-local value ("azure", "azure_custom", or "openai")
> before running `azd up` or `azd provision`, since the deployed backend can't access your local server.
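
For example, to point the app back at Azure OpenAI before provisioning (assuming the standard "azure" host):

```shell
azd env set OPENAI_HOST azure
```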

### Using Ollama server

For example, to point at a local Ollama server running the `llama3.1:8b` model:

```shell
azd env set OPENAI_HOST local
azd env set OPENAI_BASE_URL http://localhost:11434/v1
azd env set AZURE_OPENAI_CHATGPT_MODEL llama3.1:8b
azd env set USE_VECTORS false
```
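
If you haven't downloaded the model yet, pull it first, and optionally confirm that Ollama's OpenAI-compatible endpoint responds (both commands assume a default local Ollama install):

```shell
ollama pull llama3.1:8b
curl http://localhost:11434/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "llama3.1:8b", "messages": [{"role": "user", "content": "Hello"}]}'
```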

If you're running the app inside a VS Code Dev Container, use this local URL instead:

```shell
azd env set OPENAI_BASE_URL http://host.docker.internal:11434/v1
```

### Using llamafile server

To point at a local llamafile server running on its default port:

```shell
azd env set OPENAI_HOST local
azd env set OPENAI_BASE_URL http://localhost:8080/v1
azd env set USE_VECTORS false
```

If you're running inside a dev container, use this local URL instead:
Llamafile does *not* require a model name to be specified.

If you're running the app inside a VS Code Dev Container, use this local URL instead:

```shell
azd env set OPENAI_BASE_URL http://host.docker.internal:8080/v1
```

> [!NOTE]
> You must set this back to a non-local value ("azure", "azure_custom", or "openai")
> before running `azd up` or `azd provision`, since the deployed backend can't access your local server.