Skip to content

Commit 6cc5d19

Browse files
Add support for reasoning models and token usage display (#2448)
* WIP * WIP * ruff, black * adding usage * mypy * ruff, black * mypy, ruff, black, and update generate thought steps * fix comments, set answer thought tag on streaming approaches * fixing frontend * fixing backend + frontend * token graph fixup * fix token usage for non-streaming response * re-style token graph * updates * addressing feedback * ruff, black * prettify * fixing typing errors * ruff, black * mypy, ruff * prettier * trying to fix test failures * fixing test failures * fix streaming allowed setup * black * try to fix e2e test * add tests; updating env vars * adding tests * rerecording * more recording * add tests; ruff, black * run prettier * update docs * fix test * fix linter * Fleshed out readme * Adding reasoning to deploy features * Make changes from Pamela's feedback * undangle that comma --------- Co-authored-by: Pamela Fox <[email protected]>
1 parent 9afbefa commit 6cc5d19

File tree

104 files changed

+1847
-224
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

104 files changed

+1847
-224
lines changed

.azdo/pipelines/azure-dev.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,7 @@ steps:
6969
AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY: $(AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY)
7070
AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION: $(AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION)
7171
AZURE_OPENAI_CHATGPT_DEPLOYMENT_SKU: $(AZURE_OPENAI_CHATGPT_DEPLOYMENT_SKU)
72+
AZURE_OPENAI_REASONING_EFFORT: $(AZURE_OPENAI_REASONING_EFFORT)
7273
AZURE_OPENAI_EMB_MODEL_NAME: $(AZURE_OPENAI_EMB_MODEL_NAME)
7374
AZURE_OPENAI_EMB_DEPLOYMENT: $(AZURE_OPENAI_EMB_DEPLOYMENT)
7475
AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY: $(AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY)

.github/workflows/azure-dev.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,7 @@ jobs:
6060
AZURE_OPENAI_CHATGPT_DEPLOYMENT: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT }}
6161
AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY }}
6262
AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION }}
63+
AZURE_OPENAI_REASONING_EFFORT: ${{ vars.AZURE_OPENAI_REASONING_EFFORT }}
6364
AZURE_OPENAI_EMB_MODEL_NAME: ${{ vars.AZURE_OPENAI_EMB_MODEL_NAME }}
6465
AZURE_OPENAI_EMB_DEPLOYMENT: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT }}
6566
AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY }}

README.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -256,6 +256,7 @@ You can find extensive documentation in the [docs](docs/README.md) folder:
256256
- [All features](docs/deploy_features.md)
257257
- [Login and access control](docs/login_and_acl.md)
258258
- [GPT-4 Turbo with Vision](docs/gpt4v.md)
259+
- [Reasoning](docs/reasoning.md)
259260
- [Private endpoints](docs/deploy_private.md)
260261
- [Sharing deployment environments](docs/sharing_environments.md)
261262
- [Local development](docs/localdev.md)

app/backend/app.py

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,11 +65,13 @@
6565
CONFIG_CHAT_HISTORY_COSMOS_ENABLED,
6666
CONFIG_CHAT_VISION_APPROACH,
6767
CONFIG_CREDENTIAL,
68+
CONFIG_DEFAULT_REASONING_EFFORT,
6869
CONFIG_GPT4V_DEPLOYED,
6970
CONFIG_INGESTER,
7071
CONFIG_LANGUAGE_PICKER_ENABLED,
7172
CONFIG_OPENAI_CLIENT,
7273
CONFIG_QUERY_REWRITING_ENABLED,
74+
CONFIG_REASONING_EFFORT_ENABLED,
7375
CONFIG_SEARCH_CLIENT,
7476
CONFIG_SEMANTIC_RANKER_DEPLOYED,
7577
CONFIG_SPEECH_INPUT_ENABLED,
@@ -79,6 +81,7 @@
7981
CONFIG_SPEECH_SERVICE_LOCATION,
8082
CONFIG_SPEECH_SERVICE_TOKEN,
8183
CONFIG_SPEECH_SERVICE_VOICE,
84+
CONFIG_STREAMING_ENABLED,
8285
CONFIG_USER_BLOB_CONTAINER_CLIENT,
8386
CONFIG_USER_UPLOAD_ENABLED,
8487
CONFIG_VECTOR_SEARCH_ENABLED,
@@ -293,6 +296,9 @@ def config():
293296
"showGPT4VOptions": current_app.config[CONFIG_GPT4V_DEPLOYED],
294297
"showSemanticRankerOption": current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED],
295298
"showQueryRewritingOption": current_app.config[CONFIG_QUERY_REWRITING_ENABLED],
299+
"showReasoningEffortOption": current_app.config[CONFIG_REASONING_EFFORT_ENABLED],
300+
"streamingEnabled": current_app.config[CONFIG_STREAMING_ENABLED],
301+
"defaultReasoningEffort": current_app.config[CONFIG_DEFAULT_REASONING_EFFORT],
296302
"showVectorOption": current_app.config[CONFIG_VECTOR_SEARCH_ENABLED],
297303
"showUserUpload": current_app.config[CONFIG_USER_UPLOAD_ENABLED],
298304
"showLanguagePicker": current_app.config[CONFIG_LANGUAGE_PICKER_ENABLED],
@@ -423,6 +429,7 @@ async def setup_clients():
423429
OPENAI_CHATGPT_MODEL = os.environ["AZURE_OPENAI_CHATGPT_MODEL"]
424430
OPENAI_EMB_MODEL = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-ada-002")
425431
OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS") or 1536)
432+
OPENAI_REASONING_EFFORT = os.getenv("AZURE_OPENAI_REASONING_EFFORT")
426433
# Used with Azure OpenAI deployments
427434
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
428435
AZURE_OPENAI_GPT4V_DEPLOYMENT = os.environ.get("AZURE_OPENAI_GPT4V_DEPLOYMENT")
@@ -640,6 +647,13 @@ async def setup_clients():
640647
current_app.config[CONFIG_QUERY_REWRITING_ENABLED] = (
641648
AZURE_SEARCH_QUERY_REWRITING == "true" and AZURE_SEARCH_SEMANTIC_RANKER != "disabled"
642649
)
650+
current_app.config[CONFIG_DEFAULT_REASONING_EFFORT] = OPENAI_REASONING_EFFORT
651+
current_app.config[CONFIG_REASONING_EFFORT_ENABLED] = OPENAI_CHATGPT_MODEL in Approach.GPT_REASONING_MODELS
652+
current_app.config[CONFIG_STREAMING_ENABLED] = (
653+
bool(USE_GPT4V)
654+
or OPENAI_CHATGPT_MODEL not in Approach.GPT_REASONING_MODELS
655+
or Approach.GPT_REASONING_MODELS[OPENAI_CHATGPT_MODEL].streaming
656+
)
643657
current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] = os.getenv("USE_VECTORS", "").lower() != "false"
644658
current_app.config[CONFIG_USER_UPLOAD_ENABLED] = bool(USE_USER_UPLOAD)
645659
current_app.config[CONFIG_LANGUAGE_PICKER_ENABLED] = ENABLE_LANGUAGE_PICKER
@@ -667,6 +681,7 @@ async def setup_clients():
667681
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
668682
query_speller=AZURE_SEARCH_QUERY_SPELLER,
669683
prompt_manager=prompt_manager,
684+
reasoning_effort=OPENAI_REASONING_EFFORT,
670685
)
671686

672687
# ChatReadRetrieveReadApproach is used by /chat for multi-turn conversation
@@ -684,12 +699,26 @@ async def setup_clients():
684699
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
685700
query_speller=AZURE_SEARCH_QUERY_SPELLER,
686701
prompt_manager=prompt_manager,
702+
reasoning_effort=OPENAI_REASONING_EFFORT,
687703
)
688704

689705
if USE_GPT4V:
690706
current_app.logger.info("USE_GPT4V is true, setting up GPT4V approach")
691707
if not AZURE_OPENAI_GPT4V_MODEL:
692708
raise ValueError("AZURE_OPENAI_GPT4V_MODEL must be set when USE_GPT4V is true")
709+
if any(
710+
model in Approach.GPT_REASONING_MODELS
711+
for model in [
712+
OPENAI_CHATGPT_MODEL,
713+
AZURE_OPENAI_GPT4V_MODEL,
714+
AZURE_OPENAI_CHATGPT_DEPLOYMENT,
715+
AZURE_OPENAI_GPT4V_DEPLOYMENT,
716+
]
717+
):
718+
raise ValueError(
719+
"AZURE_OPENAI_CHATGPT_MODEL and AZURE_OPENAI_GPT4V_MODEL must not be a reasoning model when USE_GPT4V is true"
720+
)
721+
693722
token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
694723

695724
current_app.config[CONFIG_ASK_VISION_APPROACH] = RetrieveThenReadVisionApproach(

app/backend/approaches/approach.py

Lines changed: 140 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,9 +6,11 @@
66
AsyncGenerator,
77
Awaitable,
88
Callable,
9+
Dict,
910
List,
1011
Optional,
1112
TypedDict,
13+
Union,
1214
cast,
1315
)
1416
from urllib.parse import urljoin
@@ -21,8 +23,15 @@
2123
VectorizedQuery,
2224
VectorQuery,
2325
)
24-
from openai import AsyncOpenAI
25-
from openai.types.chat import ChatCompletionMessageParam
26+
from openai import AsyncOpenAI, AsyncStream
27+
from openai.types import CompletionUsage
28+
from openai.types.chat import (
29+
ChatCompletion,
30+
ChatCompletionChunk,
31+
ChatCompletionMessageParam,
32+
ChatCompletionReasoningEffort,
33+
ChatCompletionToolParam,
34+
)
2635

2736
from approaches.promptmanager import PromptManager
2837
from core.authentication import AuthenticationHelper
@@ -89,8 +98,59 @@ class ThoughtStep:
8998
description: Optional[Any]
9099
props: Optional[dict[str, Any]] = None
91100

101+
def update_token_usage(self, usage: CompletionUsage) -> None:
102+
if self.props:
103+
self.props["token_usage"] = TokenUsageProps.from_completion_usage(usage)
104+
105+
106+
@dataclass
107+
class DataPoints:
108+
text: Optional[List[str]] = None
109+
images: Optional[List] = None
110+
111+
112+
@dataclass
113+
class ExtraInfo:
114+
data_points: DataPoints
115+
thoughts: Optional[List[ThoughtStep]] = None
116+
followup_questions: Optional[List[Any]] = None
117+
118+
119+
@dataclass
120+
class TokenUsageProps:
121+
prompt_tokens: int
122+
completion_tokens: int
123+
reasoning_tokens: Optional[int]
124+
total_tokens: int
125+
126+
@classmethod
127+
def from_completion_usage(cls, usage: CompletionUsage) -> "TokenUsageProps":
128+
return cls(
129+
prompt_tokens=usage.prompt_tokens,
130+
completion_tokens=usage.completion_tokens,
131+
reasoning_tokens=(
132+
usage.completion_tokens_details.reasoning_tokens if usage.completion_tokens_details else None
133+
),
134+
total_tokens=usage.total_tokens,
135+
)
136+
137+
138+
# GPT reasoning models don't support the same set of parameters as other models
139+
# https://learn.microsoft.com/azure/ai-services/openai/how-to/reasoning
140+
@dataclass
141+
class GPTReasoningModelSupport:
142+
streaming: bool
143+
92144

93145
class Approach(ABC):
146+
# List of GPT reasoning models support
147+
GPT_REASONING_MODELS = {
148+
"o1": GPTReasoningModelSupport(streaming=False),
149+
"o3-mini": GPTReasoningModelSupport(streaming=True),
150+
}
151+
# Set a higher token limit for GPT reasoning models
152+
RESPONSE_DEFAULT_TOKEN_LIMIT = 1024
153+
RESPONSE_REASONING_DEFAULT_TOKEN_LIMIT = 8192
94154

95155
def __init__(
96156
self,
@@ -106,6 +166,7 @@ def __init__(
106166
vision_endpoint: str,
107167
vision_token_provider: Callable[[], Awaitable[str]],
108168
prompt_manager: PromptManager,
169+
reasoning_effort: Optional[str] = None,
109170
):
110171
self.search_client = search_client
111172
self.openai_client = openai_client
@@ -119,6 +180,8 @@ def __init__(
119180
self.vision_endpoint = vision_endpoint
120181
self.vision_token_provider = vision_token_provider
121182
self.prompt_manager = prompt_manager
183+
self.reasoning_effort = reasoning_effort
184+
self.include_token_usage = True
122185

123186
def build_filter(self, overrides: dict[str, Any], auth_claims: dict[str, Any]) -> Optional[str]:
124187
include_category = overrides.get("include_category")
@@ -281,6 +344,81 @@ def get_system_prompt_variables(self, override_prompt: Optional[str]) -> dict[st
281344
else:
282345
return {"override_prompt": override_prompt}
283346

347+
def get_response_token_limit(self, model: str, default_limit: int) -> int:
348+
if model in self.GPT_REASONING_MODELS:
349+
return self.RESPONSE_REASONING_DEFAULT_TOKEN_LIMIT
350+
351+
return default_limit
352+
353+
def create_chat_completion(
354+
self,
355+
chatgpt_deployment: Optional[str],
356+
chatgpt_model: str,
357+
messages: list[ChatCompletionMessageParam],
358+
overrides: dict[str, Any],
359+
response_token_limit: int,
360+
should_stream: bool = False,
361+
tools: Optional[List[ChatCompletionToolParam]] = None,
362+
temperature: Optional[float] = None,
363+
n: Optional[int] = None,
364+
reasoning_effort: Optional[ChatCompletionReasoningEffort] = None,
365+
) -> Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]:
366+
if chatgpt_model in self.GPT_REASONING_MODELS:
367+
params: Dict[str, Any] = {
368+
# max_tokens is not supported
369+
"max_completion_tokens": response_token_limit
370+
}
371+
372+
# Adjust parameters for reasoning models
373+
supported_features = self.GPT_REASONING_MODELS[chatgpt_model]
374+
if supported_features.streaming and should_stream:
375+
params["stream"] = True
376+
params["stream_options"] = {"include_usage": True}
377+
params["reasoning_effort"] = reasoning_effort or overrides.get("reasoning_effort") or self.reasoning_effort
378+
379+
else:
380+
# Include parameters that may not be supported for reasoning models
381+
params = {
382+
"max_tokens": response_token_limit,
383+
"temperature": temperature or overrides.get("temperature", 0.3),
384+
}
385+
if should_stream:
386+
params["stream"] = True
387+
params["stream_options"] = {"include_usage": True}
388+
389+
params["tools"] = tools
390+
391+
# Azure OpenAI takes the deployment name as the model name
392+
return self.openai_client.chat.completions.create(
393+
model=chatgpt_deployment if chatgpt_deployment else chatgpt_model,
394+
messages=messages,
395+
seed=overrides.get("seed", None),
396+
n=n or 1,
397+
**params,
398+
)
399+
400+
def format_thought_step_for_chatcompletion(
401+
self,
402+
title: str,
403+
messages: List[ChatCompletionMessageParam],
404+
overrides: dict[str, Any],
405+
model: str,
406+
deployment: Optional[str],
407+
usage: Optional[CompletionUsage] = None,
408+
reasoning_effort: Optional[ChatCompletionReasoningEffort] = None,
409+
) -> ThoughtStep:
410+
properties: Dict[str, Any] = {"model": model}
411+
if deployment:
412+
properties["deployment"] = deployment
413+
# Only add reasoning_effort setting if the model supports it
414+
if model in self.GPT_REASONING_MODELS:
415+
properties["reasoning_effort"] = reasoning_effort or overrides.get(
416+
"reasoning_effort", self.reasoning_effort
417+
)
418+
if usage:
419+
properties["token_usage"] = TokenUsageProps.from_completion_usage(usage)
420+
return ThoughtStep(title, messages, properties)
421+
284422
async def run(
285423
self,
286424
messages: list[ChatCompletionMessageParam],

app/backend/approaches/chatapproach.py

Lines changed: 32 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,29 @@
11
import json
22
import re
33
from abc import ABC, abstractmethod
4-
from typing import Any, AsyncGenerator, Optional
4+
from typing import Any, AsyncGenerator, Awaitable, Optional, Union, cast
55

6-
from openai.types.chat import ChatCompletion, ChatCompletionMessageParam
6+
from openai import AsyncStream
7+
from openai.types.chat import (
8+
ChatCompletion,
9+
ChatCompletionChunk,
10+
ChatCompletionMessageParam,
11+
)
712

8-
from approaches.approach import Approach
13+
from approaches.approach import (
14+
Approach,
15+
ExtraInfo,
16+
)
917

1018

1119
class ChatApproach(Approach, ABC):
1220

1321
NO_RESPONSE = "0"
1422

1523
@abstractmethod
16-
async def run_until_final_call(self, messages, overrides, auth_claims, should_stream) -> tuple:
24+
async def run_until_final_call(
25+
self, messages, overrides, auth_claims, should_stream
26+
) -> tuple[ExtraInfo, Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]]:
1727
pass
1828

1929
def get_search_query(self, chat_completion: ChatCompletion, user_query: str):
@@ -49,12 +59,15 @@ async def run_without_streaming(
4959
extra_info, chat_coroutine = await self.run_until_final_call(
5060
messages, overrides, auth_claims, should_stream=False
5161
)
52-
chat_completion_response: ChatCompletion = await chat_coroutine
62+
chat_completion_response: ChatCompletion = await cast(Awaitable[ChatCompletion], chat_coroutine)
5363
content = chat_completion_response.choices[0].message.content
5464
role = chat_completion_response.choices[0].message.role
5565
if overrides.get("suggest_followup_questions"):
5666
content, followup_questions = self.extract_followup_questions(content)
57-
extra_info["followup_questions"] = followup_questions
67+
extra_info.followup_questions = followup_questions
68+
# Assume last thought is for generating answer
69+
if self.include_token_usage and extra_info.thoughts and chat_completion_response.usage:
70+
extra_info.thoughts[-1].update_token_usage(chat_completion_response.usage)
5871
chat_app_response = {
5972
"message": {"content": content, "role": role},
6073
"context": extra_info,
@@ -72,6 +85,7 @@ async def run_with_streaming(
7285
extra_info, chat_coroutine = await self.run_until_final_call(
7386
messages, overrides, auth_claims, should_stream=True
7487
)
88+
chat_coroutine = cast(Awaitable[AsyncStream[ChatCompletionChunk]], chat_coroutine)
7589
yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state}
7690

7791
followup_questions_started = False
@@ -80,6 +94,7 @@ async def run_with_streaming(
8094
# "2023-07-01-preview" API version has a bug where first response has empty choices
8195
event = event_chunk.model_dump() # Convert pydantic model to dict
8296
if event["choices"]:
97+
# No usage during streaming
8398
completion = {
8499
"delta": {
85100
"content": event["choices"][0]["delta"].get("content"),
@@ -100,9 +115,19 @@ async def run_with_streaming(
100115
followup_content += content
101116
else:
102117
yield completion
118+
else:
119+
# Final chunk at end of streaming should contain usage
120+
# https://cookbook.openai.com/examples/how_to_stream_completions#4-how-to-get-token-usage-data-for-streamed-chat-completion-response
121+
if event_chunk.usage and extra_info.thoughts and self.include_token_usage:
122+
extra_info.thoughts[-1].update_token_usage(event_chunk.usage)
123+
yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state}
124+
103125
if followup_content:
104126
_, followup_questions = self.extract_followup_questions(followup_content)
105-
yield {"delta": {"role": "assistant"}, "context": {"followup_questions": followup_questions}}
127+
yield {
128+
"delta": {"role": "assistant"},
129+
"context": {"context": extra_info, "followup_questions": followup_questions},
130+
}
106131

107132
async def run(
108133
self,

0 commit comments

Comments (0)