
Commit c4086c2

mypy fixes and reasoning fixes
1 parent 9223611 commit c4086c2

9 files changed: +64 -26 lines changed


app/backend/approaches/approach.py

Lines changed: 2 additions & 0 deletions

@@ -150,7 +150,9 @@ class Approach(ABC):
     # List of GPT reasoning models support
     GPT_REASONING_MODELS = {
         "o1": GPTReasoningModelSupport(streaming=False),
+        "o3": GPTReasoningModelSupport(streaming=True),
         "o3-mini": GPTReasoningModelSupport(streaming=True),
+        "o4-mini": GPTReasoningModelSupport(streaming=True),
     }
     # Set a higher token limit for GPT reasoning models
     RESPONSE_DEFAULT_TOKEN_LIMIT = 1024
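
For orientation only, here is a minimal sketch (not the repository's code) of how a registry like `GPT_REASONING_MODELS` can be consulted to decide whether streaming is allowed for a given model; the `supports_streaming` helper and the exact-match lookup are assumptions.

```python
from dataclasses import dataclass


@dataclass
class GPTReasoningModelSupport:
    streaming: bool


# Mirrors the registry in the diff above; the lookup helper below is hypothetical
GPT_REASONING_MODELS = {
    "o1": GPTReasoningModelSupport(streaming=False),
    "o3": GPTReasoningModelSupport(streaming=True),
    "o3-mini": GPTReasoningModelSupport(streaming=True),
    "o4-mini": GPTReasoningModelSupport(streaming=True),
}


def supports_streaming(model: str) -> bool:
    # Unknown (non-reasoning) models are treated as non-streaming here for illustration
    support = GPT_REASONING_MODELS.get(model)
    return support.streaming if support is not None else False


print(supports_streaming("o4-mini"))  # True
print(supports_streaming("o1"))       # False
```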

app/backend/prepdocslib/mediadescriber.py

Lines changed: 2 additions & 1 deletion

@@ -1,6 +1,7 @@
 import base64
 import logging
 from abc import ABC
+from typing import Optional

 import aiohttp
 from azure.core.credentials_async import AsyncTokenCredential
@@ -109,7 +110,7 @@ async def describe_image(self, image_bytes: bytes) -> str:


 class MultimodalModelDescriber(MediaDescriber):
-    def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: str):
+    def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: Optional[str] = None):
         self.openai_client = openai_client
         self.model = model
         self.deployment = deployment
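
As a usage illustration only (not part of this commit), here is a sketch of how the now-optional `deployment` might be consumed: Azure OpenAI callers pass the deployment name where the `model` argument goes, while openai.com callers have no deployment and fall back to the model name. The request body below is a simplified assumption, not the file's actual `describe_image` implementation.

```python
import base64
from typing import Optional

from openai import AsyncOpenAI


class MultimodalModelDescriberSketch:
    def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: Optional[str] = None):
        self.openai_client = openai_client
        self.model = model
        self.deployment = deployment

    async def describe_image(self, image_bytes: bytes) -> str:
        # Azure OpenAI expects the deployment name in the `model` field;
        # when no deployment is configured, use the plain model name instead.
        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
        response = await self.openai_client.chat.completions.create(
            model=self.deployment or self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Describe this image in detail."},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                    ],
                }
            ],
        )
        return response.choices[0].message.content or ""
```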

app/backend/prepdocslib/page.py

Lines changed: 4 additions & 3 deletions

@@ -1,16 +1,17 @@
 from dataclasses import dataclass, field
+from typing import Optional


 @dataclass
 class ImageOnPage:
     bytes: bytes
-    bbox: list[float, float, float, float]  # Pixels
+    bbox: tuple[float, float, float, float]  # Pixels
     filename: str
     description: str
     figure_id: str
     page_num: int  # 0-indexed
-    url: str | None = None
-    embedding: list[float] | None = None
+    url: Optional[str] = None
+    embedding: Optional[list[float]] = None


 @dataclass

app/backend/prepdocslib/pdfparser.py

Lines changed: 15 additions & 10 deletions

@@ -1,6 +1,7 @@
 import html
 import io
 import logging
+import uuid
 from collections.abc import AsyncGenerator
 from enum import Enum
 from typing import IO, Optional, Union
@@ -221,23 +222,27 @@ class ObjectType(Enum):
         offset += len(page_text)

     @staticmethod
-    async def process_figure(doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber) -> str:
+    async def process_figure(
+        doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber
+    ) -> ImageOnPage:
         figure_title = (figure.caption and figure.caption.content) or ""
-        figure_filename = f"figure{figure.id.replace('.', '_')}.png"
+        # Generate a random UUID if figure.id is None
+        figure_id = figure.id or f"fig_{uuid.uuid4().hex[:8]}"
+        figure_filename = f"figure{figure_id.replace('.', '_')}.png"
         logger.info(
-            "Describing figure %s with title '%s' using %s", figure.id, figure_title, type(media_describer).__name__
+            "Describing figure %s with title '%s' using %s", figure_id, figure_title, type(media_describer).__name__
         )
         if not figure.bounding_regions:
             return ImageOnPage(
                 bytes=b"",
                 page_num=0,  # 0-indexed
-                figure_id=figure.id,
-                bbox=[0, 0, 0, 0],
+                figure_id=figure_id,
+                bbox=(0, 0, 0, 0),
                 filename=figure_filename,
                 description=f"<figure><figcaption>{figure_title}</figcaption></figure>",
             )
         if len(figure.bounding_regions) > 1:
-            logger.warning("Figure %s has more than one bounding region, using the first one", figure.id)
+            logger.warning("Figure %s has more than one bounding region, using the first one", figure_id)
         first_region = figure.bounding_regions[0]
         # To learn more about bounding regions, see https://aka.ms/bounding-region
         bounding_box = (
@@ -252,7 +257,7 @@ async def process_figure(doc: pymupdf.Document, figure: DocumentFigure, media_de
         return ImageOnPage(
             bytes=cropped_img,
             page_num=page_number - 1,  # Convert to 0-indexed
-            figure_id=figure.id,
+            figure_id=figure_id,
             bbox=bbox_pixels,
             filename=figure_filename,
             description=f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>",
@@ -282,18 +287,18 @@ def table_to_html(table: DocumentTable):
     @staticmethod
     def crop_image_from_pdf_page(
         doc: pymupdf.Document, page_number: int, bbox_inches: tuple[float, float, float, float]
-    ) -> tuple[bytes, list[float]]:
+    ) -> tuple[bytes, tuple[float, float, float, float]]:
         """
         Crops a region from a given page in a PDF and returns it as an image.

         :param pdf_path: Path to the PDF file.
         :param page_number: The page number to crop from (0-indexed).
         :param bbox_inches: A tuple of (x0, y0, x1, y1) coordinates for the bounding box, in inches.
-        :return: A PIL Image of the cropped area.
+        :return: A tuple of (image_bytes, bbox_pixels).
         """
         # Scale the bounding box to 72 DPI
         bbox_dpi = 72
-        bbox_pixels = [x * bbox_dpi for x in bbox_inches]
+        bbox_pixels = tuple(x * bbox_dpi for x in bbox_inches)  # Convert to tuple
         rect = pymupdf.Rect(bbox_pixels)
         # Assume that the PDF has 300 DPI,
         # and use the matrix to convert between the 2 DPIs
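
The comments in this hunk describe converting the bounding box from inches to 72-DPI points and then rendering at an assumed 300 DPI via a transform matrix. The following is a minimal sketch of that conversion with PyMuPDF, assuming `page.get_pixmap` with a clip rectangle is how the crop is produced; it is not the file's exact implementation.

```python
import pymupdf  # PyMuPDF


def crop_image_from_pdf_page_sketch(
    doc: pymupdf.Document, page_number: int, bbox_inches: tuple[float, float, float, float]
) -> tuple[bytes, tuple[float, ...]]:
    # PDF user space is 72 points per inch, so inches * 72 gives page coordinates
    bbox_dpi = 72
    bbox_pixels = tuple(x * bbox_dpi for x in bbox_inches)
    rect = pymupdf.Rect(bbox_pixels)

    # Render the clipped region at an assumed 300 DPI by scaling 300/72 in both axes
    page_dpi = 300
    zoom = page_dpi / bbox_dpi
    page = doc[page_number]
    pix = page.get_pixmap(matrix=pymupdf.Matrix(zoom, zoom), clip=rect)
    return pix.tobytes("png"), bbox_pixels
```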

docs/deploy_features.md

Lines changed: 1 addition & 3 deletions

@@ -135,14 +135,12 @@ This process does *not* delete your previous model deployment. If you want to de

 ## Using reasoning models

-⚠️ This feature is not currently compatible with [multimodal feature](./multimodal.md). TODO: OR IS IT?
-
 This feature allows you to use reasoning models to generate responses based on retrieved content. These models spend more time processing and understanding the user's request.
 To enable reasoning models, follow the steps in [the reasoning models guide](./reasoning.md).

 ## Using agentic retrieval

-⚠️ This feature is not currently compatible with [multimodal feature](./multimodal.md). TODO: OR IS IT?
+⚠️ This feature is not fully compatible with [multimodal feature](./multimodal.md).

 This feature allows you to use agentic retrieval in place of the Search API. To enable agentic retrieval, follow the steps in [the agentic retrieval guide](./agentic_retrieval.md)

docs/multimodal.md

Lines changed: 7 additions & 0 deletions

@@ -104,3 +104,10 @@ For more details on how this feature works, read [this blog post](https://techco

 You can also modify those settings in the "Developer Settings" in the chat UI,
 to experiment with different options before committing to them.
+
+## Compatibility
+
+* This feature is not fully compatible with the [agentic retrieval](./agentic_retrieval.md) feature.
+The agent *will* perform the multimodal vector embedding search, but it will not return images in the response,
+so we cannot send the images to the chat completion model.
+* This feature is compatible with the [reasoning models](./reasoning.md) feature, as long as you use a model that [supports image inputs](https://learn.microsoft.com/azure/ai-services/openai/how-to/reasoning?tabs=python-secure%2Cpy#api--feature-support).

docs/reasoning.md

Lines changed: 16 additions & 6 deletions

@@ -19,17 +19,27 @@ This repository includes an optional feature that uses reasoning models to gener

 Set the environment variables for your Azure OpenAI GPT deployments to your reasoning model

-For o3-mini:
+For o4-mini:

 ```shell
-azd env set AZURE_OPENAI_CHATGPT_MODEL o3-mini
-azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT o3-mini
-azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION 2025-01-31
+azd env set AZURE_OPENAI_CHATGPT_MODEL o4-mini
+azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT o4-mini
+azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION 2025-04-16
 azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT_SKU GlobalStandard
-azd env set AZURE_OPENAI_API_VERSION 2024-12-01-preview
+azd env set AZURE_OPENAI_API_VERSION 2025-04-01-preview
+```
+
+For o3:
+
+```shell
+azd env set AZURE_OPENAI_CHATGPT_MODEL o3
+azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT o3
+azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION 2025-04-16
+azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT_SKU GlobalStandard
+azd env set AZURE_OPENAI_API_VERSION 2025-04-01-preview
 ```

-For o1:
+For o1: (No streaming support)

 ```shell
 azd env set AZURE_OPENAI_CHATGPT_MODEL o1

infra/main.bicep

Lines changed: 13 additions & 3 deletions

@@ -703,14 +703,14 @@ module computerVision 'br/public:avm/res/cognitive-services/account:0.7.2' = if
   params: {
     name: !empty(computerVisionServiceName)
       ? computerVisionServiceName
-      : '${abbrs.cognitiveServicesComputerVision}${resourceToken}2'
-    kind: 'AIServices'
+      : '${abbrs.cognitiveServicesComputerVision}${resourceToken}cs'
+    kind: 'CognitiveServices'
     networkAcls: {
       defaultAction: 'Allow'
     }
     customSubDomainName: !empty(computerVisionServiceName)
       ? computerVisionServiceName
-      : '${abbrs.cognitiveServicesComputerVision}${resourceToken}'
+      : '${abbrs.cognitiveServicesComputerVision}${resourceToken}cs'
     location: computerVisionResourceGroupLocation
     tags: tags
     sku: 'S0'
@@ -1065,6 +1065,16 @@ module openAiRoleSearchService 'core/security/role.bicep' = if (isAzureOpenAiHos
   }
 }

+module computerVisionRoleSearchService 'core/security/role.bicep' = if (useMultimodal) {
+  scope: computerVisionResourceGroup
+  name: 'computervision-role-searchservice'
+  params: {
+    principalId: searchService.outputs.principalId
+    roleDefinitionId: 'a97b65f3-24c7-4388-baec-2e87135dc908'
+    principalType: 'ServicePrincipal'
+  }
+}
+
 module storageRoleBackend 'core/security/role.bicep' = {
   scope: storageResourceGroup
   name: 'storage-role-backend'

todo.txt

Lines changed: 4 additions & 0 deletions

@@ -9,6 +9,10 @@ TODO:
 * Test with integrated vectorization
 * Update all TODOs in the code/docs

+
 Decide:
 * In conftest, should I make a new env for vision? Currently I mashed it into the existing env, but it might be cleaner to have a separate one, as now I have to pass llm_inputs explicitly in the tests to turn off image responses.
 * LLMInputType and VectorFields have inconsistently named values
+
+Later:
+Agentic: Incompatible since it doesnt retrieve images. We would need to do a follow-up search query to get each document, like filter: id eq 'x' or id eq 'y' or....
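
For the "Later" note above, a follow-up lookup by document id could look roughly like this with the azure-search-documents async client; the `id` field name comes from the note itself, while the function shape and client setup are assumptions.

```python
from azure.search.documents.aio import SearchClient


async def fetch_documents_by_id(search_client: SearchClient, doc_ids: list[str]) -> list[dict]:
    # Build an OData filter like: id eq 'x' or id eq 'y' or ...
    filter_expr = " or ".join(f"id eq '{doc_id}'" for doc_id in doc_ids)
    results = await search_client.search(search_text="*", filter=filter_expr)
    return [doc async for doc in results]
```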
