Skip to content

Commit 783f61e

Browse files
committed
More mypy fixes
1 parent bab4350 commit 783f61e

File tree

5 files changed

+106
-47
lines changed

5 files changed

+106
-47
lines changed

app/backend/app.py

Lines changed: 19 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,6 @@
2525
from azure.search.documents.agent.aio import KnowledgeAgentRetrievalClient
2626
from azure.search.documents.aio import SearchClient
2727
from azure.search.documents.indexes.aio import SearchIndexClient
28-
from azure.storage.blob.aio import StorageStreamDownloader as BlobDownloader
29-
from azure.storage.filedatalake.aio import StorageStreamDownloader as DatalakeDownloader
3028
from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
3129
from opentelemetry.instrumentation.asgi import OpenTelemetryMiddleware
3230
from opentelemetry.instrumentation.httpx import (
@@ -149,24 +147,33 @@ async def content_file(path: str, auth_claims: dict[str, Any]):
149147
path = path_parts[0]
150148
current_app.logger.info("Opening file %s", path)
151149
blob_manager: BlobManager = current_app.config[CONFIG_GLOBAL_BLOB_MANAGER]
152-
blob: Union[BlobDownloader, DatalakeDownloader]
153-
blob = await blob_manager.download_blob(path)
154-
if blob is None:
150+
151+
# Get bytes and properties from the blob manager
152+
result = await blob_manager.download_blob(path)
153+
154+
if result is None:
155155
current_app.logger.info("Path not found in general Blob container: %s", path)
156156
if current_app.config[CONFIG_USER_UPLOAD_ENABLED]:
157157
user_oid = auth_claims["oid"]
158158
user_blob_manager: AdlsBlobManager = current_app.config[CONFIG_USER_BLOB_MANAGER]
159-
blob = await user_blob_manager.download_blob(path, user_oid=user_oid)
160-
if blob is None:
159+
result = await user_blob_manager.download_blob(path, user_oid=user_oid)
160+
if result is None:
161161
current_app.logger.exception("Path not found in DataLake: %s", path)
162-
if not blob or not blob.properties or not blob.properties.has_key("content_settings"):
162+
163+
if not result:
164+
abort(404)
165+
166+
content, properties = result
167+
168+
if not properties or "content_settings" not in properties:
163169
abort(404)
164-
mime_type = blob.properties["content_settings"]["content_type"]
170+
171+
mime_type = properties["content_settings"]["content_type"]
165172
if mime_type == "application/octet-stream":
166173
mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream"
167-
blob_file = io.BytesIO()
168-
await blob.readinto(blob_file)
169-
blob_file.seek(0)
174+
175+
# Create a BytesIO object from the bytes
176+
blob_file = io.BytesIO(content)
170177
return await send_file(blob_file, mimetype=mime_type, as_attachment=False, attachment_filename=path)
171178

172179

app/backend/approaches/approach.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -392,7 +392,6 @@ async def download_blob_as_base64(self, blob_url: str, user_oid: Optional[str] =
392392
Downloads a blob from either Azure Blob Storage or Azure Data Lake Storage and returns it as a base64 encoded string.
393393
394394
Args:
395-
storage_client: Either a ContainerClient (for Blob Storage) or FileSystemClient (for Data Lake Storage)
396395
blob_url: The URL or path to the blob to download
397396
user_oid: The user's object ID, required for Data Lake Storage operations and access control
398397
@@ -414,13 +413,15 @@ async def download_blob_as_base64(self, blob_url: str, user_oid: Optional[str] =
414413
blob_path = blob_url
415414

416415
# Download the blob using the appropriate client
417-
blob_bytes = None
416+
result = None
418417
if ".dfs.core.windows.net" in blob_url and self.user_blob_manager:
419-
blob_bytes = await self.user_blob_manager.download_blob(blob_path, user_oid=user_oid, as_bytes=True)
418+
result = await self.user_blob_manager.download_blob(blob_path, user_oid=user_oid)
420419
elif self.global_blob_manager:
421-
blob_bytes = await self.global_blob_manager.download_blob(blob_path, as_bytes=True)
422-
if blob_bytes and isinstance(blob_bytes, (bytes)):
423-
img = base64.b64encode(blob_bytes).decode("utf-8")
420+
result = await self.global_blob_manager.download_blob(blob_path)
421+
422+
if result:
423+
content, _ = result # Unpack the tuple, ignoring properties
424+
img = base64.b64encode(content).decode("utf-8")
424425
return f"data:image/png;base64,{img}"
425426
return None
426427

app/backend/prepdocslib/blobmanager.py

Lines changed: 74 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -3,18 +3,15 @@
33
import os
44
import re
55
from pathlib import Path
6-
from typing import IO, Optional, Union
6+
from typing import IO, Any, Optional, TypedDict, Union
77
from urllib.parse import unquote
88

99
from azure.core.credentials_async import AsyncTokenCredential
1010
from azure.core.exceptions import ResourceNotFoundError
1111
from azure.storage.blob.aio import BlobServiceClient
12-
from azure.storage.blob.aio import (
13-
StorageStreamDownloader as BlobStorageStreamDownloader,
14-
)
15-
from azure.storage.filedatalake.aio import DataLakeDirectoryClient, FileSystemClient
1612
from azure.storage.filedatalake.aio import (
17-
StorageStreamDownloader as AdlsBlobStorageStreamDownloader,
13+
DataLakeDirectoryClient,
14+
FileSystemClient,
1815
)
1916
from PIL import Image, ImageDraw, ImageFont
2017

@@ -23,6 +20,12 @@
2320
logger = logging.getLogger("scripts")
2421

2522

23+
class BlobProperties(TypedDict, total=False):
24+
"""Properties of a blob, with optional fields for content settings"""
25+
26+
content_settings: dict[str, Any]
27+
28+
2629
class BaseBlobManager:
2730
"""
2831
Base class for Azure Storage operations, providing common file naming and path utilities
@@ -97,13 +100,13 @@ async def upload_document_image(
97100
image_bytes: bytes,
98101
image_filename: str,
99102
image_page_num: int,
100-
user_oid: str,
103+
user_oid: Optional[str] = None,
101104
) -> Optional[str]:
102105
raise NotImplementedError("Subclasses must implement this method")
103106

104107
async def download_blob(
105-
self, blob_path: str, user_oid: Optional[str] = None, as_bytes: bool = False
106-
) -> Optional[Union[BlobStorageStreamDownloader, AdlsBlobStorageStreamDownloader, bytes]]:
108+
self, blob_path: str, user_oid: Optional[str] = None
109+
) -> Optional[tuple[bytes, BlobProperties]]:
107110
"""
108111
Downloads a blob from Azure Storage.
109112
If user_oid is provided, it checks if the blob belongs to the user.
@@ -113,7 +116,9 @@ async def download_blob(
113116
user_oid: The user's object ID (optional)
114117
115118
Returns:
116-
bytes: The content of the blob, or None if not found or access denied
119+
Optional[tuple[bytes, BlobProperties]]:
120+
- A tuple containing the blob content as bytes and the blob properties
121+
- None if blob not found or access denied
117122
"""
118123
raise NotImplementedError("Subclasses must implement this method")
119124

@@ -225,7 +230,7 @@ async def upload_document_image(
225230
image_bytes: bytes,
226231
image_filename: str,
227232
image_page_num: int,
228-
user_oid: str,
233+
user_oid: Optional[str] = None,
229234
) -> Optional[str]:
230235
"""
231236
Uploads an image from a document to ADLS in a directory structure:
@@ -242,6 +247,8 @@ async def upload_document_image(
242247
Returns:
243248
str: The URL of the uploaded file, with forward slashes (not URL-encoded)
244249
"""
250+
if user_oid is None:
251+
raise ValueError("user_oid must be provided for user-specific operations.")
245252
await self._ensure_directory(directory_path=user_oid, user_oid=user_oid)
246253
image_directory_path = self._get_image_directory_path(document_filename, user_oid, image_page_num)
247254
image_directory_client = await self._ensure_directory(directory_path=image_directory_path, user_oid=user_oid)
@@ -252,18 +259,19 @@ async def upload_document_image(
252259
return unquote(file_client.url)
253260

254261
async def download_blob(
255-
self, blob_path: str, user_oid: Optional[str] = None, as_bytes: bool = False
256-
) -> Optional[Union[AdlsBlobStorageStreamDownloader, bytes]]:
262+
self, blob_path: str, user_oid: Optional[str] = None
263+
) -> Optional[tuple[bytes, BlobProperties]]:
257264
"""
258265
Downloads a blob from Azure Data Lake Storage.
259266
260267
Args:
261268
blob_path: The path to the blob in the format {user_oid}/{document_name}/images/{image_name}
262269
user_oid: The user's object ID
263-
as_bytes: If True, returns the blob as bytes, otherwise returns a stream downloader
264270
265271
Returns:
266-
Optional[Union[AdlsBlobStorageStreamDownloader, bytes]]: A stream downloader for the blob, or bytes if as_bytes=True, or None if not found
272+
Optional[tuple[bytes, BlobProperties]]:
273+
- A tuple containing the blob content as bytes and the blob properties as a dictionary
274+
- None if blob not found or access denied
267275
"""
268276
if user_oid is None:
269277
logger.warning("user_oid must be provided for Data Lake Storage operations.")
@@ -289,11 +297,17 @@ async def download_blob(
289297
try:
290298
user_directory_client = await self._ensure_directory(directory_path=directory_path, user_oid=user_oid)
291299
file_client = user_directory_client.get_file_client(filename)
292-
blob = await file_client.download_file()
293-
if as_bytes:
294-
return await blob.readall()
295-
else:
296-
return blob
300+
download_response = await file_client.download_file()
301+
content = await download_response.readall()
302+
303+
# Convert FileProperties to our BlobProperties format
304+
properties: BlobProperties = {
305+
"content_settings": {
306+
"content_type": download_response.properties.get("content_type", "application/octet-stream")
307+
}
308+
}
309+
310+
return content, properties
297311
except ResourceNotFoundError:
298312
logger.warning(f"Directory or file not found: {directory_path}/{filename}")
299313
return None
@@ -449,8 +463,23 @@ async def upload_document_image(
449463
return blob_client.url
450464

451465
async def download_blob(
452-
self, blob_path: str, user_oid: Optional[str] = None, as_bytes: bool = False
453-
) -> Optional[Union[BlobStorageStreamDownloader, AdlsBlobStorageStreamDownloader, bytes]]:
466+
self, blob_path: str, user_oid: Optional[str] = None
467+
) -> Optional[tuple[bytes, BlobProperties]]:
468+
"""
469+
Downloads a blob from Azure Blob Storage.
470+
471+
Args:
472+
blob_path: The path to the blob in the storage
473+
user_oid: Not used in BlobManager, but included for API compatibility
474+
475+
Returns:
476+
Optional[tuple[bytes, BlobProperties]]:
477+
- A tuple containing the blob content as bytes and the blob properties
478+
- None if blob not found
479+
480+
Raises:
481+
ValueError: If user_oid is provided (not supported for BlobManager)
482+
"""
454483
if user_oid is not None:
455484
raise ValueError(
456485
"user_oid is not supported for BlobManager. Use AdlsBlobManager for user-specific operations."
@@ -461,16 +490,33 @@ async def download_blob(
461490
if len(blob_path) == 0:
462491
logger.warning("Blob path is empty")
463492
return None
493+
464494
blob_client = container_client.get_blob_client(blob_path)
465495
try:
466-
blob = await blob_client.download_blob()
467-
if not blob.properties:
496+
download_response = await blob_client.download_blob()
497+
if not download_response.properties:
468498
logger.warning(f"No blob exists for {blob_path}")
469499
return None
470-
if as_bytes:
471-
return await blob.readall()
472-
else:
473-
return blob
500+
501+
# Get the content as bytes
502+
content = await download_response.readall()
503+
504+
# Convert BlobProperties to our internal BlobProperties format
505+
properties: BlobProperties = {
506+
"content_settings": {
507+
"content_type": (
508+
download_response.properties.content_settings.content_type
509+
if (
510+
hasattr(download_response.properties, "content_settings")
511+
and download_response.properties.content_settings
512+
and hasattr(download_response.properties.content_settings, "content_type")
513+
)
514+
else "application/octet-stream"
515+
)
516+
}
517+
}
518+
519+
return content, properties
474520
except ResourceNotFoundError:
475521
logger.warning("Blob not found: %s", blob_path)
476522
return None

app/backend/prepdocslib/filestrategy.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -136,7 +136,7 @@ async def run(self):
136136
await self.search_manager.remove_content()
137137

138138

139-
class UploadUserFileStrategy(FileStrategy):
139+
class UploadUserFileStrategy:
140140
"""
141141
Strategy for ingesting a file that has already been uploaded to a ADLS2 storage account
142142
"""

app/backend/prepdocslib/integratedvectorizerstrategy.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -140,6 +140,11 @@ async def create_skillset(self, index_name: str) -> SearchIndexerSkillset:
140140
return skillset
141141

142142
async def create_multimodal_skillset(self, index_name: str) -> SearchIndexerSkillset:
143+
if self.image_embeddings is None:
144+
raise ValueError("Image embeddings client must be provided for multimodal skillset creation.")
145+
if self.blob_manager.image_container is None:
146+
raise ValueError("Blob manager must have an image container set for multimodal skillset creation.")
147+
143148
document_layout_skill = DocumentIntelligenceLayoutSkill(
144149
description="Layout skill to read documents",
145150
context="/document",

0 commit comments

Comments
 (0)