Skip to content

Commit 43c9eac

Browse files
committed
Working on tests
1 parent 5b17932 commit 43c9eac

File tree

10 files changed

+260
-231
lines changed

10 files changed

+260
-231
lines changed

app/backend/app.py

Lines changed: 29 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,7 @@
2626
from azure.search.documents.agent.aio import KnowledgeAgentRetrievalClient
2727
from azure.search.documents.aio import SearchClient
2828
from azure.search.documents.indexes.aio import SearchIndexClient
29-
from azure.storage.blob.aio import ContainerClient
3029
from azure.storage.blob.aio import StorageStreamDownloader as BlobDownloader
31-
from azure.storage.filedatalake.aio import FileSystemClient
3230
from azure.storage.filedatalake.aio import StorageStreamDownloader as DatalakeDownloader
3331
from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
3432
from opentelemetry.instrumentation.asgi import OpenTelemetryMiddleware
@@ -59,14 +57,12 @@
5957
CONFIG_AGENTIC_RETRIEVAL_ENABLED,
6058
CONFIG_ASK_APPROACH,
6159
CONFIG_AUTH_CLIENT,
62-
CONFIG_BLOB_CONTAINER_CLIENT,
6360
CONFIG_CHAT_APPROACH,
6461
CONFIG_CHAT_HISTORY_BROWSER_ENABLED,
6562
CONFIG_CHAT_HISTORY_COSMOS_ENABLED,
6663
CONFIG_CREDENTIAL,
6764
CONFIG_DEFAULT_REASONING_EFFORT,
68-
CONFIG_IMAGE_BLOB_CONTAINER_CLIENT, # Added this line
69-
CONFIG_IMAGE_DATALAKE_CLIENT,
65+
CONFIG_GLOBAL_BLOB_MANAGER,
7066
CONFIG_INGESTER,
7167
CONFIG_LANGUAGE_PICKER_ENABLED,
7268
CONFIG_MULTIMODAL_ENABLED,
@@ -87,7 +83,7 @@
8783
CONFIG_SPEECH_SERVICE_TOKEN,
8884
CONFIG_SPEECH_SERVICE_VOICE,
8985
CONFIG_STREAMING_ENABLED,
90-
CONFIG_USER_BLOB_CONTAINER_CLIENT,
86+
CONFIG_USER_BLOB_MANAGER,
9187
CONFIG_USER_UPLOAD_ENABLED,
9288
CONFIG_VECTOR_SEARCH_ENABLED,
9389
)
@@ -104,7 +100,7 @@
104100
setup_openai_client,
105101
setup_search_info,
106102
)
107-
from prepdocslib.blobmanager import AdlsBlobManager
103+
from prepdocslib.blobmanager import AdlsBlobManager, BlobManager
108104
from prepdocslib.embeddings import ImageEmbeddings
109105
from prepdocslib.filestrategy import UploadUserFileStrategy
110106
from prepdocslib.listfilestrategy import File
@@ -153,19 +149,16 @@ async def content_file(path: str, auth_claims: dict[str, Any]):
153149
path_parts = path.rsplit("#page=", 1)
154150
path = path_parts[0]
155151
current_app.logger.info("Opening file %s", path)
156-
blob_container_client: ContainerClient = current_app.config[CONFIG_BLOB_CONTAINER_CLIENT]
152+
blob_manager: BlobManager = current_app.config[CONFIG_GLOBAL_BLOB_MANAGER]
157153
blob: Union[BlobDownloader, DatalakeDownloader]
158-
try:
159-
blob = await blob_container_client.get_blob_client(path).download_blob()
160-
except ResourceNotFoundError:
154+
blob = await blob_manager.download_blob(path)
155+
if blob is None:
161156
current_app.logger.info("Path not found in general Blob container: %s", path)
162157
if current_app.config[CONFIG_USER_UPLOAD_ENABLED]:
163158
try:
164159
user_oid = auth_claims["oid"]
165-
user_blob_container_client = current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT]
166-
user_directory_client: FileSystemClient = user_blob_container_client.get_directory_client(user_oid)
167-
file_client = user_directory_client.get_file_client(path)
168-
blob = await file_client.download_file()
160+
user_blob_manager: AdlsBlobManager = current_app.config[CONFIG_USER_BLOB_MANAGER]
161+
blob = await user_blob_manager.download_blob(path, user_oid=user_oid)
169162
except ResourceNotFoundError:
170163
current_app.logger.exception("Path not found in DataLake: %s", path)
171164
abort(404)
@@ -364,7 +357,7 @@ async def upload(auth_claims: dict[str, Any]):
364357

365358
user_oid = auth_claims["oid"]
366359
file = request_files.getlist("file")[0]
367-
adls_manager = AdlsBlobManager(current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT])
360+
adls_manager: AdlsBlobManager = current_app.config[CONFIG_USER_BLOB_MANAGER]
368361
file_url = await adls_manager.upload_blob(file, file.filename, user_oid)
369362
ingester: UploadUserFileStrategy = current_app.config[CONFIG_INGESTER]
370363
await ingester.add_file(File(content=file, url=file_url, acls={"oids": [user_oid]}), user_oid=user_oid)
@@ -377,9 +370,9 @@ async def delete_uploaded(auth_claims: dict[str, Any]):
377370
request_json = await request.get_json()
378371
filename = request_json.get("filename")
379372
user_oid = auth_claims["oid"]
380-
adls_manager = AdlsBlobManager(current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT])
373+
adls_manager: AdlsBlobManager = current_app.config[CONFIG_USER_BLOB_MANAGER]
381374
await adls_manager.remove_blob(filename, user_oid)
382-
ingester = current_app.config[CONFIG_INGESTER]
375+
ingester: UploadUserFileStrategy = current_app.config[CONFIG_INGESTER]
383376
await ingester.remove_file(filename, user_oid)
384377
return jsonify({"message": f"File {filename} deleted successfully"}), 200
385378

@@ -391,7 +384,7 @@ async def list_uploaded(auth_claims: dict[str, Any]):
391384
Only returns files directly in the user's directory, not in subdirectories.
392385
Excludes image files and the images directory."""
393386
user_oid = auth_claims["oid"]
394-
adls_manager = AdlsBlobManager(current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT])
387+
adls_manager: AdlsBlobManager = current_app.config[CONFIG_USER_BLOB_MANAGER]
395388
files = await adls_manager.list_blobs(user_oid)
396389
return jsonify(files), 200
397390

@@ -514,18 +507,14 @@ async def setup_clients():
514507
endpoint=AZURE_SEARCH_ENDPOINT, agent_name=AZURE_SEARCH_AGENT, credential=azure_credential
515508
)
516509

517-
blob_container_client = ContainerClient(
518-
f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net", AZURE_STORAGE_CONTAINER, credential=azure_credential
510+
# Set up the global blob storage manager (used for global content/images, but not user uploads)
511+
global_blob_manager = BlobManager(
512+
endpoint=f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net",
513+
credential=azure_credential,
514+
container=AZURE_STORAGE_CONTAINER,
515+
image_container=AZURE_IMAGESTORAGE_CONTAINER,
519516
)
520-
521-
# Set up the image storage container client if configured
522-
image_blob_container_client = None
523-
if AZURE_IMAGESTORAGE_CONTAINER:
524-
image_blob_container_client = ContainerClient(
525-
f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net",
526-
AZURE_IMAGESTORAGE_CONTAINER,
527-
credential=azure_credential,
528-
)
517+
current_app.config[CONFIG_GLOBAL_BLOB_MANAGER] = global_blob_manager
529518

530519
# Set up authentication helper
531520
search_index = None
@@ -572,19 +561,19 @@ async def setup_clients():
572561
openai_organization=OPENAI_ORGANIZATION,
573562
)
574563

575-
user_blob_container_client = None
564+
user_image_blob_manager = None
576565
if USE_USER_UPLOAD:
577566
current_app.logger.info("USE_USER_UPLOAD is true, setting up user upload feature")
578567
if not AZURE_USERSTORAGE_ACCOUNT or not AZURE_USERSTORAGE_CONTAINER:
579568
raise ValueError(
580569
"AZURE_USERSTORAGE_ACCOUNT and AZURE_USERSTORAGE_CONTAINER must be set when USE_USER_UPLOAD is true"
581570
)
582-
user_blob_container_client = FileSystemClient(
583-
f"https://{AZURE_USERSTORAGE_ACCOUNT}.dfs.core.windows.net",
584-
AZURE_USERSTORAGE_CONTAINER,
571+
user_blob_manager = AdlsBlobManager(
572+
endpoint=f"https://{AZURE_USERSTORAGE_ACCOUNT}.dfs.core.windows.net",
573+
container=AZURE_USERSTORAGE_CONTAINER,
585574
credential=azure_credential,
586575
)
587-
current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT] = user_blob_container_client
576+
current_app.config[CONFIG_USER_BLOB_MANAGER] = user_blob_manager
588577

589578
# Set up ingester
590579
file_processors = setup_file_processors(
@@ -627,7 +616,7 @@ async def setup_clients():
627616
embeddings=text_embeddings_service,
628617
image_embeddings=image_embeddings_service,
629618
search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
630-
blob_manager=AdlsBlobManager(user_blob_container_client),
619+
blob_manager=user_image_blob_manager,
631620
)
632621
current_app.config[CONFIG_INGESTER] = ingester
633622

@@ -638,9 +627,6 @@ async def setup_clients():
638627
current_app.config[CONFIG_OPENAI_CLIENT] = openai_client
639628
current_app.config[CONFIG_SEARCH_CLIENT] = search_client
640629
current_app.config[CONFIG_AGENT_CLIENT] = agent_client
641-
current_app.config[CONFIG_BLOB_CONTAINER_CLIENT] = blob_container_client
642-
if image_blob_container_client:
643-
current_app.config[CONFIG_IMAGE_BLOB_CONTAINER_CLIENT] = image_blob_container_client
644630
current_app.config[CONFIG_AUTH_CLIENT] = auth_helper
645631

646632
current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED] = AZURE_SEARCH_SEMANTIC_RANKER != "disabled"
@@ -695,8 +681,8 @@ async def setup_clients():
695681
reasoning_effort=OPENAI_REASONING_EFFORT,
696682
multimodal_enabled=USE_MULTIMODAL,
697683
image_embeddings_client=image_embeddings_client,
698-
image_blob_container_client=image_blob_container_client,
699-
image_datalake_client=user_blob_container_client,
684+
global_blob_manager=global_blob_manager,
685+
user_blob_manager=user_blob_manager,
700686
)
701687

702688
# ChatReadRetrieveReadApproach is used by /chat for multi-turn conversation
@@ -722,21 +708,14 @@ async def setup_clients():
722708
reasoning_effort=OPENAI_REASONING_EFFORT,
723709
multimodal_enabled=USE_MULTIMODAL,
724710
image_embeddings_client=image_embeddings_client,
725-
image_blob_container_client=image_blob_container_client,
726-
image_datalake_client=user_blob_container_client,
711+
global_blob_manager=global_blob_manager,
712+
user_blob_manager=user_blob_manager,
727713
)
728714

729715

730716
@bp.after_app_serving
731717
async def close_clients():
732718
await current_app.config[CONFIG_SEARCH_CLIENT].close()
733-
await current_app.config[CONFIG_BLOB_CONTAINER_CLIENT].close()
734-
if current_app.config.get(CONFIG_USER_BLOB_CONTAINER_CLIENT):
735-
await current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT].close()
736-
if current_app.config.get(CONFIG_IMAGE_BLOB_CONTAINER_CLIENT):
737-
await current_app.config[CONFIG_IMAGE_BLOB_CONTAINER_CLIENT].close()
738-
if current_app.config.get(CONFIG_IMAGE_DATALAKE_CLIENT):
739-
await current_app.config[CONFIG_IMAGE_DATALAKE_CLIENT].close()
740719

741720

742721
def create_app():

app/backend/approaches/approach.py

Lines changed: 44 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import base64
12
from abc import ABC
23
from collections.abc import AsyncGenerator, Awaitable
34
from dataclasses import dataclass, field
@@ -20,8 +21,6 @@
2021
VectorizedQuery,
2122
VectorQuery,
2223
)
23-
from azure.storage.blob.aio import ContainerClient
24-
from azure.storage.filedatalake.aio import FileSystemClient
2524
from openai import AsyncOpenAI, AsyncStream
2625
from openai.types import CompletionUsage
2726
from openai.types.chat import (
@@ -34,7 +33,7 @@
3433

3534
from approaches.promptmanager import PromptManager
3635
from core.authentication import AuthenticationHelper
37-
from core.imageshelper import download_blob_as_base64
36+
from prepdocslib.blobmanager import AdlsBlobManager, BlobManager
3837
from prepdocslib.embeddings import ImageEmbeddings
3938

4039

@@ -161,8 +160,8 @@ def __init__(
161160
reasoning_effort: Optional[str] = None,
162161
multimodal_enabled: bool = False,
163162
image_embeddings_client: Optional[ImageEmbeddings] = None,
164-
image_blob_container_client: Optional[ContainerClient] = None,
165-
image_datalake_client: Optional[FileSystemClient] = None,
163+
global_blob_manager: Optional[BlobManager] = None,
164+
user_blob_manager: Optional[AdlsBlobManager] = None,
166165
):
167166
self.search_client = search_client
168167
self.openai_client = openai_client
@@ -179,23 +178,8 @@ def __init__(
179178
self.include_token_usage = True
180179
self.multimodal_enabled = multimodal_enabled
181180
self.image_embeddings_client = image_embeddings_client
182-
self.image_blob_container_client = image_blob_container_client
183-
self.image_datalake_client = image_datalake_client
184-
185-
def get_storage_client_for_url(self, url: str) -> Optional[Union[ContainerClient, FileSystemClient]]:
186-
"""
187-
Determines which storage client to use for a given URL.
188-
189-
Args:
190-
url: The URL or path of the image
191-
192-
Returns:
193-
Either the ContainerClient for Blob Storage or FileSystemClient for Data Lake Storage,
194-
based on the URL pattern. Returns None if no matching client is available.
195-
"""
196-
if ".dfs.core.windows.net" in url and self.image_datalake_client:
197-
return self.image_datalake_client
198-
return self.image_blob_container_client
181+
self.global_blob_manager = global_blob_manager
182+
self.user_blob_manager = user_blob_manager
199183

200184
def build_filter(self, overrides: dict[str, Any], auth_claims: dict[str, Any]) -> Optional[str]:
201185
include_category = overrides.get("include_category")
@@ -388,8 +372,7 @@ def nonewlines(s: str) -> str:
388372
if img["url"] in seen_urls or not img["url"]:
389373
continue
390374
seen_urls.add(img["url"])
391-
storage_client = self.get_storage_client_for_url(img["url"])
392-
url = await download_blob_as_base64(storage_client, img["url"], user_oid=user_oid)
375+
url = await self.download_blob_as_base64(img["url"], user_oid=user_oid)
393376
if url:
394377
image_sources.append(url)
395378
citations.append(self.get_image_citation(doc.sourcepage or "", img["url"]))
@@ -404,6 +387,43 @@ def get_image_citation(self, sourcepage: Optional[str], image_url: str):
404387
image_filename = image_url.split("/")[-1]
405388
return f"{sourcepage_citation}({image_filename})"
406389

390+
async def download_blob_as_base64(self, blob_url: str, user_oid: Optional[str] = None) -> Optional[str]:
    """
    Download a blob through one of the configured blob managers and return it as a base64 data URI.

    The blob is fetched through ``self.user_blob_manager`` when ``blob_url`` contains
    ``.dfs.core.windows.net`` and that manager is configured; otherwise it is fetched
    through ``self.global_blob_manager``.

    Args:
        blob_url: The full URL of the blob, or a path relative to the container/filesystem
        user_oid: The user's object ID, forwarded to the blob manager's ``download_blob`` call

    Returns:
        Optional[str]: The base64 encoded image data with data URI scheme prefix, or None if no
        blob manager is configured or the blob cannot be downloaded
    """
    # Handle full URLs for both Blob Storage and Data Lake Storage
    if blob_url.startswith("http"):
        # Skip the scheme, host and container/filesystem name to get the blob path
        # For blob: https://{account}.blob.core.windows.net/{container}/{blob_path}
        # For dfs:  https://{account}.dfs.core.windows.net/{filesystem}/{path}
        blob_path = "/".join(blob_url.split("/")[4:])
        # If %20 in URL, replace it with a space
        blob_path = blob_path.replace("%20", " ")
    else:
        # Treat as a direct blob path
        blob_path = blob_url

    # Pick the manager that matches the storage flavour of the URL
    blob_manager = self.global_blob_manager
    if ".dfs.core.windows.net" in blob_url and self.user_blob_manager:
        blob_manager = self.user_blob_manager
    if blob_manager is None:
        # Both managers are optional; without one there is nothing to download from
        return None

    blob_downloader = await blob_manager.download_blob(blob_path, user_oid=user_oid)
    if blob_downloader is None:
        # The manager signals a missing blob with None (callers in app.py check for it),
        # so guard here instead of calling readall() on None
        return None

    blob = await blob_downloader.readall()
    if blob is None:
        return None
    img = base64.b64encode(blob).decode("utf-8")
    return f"data:image/png;base64,{img}"
426+
407427
async def compute_text_embedding(self, q: str):
408428
SUPPORTED_DIMENSIONS_MODEL = {
409429
"text-embedding-ada-002": False,

app/backend/approaches/chatreadretrieveread.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66
from azure.search.documents.agent.aio import KnowledgeAgentRetrievalClient
77
from azure.search.documents.aio import SearchClient
88
from azure.search.documents.models import VectorQuery
9-
from azure.storage.blob.aio import ContainerClient
10-
from azure.storage.filedatalake.aio import FileSystemClient
119
from openai import AsyncOpenAI, AsyncStream
1210
from openai.types.chat import (
1311
ChatCompletion,
@@ -24,6 +22,7 @@
2422
)
2523
from approaches.promptmanager import PromptManager
2624
from core.authentication import AuthenticationHelper
25+
from prepdocslib.blobmanager import AdlsBlobManager, BlobManager
2726
from prepdocslib.embeddings import ImageEmbeddings
2827

2928

@@ -60,8 +59,8 @@ def __init__(
6059
reasoning_effort: Optional[str] = None,
6160
multimodal_enabled: bool = False,
6261
image_embeddings_client: Optional[ImageEmbeddings] = None,
63-
image_blob_container_client: Optional[ContainerClient] = None,
64-
image_datalake_client: Optional[FileSystemClient] = None,
62+
global_blob_manager: Optional[BlobManager] = None,
63+
user_blob_manager: Optional[AdlsBlobManager] = None,
6564
):
6665
self.search_client = search_client
6766
self.search_index_name = search_index_name
@@ -70,7 +69,6 @@ def __init__(
7069
self.agent_client = agent_client
7170
self.openai_client = openai_client
7271
self.auth_helper = auth_helper
73-
7472
self.chatgpt_model = chatgpt_model
7573
self.chatgpt_deployment = chatgpt_deployment
7674
self.embedding_deployment = embedding_deployment
@@ -89,8 +87,8 @@ def __init__(
8987
self.include_token_usage = True
9088
self.multimodal_enabled = multimodal_enabled
9189
self.image_embeddings_client = image_embeddings_client
92-
self.image_blob_container_client = image_blob_container_client
93-
self.image_datalake_client = image_datalake_client
90+
self.global_blob_manager = global_blob_manager
91+
self.user_blob_manager = user_blob_manager
9492

9593
def get_search_query(self, chat_completion: ChatCompletion, user_query: str):
9694
response_message = chat_completion.choices[0].message

0 commit comments

Comments
 (0)