Skip to content

Commit 2ee850f

Browse files
committed
Blob manager improvements/tests
1 parent 43c9eac commit 2ee850f

File tree

8 files changed

+146
-120
lines changed

8 files changed

+146
-120
lines changed

app/backend/app.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,7 @@ async def setup_clients():
561561
openai_organization=OPENAI_ORGANIZATION,
562562
)
563563

564-
user_image_blob_manager = None
564+
user_blob_manager = None
565565
if USE_USER_UPLOAD:
566566
current_app.logger.info("USE_USER_UPLOAD is true, setting up user upload feature")
567567
if not AZURE_USERSTORAGE_ACCOUNT or not AZURE_USERSTORAGE_CONTAINER:
@@ -616,7 +616,7 @@ async def setup_clients():
616616
embeddings=text_embeddings_service,
617617
image_embeddings=image_embeddings_service,
618618
search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
619-
blob_manager=user_image_blob_manager,
619+
blob_manager=user_blob_manager,
620620
)
621621
current_app.config[CONFIG_INGESTER] = ingester
622622

app/backend/approaches/approach.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -418,8 +418,8 @@ async def download_blob_as_base64(self, blob_url: str, user_oid: Optional[str] =
418418
if ".dfs.core.windows.net" in blob_url and self.user_blob_manager:
419419
blob_manager = self.user_blob_manager
420420
blob_downloader = await blob_manager.download_blob(blob_path, user_oid=user_oid)
421-
blob = await blob_downloader.readall()
422-
if blob is not None:
421+
if blob_downloader is not None:
422+
blob = await blob_downloader.readall()
423423
img = base64.b64encode(blob).decode("utf-8")
424424
return f"data:image/png;base64,{img}"
425425
return None

app/backend/prepdocslib/blobmanager.py

Lines changed: 81 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ async def upload_blob(self, file: Union[File, IO], filename: str, user_oid: str)
179179
str: The URL of the uploaded file, with forward slashes (not URL-encoded)
180180
"""
181181
# Ensure user directory exists but don't create a subdirectory
182-
user_directory_client = await self._ensure_directory(user_oid, owner=user_oid)
182+
user_directory_client = await self._ensure_directory(directory_path=user_oid, user_oid=user_oid)
183183

184184
# Create file directly in user directory
185185
file_client = user_directory_client.get_file_client(filename)
@@ -240,12 +240,12 @@ async def upload_document_image(
240240
Returns:
241241
str: The URL of the uploaded file, with forward slashes (not URL-encoded)
242242
"""
243-
await self._ensure_directory(user_oid, owner=user_oid)
244-
directory_path = self._get_image_directory_path(document_filename, user_oid, image_page_num)
245-
image_directory_client = await self._ensure_directory(directory_path, owner=user_oid)
243+
await self._ensure_directory(directory_path=user_oid, user_oid=user_oid)
244+
image_directory_path = self._get_image_directory_path(document_filename, user_oid, image_page_num)
245+
image_directory_client = await self._ensure_directory(directory_path=image_directory_path, user_oid=user_oid)
246246
file_client = image_directory_client.get_file_client(image_filename)
247247
image_bytes = BaseBlobManager.add_image_citation(image_bytes, document_filename, image_filename, image_page_num)
248-
logger.info("Uploading document image '%s' to '%s'", image_filename, directory_path)
248+
logger.info("Uploading document image '%s' to '%s'", image_filename, image_directory_path)
249249
await file_client.upload_data(image_bytes, overwrite=True, metadata={"UploadedBy": user_oid})
250250
return unquote(file_client.url)
251251

@@ -284,7 +284,7 @@ async def download_blob(
284284
filename = path_parts[-1]
285285

286286
try:
287-
user_directory_client = self._ensure_directory(directory_path, user_oid)
287+
user_directory_client = self._ensure_directory(directory_path=directory_path, user_oid=user_oid)
288288
file_client = user_directory_client.get_file_client(filename)
289289
blob = await file_client.download_file()
290290
return blob
@@ -310,15 +310,17 @@ async def remove_blob(self, filename: str, user_oid: str) -> None:
310310
ResourceNotFoundError: If the file does not exist
311311
"""
312312
# Ensure the user directory exists
313-
user_directory_client = await self._ensure_directory(user_oid, owner=user_oid)
313+
user_directory_client = await self._ensure_directory(directory_path=user_oid, user_oid=user_oid)
314314
# Delete the main document file
315315
file_client = user_directory_client.get_file_client(filename)
316316
await file_client.delete_file()
317317

318318
# Try to delete any associated image directories
319319
try:
320320
image_directory_path = self._get_image_directory_path(filename, user_oid)
321-
image_directory_client = await self._ensure_directory(image_directory_path, owner=user_oid)
321+
image_directory_client = await self._ensure_directory(
322+
directory_path=image_directory_path, user_oid=user_oid
323+
)
322324
await image_directory_client.delete_directory()
323325
logger.info(f"Deleted associated image directory: {image_directory_path}")
324326
except ResourceNotFoundError:
@@ -338,7 +340,7 @@ async def list_blobs(self, user_oid: str) -> list[str]:
338340
Returns:
339341
list[str]: List of filenames that belong to the user
340342
"""
341-
user_directory_client = await self._ensure_directory(user_oid, owner=user_oid)
343+
user_directory_client = await self._ensure_directory(directory_path=user_oid, user_oid=user_oid)
342344
files = []
343345
try:
344346
all_paths = user_directory_client.get_paths()
@@ -389,21 +391,30 @@ def __init__(
389391
self.subscription_id = subscription_id
390392
self.image_container = image_container
391393

392-
async def upload_blob(self, file: File) -> Optional[list[str]]:
393-
async with BlobServiceClient(
394+
def get_managedidentity_connectionstring(self):
395+
if not self.account or not self.resource_group or not self.subscription_id:
396+
raise ValueError("Account, resource group, and subscription ID must be set to generate connection string.")
397+
return f"ResourceId=/subscriptions/{self.subscription_id}/resourceGroups/{self.resource_group}/providers/Microsoft.Storage/storageAccounts/{self.account};"
398+
399+
async def _get_service_client(self) -> BlobServiceClient:
400+
return BlobServiceClient(
394401
account_url=self.endpoint, credential=self.credential, max_single_put_size=4 * 1024 * 1024
395-
) as service_client, service_client.get_container_client(self.container) as container_client:
396-
if not await container_client.exists():
397-
await container_client.create_container()
398-
399-
# Re-open and upload the original file
400-
if file.url is None:
401-
with open(file.content.name, "rb") as reopened_file:
402-
blob_name = self.blob_name_from_file_name(file.content.name)
403-
logger.info("Uploading blob for document '%s'", blob_name)
404-
blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True)
405-
file.url = blob_client.url
406-
return None
402+
)
403+
404+
async def upload_blob(self, file: File) -> Optional[list[str]]:
405+
blob_service_client = await self._get_service_client()
406+
container_client = blob_service_client.get_container_client(self.container)
407+
if not await container_client.exists():
408+
await container_client.create_container()
409+
410+
# Re-open and upload the original file
411+
if file.url is None:
412+
with open(file.content.name, "rb") as reopened_file:
413+
blob_name = self.blob_name_from_file_name(file.content.name)
414+
logger.info("Uploading blob for document '%s'", blob_name)
415+
blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True)
416+
file.url = blob_client.url
417+
blob_service_client.close()
407418

408419
async def upload_document_image(
409420
self,
@@ -421,22 +432,16 @@ async def upload_document_image(
421432
raise ValueError(
422433
"user_oid is not supported for BlobManager. Use AdlsBlobManager for user-specific operations."
423434
)
424-
async with BlobServiceClient(
425-
account_url=self.endpoint, credential=self.credential, max_single_put_size=4 * 1024 * 1024
426-
) as service_client, service_client.get_container_client(self.image_container) as container_client:
427-
if not await container_client.exists():
428-
await container_client.create_container()
429-
image_bytes = self.add_image_citation(image_bytes, document_filename, image_filename, image_page_num)
430-
blob_name = f"{self.blob_name_from_file_name(document_filename)}/page{image_page_num}/{image_filename}"
431-
logger.info("Uploading blob for document image '%s'", blob_name)
432-
blob_client = await container_client.upload_blob(blob_name, image_bytes, overwrite=True)
433-
return blob_client.url
434-
return None
435-
436-
def get_managedidentity_connectionstring(self):
437-
if not self.account or not self.resource_group or not self.subscription_id:
438-
raise ValueError("Account, resource group, and subscription ID must be set to generate connection string.")
439-
return f"ResourceId=/subscriptions/{self.subscription_id}/resourceGroups/{self.resource_group}/providers/Microsoft.Storage/storageAccounts/{self.account};"
435+
blob_service_client = await self._get_service_client()
436+
container_client = blob_service_client.get_container_client(self.container)
437+
if not await container_client.exists():
438+
await container_client.create_container()
439+
image_bytes = self.add_image_citation(image_bytes, document_filename, image_filename, image_page_num)
440+
blob_name = f"{self.blob_name_from_file_name(document_filename)}/page{image_page_num}/{image_filename}"
441+
logger.info("Uploading blob for document image '%s'", blob_name)
442+
blob_client = await container_client.upload_blob(blob_name, image_bytes, overwrite=True)
443+
blob_service_client.close()
444+
return blob_client.url
440445

441446
async def download_blob(
442447
self, blob_path: str, user_oid: Optional[str] = None
@@ -445,43 +450,43 @@ async def download_blob(
445450
raise ValueError(
446451
"user_oid is not supported for BlobManager. Use AdlsBlobManager for user-specific operations."
447452
)
448-
449-
async with BlobServiceClient(
450-
account_url=self.endpoint, credential=self.credential, max_single_put_size=4 * 1024 * 1024
451-
) as service_client, service_client.get_container_client(self.container) as container_client:
452-
if not await container_client.exists():
453-
return None
454-
blob_client = container_client.get_blob_client(blob_path)
455-
try:
456-
blob = await blob_client.download_blob()
457-
if not blob.properties:
458-
logger.warning(f"No blob exists for {blob_path}")
459-
return None
460-
return blob
461-
except ResourceNotFoundError:
462-
logger.warning("Blob not found: %s", blob_path)
453+
blob_service_client = await self._get_service_client()
454+
container_client = blob_service_client.get_container_client(self.container)
455+
if not await container_client.exists():
456+
return None
457+
if len(blob_path) == 0:
458+
logger.warning("Blob path is empty")
459+
return None
460+
blob_client = container_client.get_blob_client(blob_path)
461+
try:
462+
blob = await blob_client.download_blob()
463+
if not blob.properties:
464+
logger.warning(f"No blob exists for {blob_path}")
463465
return None
466+
return blob
467+
except ResourceNotFoundError:
468+
logger.warning("Blob not found: %s", blob_path)
469+
return None
470+
# TODO: close client properly
464471

465472
async def remove_blob(self, path: Optional[str] = None):
466-
async with BlobServiceClient(
467-
account_url=self.endpoint, credential=self.credential
468-
) as service_client, service_client.get_container_client(self.container) as container_client:
469-
if not await container_client.exists():
470-
return
471-
if path is None:
472-
prefix = None
473-
blobs = container_client.list_blob_names()
474-
else:
475-
prefix = os.path.splitext(os.path.basename(path))[0]
476-
blobs = container_client.list_blob_names(name_starts_with=os.path.splitext(os.path.basename(prefix))[0])
477-
async for blob_path in blobs:
478-
# This still supports PDFs split into individual pages, but we could remove in future to simplify code
479-
if (
480-
prefix is not None
481-
and (
482-
not re.match(rf"{prefix}-\d+\.pdf", blob_path) or not re.match(rf"{prefix}-\d+\.png", blob_path)
483-
)
484-
) or (path is not None and blob_path == os.path.basename(path)):
485-
continue
486-
logger.info("Removing blob %s", blob_path)
487-
await container_client.delete_blob(blob_path)
473+
blob_service_client = await self._get_service_client()
474+
container_client = blob_service_client.get_container_client(self.container)
475+
if not await container_client.exists():
476+
return
477+
if path is None:
478+
prefix = None
479+
blobs = container_client.list_blob_names()
480+
else:
481+
prefix = os.path.splitext(os.path.basename(path))[0]
482+
blobs = container_client.list_blob_names(name_starts_with=os.path.splitext(os.path.basename(prefix))[0])
483+
async for blob_path in blobs:
484+
# This still supports PDFs split into individual pages, but we could remove in future to simplify code
485+
if (
486+
prefix is not None
487+
and (not re.match(rf"{prefix}-\d+\.pdf", blob_path) or not re.match(rf"{prefix}-\d+\.png", blob_path))
488+
) or (path is not None and blob_path == os.path.basename(path)):
489+
continue
490+
logger.info("Removing blob %s", blob_path)
491+
await container_client.delete_blob(blob_path)
492+
blob_service_client.close()

tests/conftest.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,14 @@
2929

3030
import app
3131
import core
32+
from approaches.chatreadretrieveread import ChatReadRetrieveReadApproach
33+
from approaches.promptmanager import PromptyManager
3234
from core.authentication import AuthenticationHelper
35+
from prepdocslib.blobmanager import AdlsBlobManager, BlobManager
3336

3437
from .mocks import (
38+
MOCK_EMBEDDING_DIMENSIONS,
39+
MOCK_EMBEDDING_MODEL_NAME,
3540
MockAsyncPageIterator,
3641
MockAsyncSearchResultsIterator,
3742
MockAzureCredential,
@@ -1058,3 +1063,37 @@ def mock_get_bearer_token(*args, **kwargs):
10581063
return mock_token_provider
10591064

10601065
monkeypatch.setattr("azure.identity.aio.get_bearer_token_provider", mock_get_bearer_token)
1066+
1067+
1068+
@pytest.fixture
1069+
def chat_approach():
1070+
return ChatReadRetrieveReadApproach(
1071+
search_client=None,
1072+
search_index_name=None,
1073+
agent_model=None,
1074+
agent_deployment=None,
1075+
agent_client=None,
1076+
auth_helper=None,
1077+
openai_client=None,
1078+
chatgpt_model="gpt-4.1-mini",
1079+
chatgpt_deployment="chat",
1080+
embedding_deployment="embeddings",
1081+
embedding_model=MOCK_EMBEDDING_MODEL_NAME,
1082+
embedding_dimensions=MOCK_EMBEDDING_DIMENSIONS,
1083+
embedding_field="embedding3",
1084+
sourcepage_field="",
1085+
content_field="",
1086+
query_language="en-us",
1087+
query_speller="lexicon",
1088+
prompt_manager=PromptyManager(),
1089+
user_blob_manager=AdlsBlobManager(
1090+
endpoint="https://test-userstorage-account.dfs.core.windows.net",
1091+
container="test-userstorage-container",
1092+
credential=MockAzureCredential(),
1093+
),
1094+
global_blob_manager=BlobManager( # on normal Azure storage
1095+
endpoint="https://test-globalstorage-account.blob.core.windows.net",
1096+
container="test-globalstorage-container",
1097+
credential=MockAzureCredential(),
1098+
),
1099+
)

tests/snapshots/test_app/test_chat_vision/client0/result.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
"Financial Market Analysis Report 2023.pdf#page=2"
99
],
1010
"images": [
11-
""
11+
""
1212
],
1313
"text": [
1414
"Financial Market Analysis Report 2023.pdf#page=7: This section examines the correlations between stock indices, cryptocurrency prices, and commodity prices, revealing how changes in one market can have ripple effects across the financial ecosystem.### Impact of Macroeconomic Factors <figure><figcaption>Impact of Interest Rates, Inflation, and GDP Growth on Financial Markets<br>The image is a line graph titled \"on Financial Markets\" displaying data from 2018 to 2023. It tracks three variables: Interest Rates %, Inflation Data %, and GDP Growth %, each represented by a different colored line (blue for Interest Rates, orange for Inflation Data, and gray for GDP Growth). Interest Rates % start around 2% in 2018, dip to about 0.25% in 2021, then rise to 1.5% in 2023. Inflation Data % begin at approximately 1.9% in 2018, rise to a peak near 3.4% in 2022, and then decrease to 2.5% in 2023. GDP Growth % shows significant fluctuations, starting at 3% in 2018, plunging to almost -4% in 2020, then rebounding to around 4.5% in 2021 before gradually declining to around 2.8% in 2023.</figcaption></figure> Macroeconomic factors such as interest rates, inflation, and GDP growth play a pivotal role in shaping financial markets.",
@@ -156,7 +156,7 @@
156156
},
157157
{
158158
"image_url": {
159-
"url": ""
159+
"url": ""
160160
},
161161
"type": "image_url"
162162
},

tests/test_app.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1027,7 +1027,15 @@ async def test_chat_stream_followup(client, snapshot):
10271027

10281028

10291029
@pytest.mark.asyncio
1030-
async def test_chat_vision(vision_client, snapshot):
1030+
async def test_chat_vision(monkeypatch, vision_client, snapshot):
1031+
async def mock_download_blob_as_base64(self, blob_url, user_oid):
1032+
return ""
1033+
1034+
monkeypatch.setattr(
1035+
"approaches.chatreadretrieveread.ChatReadRetrieveReadApproach.download_blob_as_base64",
1036+
mock_download_blob_as_base64,
1037+
)
1038+
10311039
response = await vision_client.post(
10321040
"/chat",
10331041
json={"messages": [{"content": "Are interest rates high?", "role": "user"}]},

tests/test_chatapproach.py

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -25,30 +25,6 @@ async def mock_retrieval(*args, **kwargs):
2525
return mock_retrieval_response()
2626

2727

28-
@pytest.fixture
29-
def chat_approach():
30-
return ChatReadRetrieveReadApproach(
31-
search_client=None,
32-
search_index_name=None,
33-
agent_model=None,
34-
agent_deployment=None,
35-
agent_client=None,
36-
auth_helper=None,
37-
openai_client=None,
38-
chatgpt_model="gpt-4.1-mini",
39-
chatgpt_deployment="chat",
40-
embedding_deployment="embeddings",
41-
embedding_model=MOCK_EMBEDDING_MODEL_NAME,
42-
embedding_dimensions=MOCK_EMBEDDING_DIMENSIONS,
43-
embedding_field="embedding3",
44-
sourcepage_field="",
45-
content_field="",
46-
query_language="en-us",
47-
query_speller="lexicon",
48-
prompt_manager=PromptyManager(),
49-
)
50-
51-
5228
def test_get_search_query(chat_approach):
5329
payload = """
5430
{

0 commit comments

Comments
 (0)