Skip to content

Commit 0d6e1ad

Browse files
committed
Progress on user upload support
1 parent 806828e commit 0d6e1ad

File tree

5 files changed

+256
-147
lines changed

5 files changed

+256
-147
lines changed

app/backend/app.py

Lines changed: 54 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
from azure.storage.blob.aio import StorageStreamDownloader as BlobDownloader
3131
from azure.storage.filedatalake.aio import FileSystemClient
3232
from azure.storage.filedatalake.aio import StorageStreamDownloader as DatalakeDownloader
33-
from openai import AsyncAzureOpenAI, AsyncOpenAI
3433
from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
3534
from opentelemetry.instrumentation.asgi import OpenTelemetryMiddleware
3635
from opentelemetry.instrumentation.httpx import (
@@ -99,8 +98,10 @@
9998
setup_embeddings_service,
10099
setup_file_processors,
101100
setup_image_embeddings_service,
101+
setup_openai_client,
102102
setup_search_info,
103103
)
104+
from prepdocslib.blobmanager import AdlsBlobManager
104105
from prepdocslib.filestrategy import UploadUserFileStrategy
105106
from prepdocslib.listfilestrategy import File
106107

@@ -358,22 +359,10 @@ async def upload(auth_claims: dict[str, Any]):
358359

359360
user_oid = auth_claims["oid"]
360361
file = request_files.getlist("file")[0]
361-
user_blob_container_client: FileSystemClient = current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT]
362-
user_directory_client = user_blob_container_client.get_directory_client(user_oid)
363-
try:
364-
await user_directory_client.get_directory_properties()
365-
except ResourceNotFoundError:
366-
current_app.logger.info("Creating directory for user %s", user_oid)
367-
await user_directory_client.create_directory()
368-
await user_directory_client.set_access_control(owner=user_oid)
369-
file_client = user_directory_client.get_file_client(file.filename)
370-
file_io = file
371-
file_io.name = file.filename
372-
file_io = io.BufferedReader(file_io)
373-
await file_client.upload_data(file_io, overwrite=True, metadata={"UploadedBy": user_oid})
374-
file_io.seek(0)
362+
adls_manager = AdlsBlobManager(current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT])
363+
file_url = await adls_manager.upload_blob(file, file.filename, user_oid)
375364
ingester: UploadUserFileStrategy = current_app.config[CONFIG_INGESTER]
376-
await ingester.add_file(File(content=file_io, acls={"oids": [user_oid]}, url=file_client.url))
365+
await ingester.add_file(File(content=file, url=file_url), user_oid=user_oid)
377366
return jsonify({"message": "File uploaded successfully"}), 200
378367

379368

@@ -395,16 +384,35 @@ async def delete_uploaded(auth_claims: dict[str, Any]):
395384
@bp.get("/list_uploaded")
@authenticated
async def list_uploaded(auth_claims: dict[str, Any]):
    """List the documents uploaded by the current user.

    Only entries directly inside the user's directory (the directory named by
    the ``oid`` claim) are returned. Entries nested in a subdirectory, names
    ending in a common image extension, and names containing ``"images"`` are
    left out.

    Args:
        auth_claims: Claims of the authenticated caller; must contain ``"oid"``.

    Returns:
        A ``(response, status)`` tuple: a JSON array of file names and HTTP 200.
        If listing raises ``ResourceNotFoundError``, the names collected so far
        (normally none) are returned instead of an error response.
    """
    # str.endswith accepts a tuple, so one call covers every extension.
    image_extensions = (".png", ".jpg", ".jpeg", ".gif", ".bmp")
    user_oid = auth_claims["oid"]
    user_blob_container_client: FileSystemClient = current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT]
    files = []
    try:
        all_paths = user_blob_container_client.get_paths(path=user_oid)
        async for path in all_paths:
            # Paths look like "user_oid/filename" or "user_oid/directory/file";
            # keep everything after the first slash and skip names without one.
            _, separator, filename = path.name.partition("/")
            if not separator:
                continue
            # Anything with a further slash lives in a subdirectory.
            if "/" in filename:
                continue
            # Image files are not listed as user documents.
            if filename.lower().endswith(image_extensions):
                continue
            # NOTE(review): this is a case-sensitive substring match, so it also hides
            # any top-level file whose name merely contains "images" (for example
            # "images_report.pdf"). Kept as-is to preserve behavior; confirm whether
            # only the images directory entry itself was meant to be excluded.
            if "images" in filename:
                continue
            files.append(filename)
    except ResourceNotFoundError as error:
        # A 404 means the user has no directory yet, which is expected for new
        # users, so it is not logged and the list is returned as collected.
        if error.status_code != 404:
            # logger.exception already records the active exception and traceback.
            # Passing the error as an extra positional argument without a matching
            # placeholder would make the log record fail to format.
            current_app.logger.exception("Error listing uploaded files")
    return jsonify(files), 200
409417

410418

@@ -559,6 +567,29 @@ async def setup_clients():
559567
enable_unauthenticated_access=AZURE_ENABLE_UNAUTHENTICATED_ACCESS,
560568
)
561569

570+
if USE_SPEECH_OUTPUT_AZURE:
571+
current_app.logger.info("USE_SPEECH_OUTPUT_AZURE is true, setting up Azure speech service")
572+
if not AZURE_SPEECH_SERVICE_ID or AZURE_SPEECH_SERVICE_ID == "":
573+
raise ValueError("Azure speech resource not configured correctly, missing AZURE_SPEECH_SERVICE_ID")
574+
if not AZURE_SPEECH_SERVICE_LOCATION or AZURE_SPEECH_SERVICE_LOCATION == "":
575+
raise ValueError("Azure speech resource not configured correctly, missing AZURE_SPEECH_SERVICE_LOCATION")
576+
current_app.config[CONFIG_SPEECH_SERVICE_ID] = AZURE_SPEECH_SERVICE_ID
577+
current_app.config[CONFIG_SPEECH_SERVICE_LOCATION] = AZURE_SPEECH_SERVICE_LOCATION
578+
current_app.config[CONFIG_SPEECH_SERVICE_VOICE] = AZURE_SPEECH_SERVICE_VOICE
579+
# Wait until token is needed to fetch for the first time
580+
current_app.config[CONFIG_SPEECH_SERVICE_TOKEN] = None
581+
582+
openai_client = setup_openai_client(
583+
openai_host=OPENAI_HOST,
584+
azure_credential=azure_credential,
585+
azure_openai_api_version=AZURE_OPENAI_API_VERSION,
586+
azure_openai_service=AZURE_OPENAI_SERVICE,
587+
azure_openai_custom_url=AZURE_OPENAI_CUSTOM_URL,
588+
azure_openai_api_key=AZURE_OPENAI_API_KEY_OVERRIDE,
589+
openai_api_key=OPENAI_API_KEY,
590+
openai_organization=OPENAI_ORGANIZATION,
591+
)
592+
562593
if USE_USER_UPLOAD:
563594
current_app.logger.info("USE_USER_UPLOAD is true, setting up user upload feature")
564595
if not AZURE_USERSTORAGE_ACCOUNT or not AZURE_USERSTORAGE_CONTAINER:
@@ -578,7 +609,12 @@ async def setup_clients():
578609
document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"),
579610
local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER", "").lower() == "true",
580611
local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER", "").lower() == "true",
612+
use_content_understanding=os.getenv("USE_CONTENT_UNDERSTANDING", "").lower() == "true",
613+
content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
581614
use_multimodal=USE_MULTIMODAL,
615+
openai_client=openai_client,
616+
openai_model=OPENAI_CHATGPT_MODEL,
617+
openai_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT if OPENAI_HOST == OpenAIHost.AZURE else None,
582618
)
583619
search_info = await setup_search_info(
584620
search_service=AZURE_SEARCH_SERVICE, index_name=AZURE_SEARCH_INDEX, azure_credential=azure_credential
@@ -608,63 +644,10 @@ async def setup_clients():
608644
embeddings=text_embeddings_service,
609645
image_embeddings=image_embeddings_service,
610646
search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
611-
blob_manager=user_blob_container_client,
647+
blob_manager=AdlsBlobManager(user_blob_container_client),
612648
)
613649
current_app.config[CONFIG_INGESTER] = ingester
614650

615-
# Used by the OpenAI SDK
616-
openai_client: AsyncOpenAI
617-
618-
if USE_SPEECH_OUTPUT_AZURE:
619-
current_app.logger.info("USE_SPEECH_OUTPUT_AZURE is true, setting up Azure speech service")
620-
if not AZURE_SPEECH_SERVICE_ID or AZURE_SPEECH_SERVICE_ID == "":
621-
raise ValueError("Azure speech resource not configured correctly, missing AZURE_SPEECH_SERVICE_ID")
622-
if not AZURE_SPEECH_SERVICE_LOCATION or AZURE_SPEECH_SERVICE_LOCATION == "":
623-
raise ValueError("Azure speech resource not configured correctly, missing AZURE_SPEECH_SERVICE_LOCATION")
624-
current_app.config[CONFIG_SPEECH_SERVICE_ID] = AZURE_SPEECH_SERVICE_ID
625-
current_app.config[CONFIG_SPEECH_SERVICE_LOCATION] = AZURE_SPEECH_SERVICE_LOCATION
626-
current_app.config[CONFIG_SPEECH_SERVICE_VOICE] = AZURE_SPEECH_SERVICE_VOICE
627-
# Wait until token is needed to fetch for the first time
628-
current_app.config[CONFIG_SPEECH_SERVICE_TOKEN] = None
629-
630-
if OPENAI_HOST.startswith("azure"):
631-
if OPENAI_HOST == "azure_custom":
632-
current_app.logger.info("OPENAI_HOST is azure_custom, setting up Azure OpenAI custom client")
633-
if not AZURE_OPENAI_CUSTOM_URL:
634-
raise ValueError("AZURE_OPENAI_CUSTOM_URL must be set when OPENAI_HOST is azure_custom")
635-
endpoint = AZURE_OPENAI_CUSTOM_URL
636-
else:
637-
current_app.logger.info("OPENAI_HOST is azure, setting up Azure OpenAI client")
638-
if not AZURE_OPENAI_SERVICE:
639-
raise ValueError("AZURE_OPENAI_SERVICE must be set when OPENAI_HOST is azure")
640-
endpoint = f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com"
641-
if AZURE_OPENAI_API_KEY_OVERRIDE:
642-
current_app.logger.info("AZURE_OPENAI_API_KEY_OVERRIDE found, using as api_key for Azure OpenAI client")
643-
openai_client = AsyncAzureOpenAI(
644-
api_version=AZURE_OPENAI_API_VERSION, azure_endpoint=endpoint, api_key=AZURE_OPENAI_API_KEY_OVERRIDE
645-
)
646-
else:
647-
current_app.logger.info("Using Azure credential (passwordless authentication) for Azure OpenAI client")
648-
openai_client = AsyncAzureOpenAI(
649-
api_version=AZURE_OPENAI_API_VERSION,
650-
azure_endpoint=endpoint,
651-
azure_ad_token_provider=azure_ai_token_provider,
652-
)
653-
elif OPENAI_HOST == "local":
654-
current_app.logger.info("OPENAI_HOST is local, setting up local OpenAI client for OPENAI_BASE_URL with no key")
655-
openai_client = AsyncOpenAI(
656-
base_url=os.environ["OPENAI_BASE_URL"],
657-
api_key="no-key-required",
658-
)
659-
else:
660-
current_app.logger.info(
661-
"OPENAI_HOST is not azure, setting up OpenAI client using OPENAI_API_KEY and OPENAI_ORGANIZATION environment variables"
662-
)
663-
openai_client = AsyncOpenAI(
664-
api_key=OPENAI_API_KEY,
665-
organization=OPENAI_ORGANIZATION,
666-
)
667-
668651
current_app.config[CONFIG_OPENAI_CLIENT] = openai_client
669652
current_app.config[CONFIG_SEARCH_CLIENT] = search_client
670653
current_app.config[CONFIG_AGENT_CLIENT] = agent_client

app/backend/prepdocs.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -436,10 +436,10 @@ async def main(strategy: Strategy, setup_index: bool = True):
436436
loop = asyncio.new_event_loop()
437437
asyncio.set_event_loop(loop)
438438

439-
openai_host = os.environ["OPENAI_HOST"]
439+
OPENAI_HOST = OpenAIHost(os.environ["OPENAI_HOST"])
440440
# Check for incompatibility
441441
# Agentic retrieval needs an Azure-hosted OpenAI service (azure or azure_custom)
442-
if openai_host != "azure" and use_agentic_retrieval:
442+
if use_agentic_retrieval and OPENAI_HOST not in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]:
443443
raise Exception("Agentic retrieval requires an Azure OpenAI chat completion service")
444444

445445
search_info = loop.run_until_complete(
@@ -475,15 +475,14 @@ async def main(strategy: Strategy, setup_index: bool = True):
475475
datalake_key=clean_key_if_exists(args.datalakekey),
476476
)
477477

478-
openai_host = OpenAIHost(os.environ["OPENAI_HOST"])
479478
# https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release
480479
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION") or "2024-06-01"
481480
emb_model_dimensions = 1536
482481
if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"):
483482
emb_model_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"])
484483
openai_embeddings_service = setup_embeddings_service(
485484
azure_credential=azd_credential,
486-
openai_host=openai_host,
485+
openai_host=OPENAI_HOST,
487486
emb_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"],
488487
emb_model_dimensions=emb_model_dimensions,
489488
azure_openai_service=os.getenv("AZURE_OPENAI_SERVICE"),
@@ -497,7 +496,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
497496
disable_batch_vectors=args.disablebatchvectors,
498497
)
499498
openai_client = setup_openai_client(
500-
openai_host=openai_host,
499+
openai_host=OPENAI_HOST,
501500
azure_credential=azd_credential,
502501
azure_openai_api_version=azure_openai_api_version,
503502
azure_openai_service=os.getenv("AZURE_OPENAI_SERVICE"),
@@ -543,7 +542,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
543542
content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
544543
openai_client=openai_client,
545544
openai_model=os.getenv("AZURE_OPENAI_CHATGPT_MODEL"),
546-
openai_deployment=os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT") if openai_host == OpenAIHost.AZURE else None,
545+
openai_deployment=os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT") if OPENAI_HOST == OpenAIHost.AZURE else None,
547546
)
548547

549548
image_embeddings_service = setup_image_embeddings_service(

0 commit comments

Comments
 (0)