30
30
from azure .storage .blob .aio import StorageStreamDownloader as BlobDownloader
31
31
from azure .storage .filedatalake .aio import FileSystemClient
32
32
from azure .storage .filedatalake .aio import StorageStreamDownloader as DatalakeDownloader
33
- from openai import AsyncAzureOpenAI , AsyncOpenAI
34
33
from opentelemetry .instrumentation .aiohttp_client import AioHttpClientInstrumentor
35
34
from opentelemetry .instrumentation .asgi import OpenTelemetryMiddleware
36
35
from opentelemetry .instrumentation .httpx import (
99
98
setup_embeddings_service ,
100
99
setup_file_processors ,
101
100
setup_image_embeddings_service ,
101
+ setup_openai_client ,
102
102
setup_search_info ,
103
103
)
104
+ from prepdocslib .blobmanager import AdlsBlobManager
104
105
from prepdocslib .filestrategy import UploadUserFileStrategy
105
106
from prepdocslib .listfilestrategy import File
106
107
@@ -358,22 +359,10 @@ async def upload(auth_claims: dict[str, Any]):
358
359
359
360
user_oid = auth_claims ["oid" ]
360
361
file = request_files .getlist ("file" )[0 ]
361
- user_blob_container_client : FileSystemClient = current_app .config [CONFIG_USER_BLOB_CONTAINER_CLIENT ]
362
- user_directory_client = user_blob_container_client .get_directory_client (user_oid )
363
- try :
364
- await user_directory_client .get_directory_properties ()
365
- except ResourceNotFoundError :
366
- current_app .logger .info ("Creating directory for user %s" , user_oid )
367
- await user_directory_client .create_directory ()
368
- await user_directory_client .set_access_control (owner = user_oid )
369
- file_client = user_directory_client .get_file_client (file .filename )
370
- file_io = file
371
- file_io .name = file .filename
372
- file_io = io .BufferedReader (file_io )
373
- await file_client .upload_data (file_io , overwrite = True , metadata = {"UploadedBy" : user_oid })
374
- file_io .seek (0 )
362
+ adls_manager = AdlsBlobManager (current_app .config [CONFIG_USER_BLOB_CONTAINER_CLIENT ])
363
+ file_url = await adls_manager .upload_blob (file , file .filename , user_oid )
375
364
ingester : UploadUserFileStrategy = current_app .config [CONFIG_INGESTER ]
376
- await ingester .add_file (File (content = file_io , acls = { "oids" : [ user_oid ]}, url = file_client . url ) )
365
+ await ingester .add_file (File (content = file , url = file_url ), user_oid = user_oid )
377
366
return jsonify ({"message" : "File uploaded successfully" }), 200
378
367
379
368
@@ -395,16 +384,35 @@ async def delete_uploaded(auth_claims: dict[str, Any]):
395
384
@bp .get ("/list_uploaded" )
396
385
@authenticated
397
386
async def list_uploaded (auth_claims : dict [str , Any ]):
387
+ """Lists the uploaded documents for the current user.
388
+ Only returns files directly in the user's directory, not in subdirectories.
389
+ Excludes image files and the images directory."""
398
390
user_oid = auth_claims ["oid" ]
399
391
user_blob_container_client : FileSystemClient = current_app .config [CONFIG_USER_BLOB_CONTAINER_CLIENT ]
400
392
files = []
401
393
try :
402
394
all_paths = user_blob_container_client .get_paths (path = user_oid )
403
395
async for path in all_paths :
404
- files .append (path .name .split ("/" , 1 )[1 ])
396
+ # Split path into parts (user_oid/filename or user_oid/directory/files)
397
+ path_parts = path .name .split ("/" , 1 )
398
+ if len (path_parts ) != 2 :
399
+ continue
400
+
401
+ filename = path_parts [1 ]
402
+ # Only include files that are:
403
+ # 1. Directly in the user's directory (no additional slashes)
404
+ # 2. Not image files
405
+ # 3. Not in a directory containing 'images'
406
+ if (
407
+ "/" not in filename
408
+ and not any (filename .lower ().endswith (ext ) for ext in [".png" , ".jpg" , ".jpeg" , ".gif" , ".bmp" ])
409
+ and "images" not in filename
410
+ ):
411
+ files .append (filename )
405
412
except ResourceNotFoundError as error :
406
413
if error .status_code != 404 :
407
414
current_app .logger .exception ("Error listing uploaded files" , error )
415
+ # Return empty list for 404 (no directory) as this is expected for new users
408
416
return jsonify (files ), 200
409
417
410
418
@@ -559,6 +567,29 @@ async def setup_clients():
559
567
enable_unauthenticated_access = AZURE_ENABLE_UNAUTHENTICATED_ACCESS ,
560
568
)
561
569
570
+ if USE_SPEECH_OUTPUT_AZURE :
571
+ current_app .logger .info ("USE_SPEECH_OUTPUT_AZURE is true, setting up Azure speech service" )
572
+ if not AZURE_SPEECH_SERVICE_ID or AZURE_SPEECH_SERVICE_ID == "" :
573
+ raise ValueError ("Azure speech resource not configured correctly, missing AZURE_SPEECH_SERVICE_ID" )
574
+ if not AZURE_SPEECH_SERVICE_LOCATION or AZURE_SPEECH_SERVICE_LOCATION == "" :
575
+ raise ValueError ("Azure speech resource not configured correctly, missing AZURE_SPEECH_SERVICE_LOCATION" )
576
+ current_app .config [CONFIG_SPEECH_SERVICE_ID ] = AZURE_SPEECH_SERVICE_ID
577
+ current_app .config [CONFIG_SPEECH_SERVICE_LOCATION ] = AZURE_SPEECH_SERVICE_LOCATION
578
+ current_app .config [CONFIG_SPEECH_SERVICE_VOICE ] = AZURE_SPEECH_SERVICE_VOICE
579
+ # Wait until token is needed to fetch for the first time
580
+ current_app .config [CONFIG_SPEECH_SERVICE_TOKEN ] = None
581
+
582
+ openai_client = setup_openai_client (
583
+ openai_host = OPENAI_HOST ,
584
+ azure_credential = azure_credential ,
585
+ azure_openai_api_version = AZURE_OPENAI_API_VERSION ,
586
+ azure_openai_service = AZURE_OPENAI_SERVICE ,
587
+ azure_openai_custom_url = AZURE_OPENAI_CUSTOM_URL ,
588
+ azure_openai_api_key = AZURE_OPENAI_API_KEY_OVERRIDE ,
589
+ openai_api_key = OPENAI_API_KEY ,
590
+ openai_organization = OPENAI_ORGANIZATION ,
591
+ )
592
+
562
593
if USE_USER_UPLOAD :
563
594
current_app .logger .info ("USE_USER_UPLOAD is true, setting up user upload feature" )
564
595
if not AZURE_USERSTORAGE_ACCOUNT or not AZURE_USERSTORAGE_CONTAINER :
@@ -578,7 +609,12 @@ async def setup_clients():
578
609
document_intelligence_service = os .getenv ("AZURE_DOCUMENTINTELLIGENCE_SERVICE" ),
579
610
local_pdf_parser = os .getenv ("USE_LOCAL_PDF_PARSER" , "" ).lower () == "true" ,
580
611
local_html_parser = os .getenv ("USE_LOCAL_HTML_PARSER" , "" ).lower () == "true" ,
612
+ use_content_understanding = os .getenv ("USE_CONTENT_UNDERSTANDING" , "" ).lower () == "true" ,
613
+ content_understanding_endpoint = os .getenv ("AZURE_CONTENTUNDERSTANDING_ENDPOINT" ),
581
614
use_multimodal = USE_MULTIMODAL ,
615
+ openai_client = openai_client ,
616
+ openai_model = OPENAI_CHATGPT_MODEL ,
617
+ openai_deployment = AZURE_OPENAI_CHATGPT_DEPLOYMENT if OPENAI_HOST == OpenAIHost .AZURE else None ,
582
618
)
583
619
search_info = await setup_search_info (
584
620
search_service = AZURE_SEARCH_SERVICE , index_name = AZURE_SEARCH_INDEX , azure_credential = azure_credential
@@ -608,63 +644,10 @@ async def setup_clients():
608
644
embeddings = text_embeddings_service ,
609
645
image_embeddings = image_embeddings_service ,
610
646
search_field_name_embedding = AZURE_SEARCH_FIELD_NAME_EMBEDDING ,
611
- blob_manager = user_blob_container_client ,
647
+ blob_manager = AdlsBlobManager ( user_blob_container_client ) ,
612
648
)
613
649
current_app .config [CONFIG_INGESTER ] = ingester
614
650
615
- # Used by the OpenAI SDK
616
- openai_client : AsyncOpenAI
617
-
618
- if USE_SPEECH_OUTPUT_AZURE :
619
- current_app .logger .info ("USE_SPEECH_OUTPUT_AZURE is true, setting up Azure speech service" )
620
- if not AZURE_SPEECH_SERVICE_ID or AZURE_SPEECH_SERVICE_ID == "" :
621
- raise ValueError ("Azure speech resource not configured correctly, missing AZURE_SPEECH_SERVICE_ID" )
622
- if not AZURE_SPEECH_SERVICE_LOCATION or AZURE_SPEECH_SERVICE_LOCATION == "" :
623
- raise ValueError ("Azure speech resource not configured correctly, missing AZURE_SPEECH_SERVICE_LOCATION" )
624
- current_app .config [CONFIG_SPEECH_SERVICE_ID ] = AZURE_SPEECH_SERVICE_ID
625
- current_app .config [CONFIG_SPEECH_SERVICE_LOCATION ] = AZURE_SPEECH_SERVICE_LOCATION
626
- current_app .config [CONFIG_SPEECH_SERVICE_VOICE ] = AZURE_SPEECH_SERVICE_VOICE
627
- # Wait until token is needed to fetch for the first time
628
- current_app .config [CONFIG_SPEECH_SERVICE_TOKEN ] = None
629
-
630
- if OPENAI_HOST .startswith ("azure" ):
631
- if OPENAI_HOST == "azure_custom" :
632
- current_app .logger .info ("OPENAI_HOST is azure_custom, setting up Azure OpenAI custom client" )
633
- if not AZURE_OPENAI_CUSTOM_URL :
634
- raise ValueError ("AZURE_OPENAI_CUSTOM_URL must be set when OPENAI_HOST is azure_custom" )
635
- endpoint = AZURE_OPENAI_CUSTOM_URL
636
- else :
637
- current_app .logger .info ("OPENAI_HOST is azure, setting up Azure OpenAI client" )
638
- if not AZURE_OPENAI_SERVICE :
639
- raise ValueError ("AZURE_OPENAI_SERVICE must be set when OPENAI_HOST is azure" )
640
- endpoint = f"https://{ AZURE_OPENAI_SERVICE } .openai.azure.com"
641
- if AZURE_OPENAI_API_KEY_OVERRIDE :
642
- current_app .logger .info ("AZURE_OPENAI_API_KEY_OVERRIDE found, using as api_key for Azure OpenAI client" )
643
- openai_client = AsyncAzureOpenAI (
644
- api_version = AZURE_OPENAI_API_VERSION , azure_endpoint = endpoint , api_key = AZURE_OPENAI_API_KEY_OVERRIDE
645
- )
646
- else :
647
- current_app .logger .info ("Using Azure credential (passwordless authentication) for Azure OpenAI client" )
648
- openai_client = AsyncAzureOpenAI (
649
- api_version = AZURE_OPENAI_API_VERSION ,
650
- azure_endpoint = endpoint ,
651
- azure_ad_token_provider = azure_ai_token_provider ,
652
- )
653
- elif OPENAI_HOST == "local" :
654
- current_app .logger .info ("OPENAI_HOST is local, setting up local OpenAI client for OPENAI_BASE_URL with no key" )
655
- openai_client = AsyncOpenAI (
656
- base_url = os .environ ["OPENAI_BASE_URL" ],
657
- api_key = "no-key-required" ,
658
- )
659
- else :
660
- current_app .logger .info (
661
- "OPENAI_HOST is not azure, setting up OpenAI client using OPENAI_API_KEY and OPENAI_ORGANIZATION environment variables"
662
- )
663
- openai_client = AsyncOpenAI (
664
- api_key = OPENAI_API_KEY ,
665
- organization = OPENAI_ORGANIZATION ,
666
- )
667
-
668
651
current_app .config [CONFIG_OPENAI_CLIENT ] = openai_client
669
652
current_app .config [CONFIG_SEARCH_CLIENT ] = search_client
670
653
current_app .config [CONFIG_AGENT_CLIENT ] = agent_client
0 commit comments