Skip to content

Commit 853cc72

Browse files
authored
Optional upload documents feature (#1395)
* Initial commit for upload docs feature * Working locally * Move prepdocs into backend, implement delete and list * Initial upload tests * Merge from main * More upload tests * Another test * E2E draft * Added to Ask page, more tests, documentation * Fix types * A few adjustments * Increase test coverage * Add tests for content path * Test for 404 * update link * Add upload test * Add doc intel role * Disable button when logged out * Update prepdocs.ps1 and docs with right path * Improve deletion and list flow, rm unused CSS, add a test * Correct product name, remove deleteRetentionPolicy * Add missing role for adding to index * Use blob container clients only * Type mismatch * Avoid infinite loop in remove_content * Handle the case of user directory not existing yet * Add check for directory client * Resolve typing mismatch * Need roles specific to each RG * Simplify test * fix start.ps1 * Add uploaded file error message --------- Co-authored-by: Matt Gotteiner <[email protected]>
1 parent 2d8eb61 commit 853cc72

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

77 files changed

+1300
-518
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,4 +146,6 @@ npm-debug.log*
146146
node_modules
147147
static/
148148

149-
data/**/*.md5
149+
data/**/*.md5
150+
151+
.DS_Store

.vscode/launch.json

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
"request": "launch",
1111
"module": "quart",
1212
"cwd": "${workspaceFolder}/app/backend",
13-
"python": "${workspaceFolder}/app/backend/backend_env/bin/python",
13+
"python": "${workspaceFolder}/.venv/bin/python",
1414
"env": {
1515
"QUART_APP": "main:app",
1616
"QUART_ENV": "development",
@@ -31,7 +31,16 @@
3131
"request": "launch",
3232
"command": "npm run dev",
3333
"cwd": "${workspaceFolder}/app/frontend",
34-
}
34+
},
35+
{
36+
"name": "Python: Debug Tests",
37+
"type": "debugpy",
38+
"request": "launch",
39+
"program": "${file}",
40+
"purpose": ["debug-test"],
41+
"console": "integratedTerminal",
42+
"justMyCode": false
43+
}
3544
],
3645
"inputs": [
3746
{

app/backend/app.py

Lines changed: 170 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
from azure.monitor.opentelemetry import configure_azure_monitor
1616
from azure.search.documents.aio import SearchClient
1717
from azure.search.documents.indexes.aio import SearchIndexClient
18-
from azure.storage.blob.aio import BlobServiceClient
18+
from azure.storage.blob.aio import ContainerClient
19+
from azure.storage.blob.aio import StorageStreamDownloader as BlobDownloader
20+
from azure.storage.filedatalake.aio import FileSystemClient
21+
from azure.storage.filedatalake.aio import StorageStreamDownloader as DatalakeDownloader
1922
from openai import AsyncAzureOpenAI, AsyncOpenAI
2023
from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
2124
from opentelemetry.instrumentation.asgi import OpenTelemetryMiddleware
@@ -49,14 +52,25 @@
4952
CONFIG_CHAT_APPROACH,
5053
CONFIG_CHAT_VISION_APPROACH,
5154
CONFIG_GPT4V_DEPLOYED,
55+
CONFIG_INGESTER,
5256
CONFIG_OPENAI_CLIENT,
5357
CONFIG_SEARCH_CLIENT,
5458
CONFIG_SEMANTIC_RANKER_DEPLOYED,
59+
CONFIG_USER_BLOB_CONTAINER_CLIENT,
60+
CONFIG_USER_UPLOAD_ENABLED,
5561
CONFIG_VECTOR_SEARCH_ENABLED,
5662
)
5763
from core.authentication import AuthenticationHelper
5864
from decorators import authenticated, authenticated_path
5965
from error import error_dict, error_response
66+
from prepdocs import (
67+
clean_key_if_exists,
68+
setup_embeddings_service,
69+
setup_file_processors,
70+
setup_search_info,
71+
)
72+
from prepdocslib.filestrategy import UploadUserFileStrategy
73+
from prepdocslib.listfilestrategy import File
6074

6175
bp = Blueprint("routes", __name__, static_folder="static")
6276
# Fix Windows registry issue with mimetypes
@@ -88,7 +102,7 @@ async def assets(path):
88102

89103
@bp.route("/content/<path>")
90104
@authenticated_path
91-
async def content_file(path: str):
105+
async def content_file(path: str, auth_claims: Dict[str, Any]):
92106
"""
93107
Serve content files from blob storage from within the app to keep the example self-contained.
94108
*** NOTE *** if you are using app services authentication, this route will return unauthorized to all users that are not logged in
@@ -102,12 +116,24 @@ async def content_file(path: str):
102116
path_parts = path.rsplit("#page=", 1)
103117
path = path_parts[0]
104118
logging.info("Opening file %s", path)
105-
blob_container_client = current_app.config[CONFIG_BLOB_CONTAINER_CLIENT]
119+
blob_container_client: ContainerClient = current_app.config[CONFIG_BLOB_CONTAINER_CLIENT]
120+
blob: Union[BlobDownloader, DatalakeDownloader]
106121
try:
107122
blob = await blob_container_client.get_blob_client(path).download_blob()
108123
except ResourceNotFoundError:
109-
logging.exception("Path not found: %s", path)
110-
abort(404)
124+
logging.info("Path not found in general Blob container: %s", path)
125+
if current_app.config[CONFIG_USER_UPLOAD_ENABLED]:
126+
try:
127+
user_oid = auth_claims["oid"]
128+
user_blob_container_client = current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT]
129+
user_directory_client: FileSystemClient = user_blob_container_client.get_directory_client(user_oid)
130+
file_client = user_directory_client.get_file_client(path)
131+
blob = await file_client.download_file()
132+
except ResourceNotFoundError:
133+
logging.exception("Path not found in DataLake: %s", path)
134+
abort(404)
135+
else:
136+
abort(404)
111137
if not blob.properties or not blob.properties.has_key("content_settings"):
112138
abort(404)
113139
mime_type = blob.properties["content_settings"]["content_type"]
@@ -205,15 +231,78 @@ def config():
205231
"showGPT4VOptions": current_app.config[CONFIG_GPT4V_DEPLOYED],
206232
"showSemanticRankerOption": current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED],
207233
"showVectorOption": current_app.config[CONFIG_VECTOR_SEARCH_ENABLED],
234+
"showUserUpload": current_app.config[CONFIG_USER_UPLOAD_ENABLED],
208235
}
209236
)
210237

211238

239+
@bp.post("/upload")
240+
@authenticated
241+
async def upload(auth_claims: dict[str, Any]):
242+
request_files = await request.files
243+
if "file" not in request_files:
244+
# If no files were included in the request, return an error response
245+
return jsonify({"message": "No file part in the request", "status": "failed"}), 400
246+
247+
user_oid = auth_claims["oid"]
248+
file = request_files.getlist("file")[0]
249+
user_blob_container_client: FileSystemClient = current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT]
250+
user_directory_client = user_blob_container_client.get_directory_client(user_oid)
251+
try:
252+
await user_directory_client.get_directory_properties()
253+
except ResourceNotFoundError:
254+
current_app.logger.info("Creating directory for user %s", user_oid)
255+
await user_directory_client.create_directory()
256+
await user_directory_client.set_access_control(owner=user_oid)
257+
file_client = user_directory_client.get_file_client(file.filename)
258+
file_io = file
259+
file_io.name = file.filename
260+
file_io = io.BufferedReader(file_io)
261+
await file_client.upload_data(file_io, overwrite=True, metadata={"UploadedBy": user_oid})
262+
file_io.seek(0)
263+
ingester = current_app.config[CONFIG_INGESTER]
264+
await ingester.add_file(File(content=file_io, acls={"oids": [user_oid]}))
265+
return jsonify({"message": "File uploaded successfully"}), 200
266+
267+
268+
@bp.post("/delete_uploaded")
269+
@authenticated
270+
async def delete_uploaded(auth_claims: dict[str, Any]):
271+
request_json = await request.get_json()
272+
filename = request_json.get("filename")
273+
user_oid = auth_claims["oid"]
274+
user_blob_container_client: FileSystemClient = current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT]
275+
user_directory_client = user_blob_container_client.get_directory_client(user_oid)
276+
file_client = user_directory_client.get_file_client(filename)
277+
await file_client.delete_file()
278+
ingester = current_app.config[CONFIG_INGESTER]
279+
await ingester.remove_file(filename, user_oid)
280+
return jsonify({"message": f"File {filename} deleted successfully"}), 200
281+
282+
283+
@bp.get("/list_uploaded")
284+
@authenticated
285+
async def list_uploaded(auth_claims: dict[str, Any]):
286+
user_oid = auth_claims["oid"]
287+
user_blob_container_client: FileSystemClient = current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT]
288+
files = []
289+
try:
290+
all_paths = user_blob_container_client.get_paths(path=user_oid)
291+
async for path in all_paths:
292+
files.append(path.name.split("/", 1)[1])
293+
except ResourceNotFoundError as error:
294+
if error.status_code != 404:
295+
current_app.logger.exception("Error listing uploaded files", error)
296+
return jsonify(files), 200
297+
298+
212299
@bp.before_app_serving
213300
async def setup_clients():
214301
# Replace these with your own values, either in environment variables or directly here
215302
AZURE_STORAGE_ACCOUNT = os.environ["AZURE_STORAGE_ACCOUNT"]
216303
AZURE_STORAGE_CONTAINER = os.environ["AZURE_STORAGE_CONTAINER"]
304+
AZURE_USERSTORAGE_ACCOUNT = os.environ.get("AZURE_USERSTORAGE_ACCOUNT")
305+
AZURE_USERSTORAGE_CONTAINER = os.environ.get("AZURE_USERSTORAGE_CONTAINER")
217306
AZURE_SEARCH_SERVICE = os.environ["AZURE_SEARCH_SERVICE"]
218307
AZURE_SEARCH_INDEX = os.environ["AZURE_SEARCH_INDEX"]
219308
AZURE_SEARCH_SECRET_NAME = os.getenv("AZURE_SEARCH_SECRET_NAME")
@@ -252,6 +341,7 @@ async def setup_clients():
252341
AZURE_SEARCH_SEMANTIC_RANKER = os.getenv("AZURE_SEARCH_SEMANTIC_RANKER", "free").lower()
253342

254343
USE_GPT4V = os.getenv("USE_GPT4V", "").lower() == "true"
344+
USE_USER_UPLOAD = os.getenv("USE_USER_UPLOAD", "").lower() == "true"
255345

256346
# Use the current user identity to authenticate with Azure OpenAI, AI Search and Blob Storage (no secrets needed,
257347
# just use 'az login' locally, and managed identity when deployed on Azure). If you need to use keys, use separate AzureKeyCredential instances with the
@@ -262,11 +352,12 @@ async def setup_clients():
262352
# Fetch any necessary secrets from Key Vault
263353
search_key = None
264354
if AZURE_KEY_VAULT_NAME:
265-
key_vault_client = SecretClient(
355+
async with SecretClient(
266356
vault_url=f"https://{AZURE_KEY_VAULT_NAME}.vault.azure.net", credential=azure_credential
267-
)
268-
search_key = AZURE_SEARCH_SECRET_NAME and (await key_vault_client.get_secret(AZURE_SEARCH_SECRET_NAME)).value
269-
await key_vault_client.close()
357+
) as key_vault_client:
358+
search_key = (
359+
AZURE_SEARCH_SECRET_NAME and (await key_vault_client.get_secret(AZURE_SEARCH_SECRET_NAME)).value
360+
)
270361

271362
# Set up clients for AI Search and Storage
272363
search_credential: Union[AsyncTokenCredential, AzureKeyCredential] = (
@@ -277,19 +368,22 @@ async def setup_clients():
277368
index_name=AZURE_SEARCH_INDEX,
278369
credential=search_credential,
279370
)
280-
search_index_client = SearchIndexClient(
281-
endpoint=f"https://{AZURE_SEARCH_SERVICE}.search.windows.net",
282-
credential=search_credential,
283-
)
284371

285-
blob_client = BlobServiceClient(
286-
account_url=f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net", credential=azure_credential
372+
blob_container_client = ContainerClient(
373+
f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net", AZURE_STORAGE_CONTAINER, credential=azure_credential
287374
)
288-
blob_container_client = blob_client.get_container_client(AZURE_STORAGE_CONTAINER)
289375

290376
# Set up authentication helper
377+
search_index = None
378+
if AZURE_USE_AUTHENTICATION:
379+
search_index_client = SearchIndexClient(
380+
endpoint=f"https://{AZURE_SEARCH_SERVICE}.search.windows.net",
381+
credential=search_credential,
382+
)
383+
search_index = await search_index_client.get_index(AZURE_SEARCH_INDEX)
384+
await search_index_client.close()
291385
auth_helper = AuthenticationHelper(
292-
search_index=(await search_index_client.get_index(AZURE_SEARCH_INDEX)) if AZURE_USE_AUTHENTICATION else None,
386+
search_index=search_index,
293387
use_authentication=AZURE_USE_AUTHENTICATION,
294388
server_app_id=AZURE_SERVER_APP_ID,
295389
server_app_secret=AZURE_SERVER_APP_SECRET,
@@ -298,6 +392,45 @@ async def setup_clients():
298392
require_access_control=AZURE_ENFORCE_ACCESS_CONTROL,
299393
)
300394

395+
if USE_USER_UPLOAD:
396+
current_app.logger.info("USE_USER_UPLOAD is true, setting up user upload feature")
397+
user_blob_container_client = FileSystemClient(
398+
f"https://{AZURE_USERSTORAGE_ACCOUNT}.dfs.core.windows.net",
399+
AZURE_USERSTORAGE_CONTAINER,
400+
credential=azure_credential,
401+
)
402+
current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT] = user_blob_container_client
403+
404+
# Set up ingester
405+
file_processors = setup_file_processors(
406+
azure_credential=azure_credential,
407+
document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"),
408+
local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER", "").lower() == "true",
409+
local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER", "").lower() == "true",
410+
search_images=USE_GPT4V,
411+
)
412+
search_info = await setup_search_info(
413+
search_service=AZURE_SEARCH_SERVICE,
414+
index_name=AZURE_SEARCH_INDEX,
415+
azure_credential=azure_credential,
416+
search_key=clean_key_if_exists(search_key),
417+
)
418+
text_embeddings_service = setup_embeddings_service(
419+
azure_credential=azure_credential,
420+
openai_host=OPENAI_HOST,
421+
openai_model_name=OPENAI_EMB_MODEL,
422+
openai_service=AZURE_OPENAI_SERVICE,
423+
openai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
424+
openai_dimensions=OPENAI_EMB_DIMENSIONS,
425+
openai_key=clean_key_if_exists(OPENAI_API_KEY),
426+
openai_org=OPENAI_ORGANIZATION,
427+
disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false",
428+
)
429+
ingester = UploadUserFileStrategy(
430+
search_info=search_info, embeddings=text_embeddings_service, file_processors=file_processors
431+
)
432+
current_app.config[CONFIG_INGESTER] = ingester
433+
301434
# Used by the OpenAI SDK
302435
openai_client: AsyncOpenAI
303436

@@ -335,6 +468,7 @@ async def setup_clients():
335468
current_app.config[CONFIG_GPT4V_DEPLOYED] = bool(USE_GPT4V)
336469
current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED] = AZURE_SEARCH_SEMANTIC_RANKER != "disabled"
337470
current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] = os.getenv("USE_VECTORS", "").lower() != "false"
471+
current_app.config[CONFIG_USER_UPLOAD_ENABLED] = bool(USE_USER_UPLOAD)
338472

339473
# Various approaches to integrate GPT and external knowledge, most applications will use a single one of these patterns
340474
# or some derivative, here we include several for exploration purposes
@@ -353,7 +487,24 @@ async def setup_clients():
353487
query_speller=AZURE_SEARCH_QUERY_SPELLER,
354488
)
355489

490+
current_app.config[CONFIG_CHAT_APPROACH] = ChatReadRetrieveReadApproach(
491+
search_client=search_client,
492+
openai_client=openai_client,
493+
auth_helper=auth_helper,
494+
chatgpt_model=OPENAI_CHATGPT_MODEL,
495+
chatgpt_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
496+
embedding_model=OPENAI_EMB_MODEL,
497+
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
498+
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
499+
sourcepage_field=KB_FIELDS_SOURCEPAGE,
500+
content_field=KB_FIELDS_CONTENT,
501+
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
502+
query_speller=AZURE_SEARCH_QUERY_SPELLER,
503+
)
504+
356505
if USE_GPT4V:
506+
current_app.logger.info("USE_GPT4V is true, setting up GPT4V approach")
507+
357508
token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
358509

359510
current_app.config[CONFIG_ASK_VISION_APPROACH] = RetrieveThenReadVisionApproach(
@@ -392,26 +543,13 @@ async def setup_clients():
392543
query_speller=AZURE_SEARCH_QUERY_SPELLER,
393544
)
394545

395-
current_app.config[CONFIG_CHAT_APPROACH] = ChatReadRetrieveReadApproach(
396-
search_client=search_client,
397-
openai_client=openai_client,
398-
auth_helper=auth_helper,
399-
chatgpt_model=OPENAI_CHATGPT_MODEL,
400-
chatgpt_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
401-
embedding_model=OPENAI_EMB_MODEL,
402-
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
403-
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
404-
sourcepage_field=KB_FIELDS_SOURCEPAGE,
405-
content_field=KB_FIELDS_CONTENT,
406-
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
407-
query_speller=AZURE_SEARCH_QUERY_SPELLER,
408-
)
409-
410546

411547
@bp.after_app_serving
412548
async def close_clients():
413549
await current_app.config[CONFIG_SEARCH_CLIENT].close()
414550
await current_app.config[CONFIG_BLOB_CONTAINER_CLIENT].close()
551+
if current_app.config.get(CONFIG_USER_BLOB_CONTAINER_CLIENT):
552+
await current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT].close()
415553

416554

417555
def create_app():

app/backend/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@
55
CONFIG_CHAT_VISION_APPROACH = "chat_vision_approach"
66
CONFIG_CHAT_APPROACH = "chat_approach"
77
CONFIG_BLOB_CONTAINER_CLIENT = "blob_container_client"
8+
CONFIG_USER_UPLOAD_ENABLED = "user_upload_enabled"
9+
CONFIG_USER_BLOB_CONTAINER_CLIENT = "user_blob_container_client"
810
CONFIG_AUTH_CLIENT = "auth_client"
911
CONFIG_GPT4V_DEPLOYED = "gpt4v_deployed"
1012
CONFIG_SEMANTIC_RANKER_DEPLOYED = "semantic_ranker_deployed"
1113
CONFIG_VECTOR_SEARCH_ENABLED = "vector_search_enabled"
1214
CONFIG_SEARCH_CLIENT = "search_client"
1315
CONFIG_OPENAI_CLIENT = "openai_client"
16+
CONFIG_INGESTER = "ingester"

app/backend/decorators.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from error import error_response
1010

1111

12-
def authenticated_path(route_fn: Callable[[str], Any]):
12+
def authenticated_path(route_fn: Callable[[str, Dict[str, Any]], Any]):
1313
"""
1414
Decorator for routes that request a specific file that might require access control enforcement
1515
"""
@@ -32,7 +32,7 @@ async def auth_handler(path=""):
3232
if not authorized:
3333
abort(403)
3434

35-
return await route_fn(path)
35+
return await route_fn(path, auth_claims)
3636

3737
return auth_handler
3838

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 commit comments

Comments
 (0)