|
14 | 14 | from openai.types.chat import ChatCompletionChunk |
15 | 15 | from flask import Flask, Response, request, Request, jsonify |
16 | 16 | from dotenv import load_dotenv |
17 | | -from urllib.parse import quote |
| 17 | +from urllib.parse import quote, urlparse |
18 | 18 | from backend.batch.utilities.helpers.env_helper import EnvHelper |
19 | 19 | from backend.batch.utilities.helpers.azure_search_helper import AzureSearchHelper |
20 | 20 | from backend.batch.utilities.helpers.orchestrator_helper import Orchestrator |
21 | 21 | from backend.batch.utilities.helpers.config.config_helper import ConfigHelper |
22 | 22 | from backend.batch.utilities.helpers.config.conversation_flow import ConversationFlow |
23 | 23 | from backend.api.chat_history import bp_chat_history_response |
24 | 24 | from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient |
| 25 | +from azure.core.exceptions import ClientAuthenticationError, ResourceNotFoundError, ServiceRequestError |
25 | 26 | from backend.batch.utilities.helpers.azure_credential_utils import get_azure_credential |
26 | 27 | from backend.batch.utilities.helpers.azure_blob_storage_client import ( |
27 | 28 | AzureBlobStorageClient, |
@@ -419,6 +420,96 @@ def static_file(path): |
419 | 420 | def health(): |
420 | 421 | return "OK" |
421 | 422 |
|
@app.route("/api/files/<filename>", methods=["GET"])
def get_file(filename):
    """
    Serve a single blob from the 'documents' container as an inline download.

    The filename comes straight from the URL and is untrusted, so it is
    validated before it is passed to storage or echoed into a response header.

    Args:
        filename (str): Name of the blob to retrieve. Must be non-empty, at
            most 255 characters, contain no '..', '/' or '\\', and consist
            only of ASCII letters, digits, dots, dashes, underscores and
            plain spaces.

    Returns:
        Flask Response: The file bytes with content-type, caching and security
        headers on success (200); otherwise a JSON ``{"error": ...}`` body with
        status 400 (invalid filename), 401 (storage authentication failed),
        404 (blob missing), 503 (storage request failed) or 500 (anything else).
    """
    # %r keeps control characters in the still-unvalidated name from forging
    # extra log lines.
    logger.info("File download request for: %r", filename)

    try:
        # Input validation - prevent path traversal and unauthorized access
        if not filename:
            logger.warning("Empty filename provided")
            return jsonify({"error": "Filename is required"}), 400

        # Prevent path traversal attacks
        if ".." in filename or "/" in filename or "\\" in filename:
            logger.warning(
                "Invalid filename with path traversal attempt: %r", filename
            )
            return jsonify({"error": "Invalid filename"}), 400

        # Validate filename length
        if len(filename) > 255:
            logger.warning("Filename too long: %r", filename)
            return jsonify({"error": "Filename too long"}), 400

        # Only allow safe characters (alphanumeric, dots, dashes, underscores,
        # plain spaces). A literal space is used instead of \s, and fullmatch
        # instead of ^...$, so tabs, CR/LF and a trailing newline are rejected
        # and can never reach the Content-Disposition header below.
        if not re.fullmatch(r"[a-zA-Z0-9._\- ]+", filename):
            logger.warning("Filename contains invalid characters: %r", filename)
            return jsonify({"error": "Invalid filename characters"}), 400

        # Initialize blob storage client with 'documents' container
        blob_client = AzureBlobStorageClient(container_name="documents")

        # Check if file exists
        if not blob_client.file_exists(filename):
            logger.info("File not found: %s", filename)
            return jsonify({"error": "File not found"}), 404

        # Download the file (the whole blob is held in memory)
        file_data = blob_client.download_file(filename)

        # Determine content type based on file extension
        content_type, _ = mimetypes.guess_type(filename)
        if not content_type:
            content_type = "application/octet-stream"

        file_size = len(file_data)
        logger.info(
            "File downloaded successfully: %s, size: %d bytes", filename, file_size
        )

        # For large files (>10MB), consider implementing streaming
        if file_size > 10 * 1024 * 1024:  # 10MB threshold
            logger.info(
                "Large file detected: %s, size: %d bytes", filename, file_size
            )

        # NOTE(review): 'public' caching lets shared caches store the file;
        # confirm that is intended for this container's contents.
        return Response(
            file_data,
            status=200,
            mimetype=content_type,
            headers={
                "Content-Disposition": f'inline; filename="{filename}"',
                "Content-Length": str(file_size),
                "Cache-Control": "public, max-age=3600",
                "X-Content-Type-Options": "nosniff",
                "X-Frame-Options": "DENY",
                "Content-Security-Policy": "default-src 'none'",
            },
        )

    except ClientAuthenticationError as e:
        logger.error("Authentication failed for file %s: %s", filename, str(e))
        return jsonify({"error": "Authentication failed"}), 401
    except ResourceNotFoundError:
        # Covers a blob removed between the existence check and the download.
        logger.info("File not found: %s", filename)
        return jsonify({"error": "File not found"}), 404
    except ServiceRequestError as e:
        logger.error("Storage service error for file %s: %s", filename, str(e))
        return jsonify({"error": "Storage service unavailable"}), 503
    except Exception as e:
        error_message = str(e)
        logger.exception(
            "Unexpected error downloading file %s: %s", filename, error_message
        )
        return jsonify({"error": "Internal server error"}), 500
| 512 | + |
422 | 513 | def conversation_azure_byod(): |
423 | 514 | logger.info("Method conversation_azure_byod started") |
424 | 515 | try: |
|
0 commit comments