|
1 | 1 | import logging |
2 | 2 | from http import HTTPStatus |
3 | 3 | from typing import List, Optional |
| 4 | +from urllib.parse import urlparse, urlunparse, unquote, quote |
4 | 5 |
|
| 6 | +import httpx |
5 | 7 | from fastapi import APIRouter, Body, File, Form, Header, HTTPException, Path as PathParam, Query, UploadFile |
6 | 8 | from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse |
7 | 9 |
|
|
12 | 14 |
|
13 | 15 | logger = logging.getLogger("file_management_app") |
14 | 16 |
|
| 17 | + |
| 18 | +def build_content_disposition_header(filename: str) -> str: |
| 19 | + """ |
| 20 | + Build Content-Disposition header with proper encoding for filenames containing non-ASCII characters. |
| 21 | + |
| 22 | + Uses RFC 5987 format to support UTF-8 encoded filenames: |
| 23 | + - filename: ASCII-compatible fallback (URL-encoded ASCII string) |
| 24 | + - filename*: UTF-8 encoded filename (RFC 5987 format) |
| 25 | + |
| 26 | + Args: |
| 27 | + filename: Original filename (may contain non-ASCII characters) |
| 28 | + |
| 29 | + Returns: |
| 30 | + Content-Disposition header value |
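| | + |
| | +    Example (illustrative; output derived from the encoding rules above): |
| | +        >>> build_content_disposition_header("résumé.pdf") |
| | +        'attachment; filename="download.pdf"; filename*=UTF-8\'\'r%C3%A9sum%C3%A9.pdf' |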
| 31 | + """ |
| 32 | + try: |
| 33 | + # Check if filename contains non-ASCII characters |
| 34 | + try: |
| 35 | + filename.encode('ascii') |
| 36 | + has_non_ascii = False |
| 37 | + except UnicodeEncodeError: |
| 38 | + has_non_ascii = True |
| 39 | + |
| 40 | + if has_non_ascii: |
| 41 | + # Use RFC 5987 format for UTF-8 filenames |
| 42 | + # Format: filename*=UTF-8''encoded_filename |
| 43 | + # URL-encode the filename for the filename* parameter |
| 44 | + encoded_filename = quote(filename, safe='') |
| 45 | + |
| 46 | + # Create ASCII-compatible fallback filename |
| 47 | + # Extract file extension if available |
| 48 | + import os |
| 49 | + _, ext = os.path.splitext(filename) |
| 50 | + # Use a generic ASCII name with the same extension |
| 51 | + fallback_name = f"download{ext}" if ext else "download" |
| 52 | + |
| 53 | + # Return header with both filename (ASCII fallback) and filename* (UTF-8) |
| 54 | + return f'attachment; filename="{fallback_name}"; filename*=UTF-8\'\'{encoded_filename}' |
| 55 | + else: |
| 56 | + # Pure ASCII filename, use simple format |
| 57 | + return f'attachment; filename="{filename}"' |
| 58 | + except Exception as e: |
| 59 | + logger.warning(f"Failed to encode filename '{filename}': {e}, using fallback") |
| 60 | + # Fallback: use generic name |
| 61 | +        return 'attachment; filename="download"' |
| 62 | + |
15 | 63 | # Create API router |
16 | 64 | file_management_runtime_router = APIRouter(prefix="/file") |
17 | 65 | file_management_config_router = APIRouter(prefix="/file") |
@@ -98,6 +146,62 @@ async def process_files( |
98 | 146 | ) |
99 | 147 |
|
100 | 148 |
|
| 149 | +@file_management_config_router.get("/download/{object_name:path}") |
| 150 | +async def get_storage_file( |
| 151 | + object_name: str = PathParam(..., description="File object name"), |
| 152 | + download: str = Query("ignore", description="How to get the file"), |
| 153 | + expires: int = Query(3600, description="URL validity period (seconds)"), |
| 154 | + filename: Optional[str] = Query(None, description="Original filename for download (optional)") |
| 155 | +): |
| 156 | + """ |
| 157 | + Get information, download link, or file stream for a single file |
| 158 | +
|
| 159 | + - **object_name**: File object name |
| 160 | + - **download**: Download mode: ignore (default, return file info), stream (return file stream), redirect (redirect to download URL) |
| 161 | + - **expires**: URL validity period in seconds (default 3600) |
| 162 | +    - **filename**: Original filename for download (optional; if omitted, the last segment of object_name is used) |
| 163 | +
|
| 164 | + Returns file information, download link, or file content |
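| | + |
| | +    Example (illustrative; object name and filename are hypothetical): |
| | +        GET /file/download/uploads/report.pdf?download=stream&filename=%E6%8A%A5%E5%91%8A.pdf |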
| 165 | + """ |
| 166 | + try: |
| 167 | +        logger.info(f"[get_storage_file] object_name={object_name}, download={download}, filename={filename}") |
| 168 | + if download == "redirect": |
| 169 | + # return a redirect download URL |
| 170 | + result = await get_file_url_impl(object_name=object_name, expires=expires) |
| 171 | + return RedirectResponse(url=result["url"]) |
| 172 | + elif download == "stream": |
| 173 | + # return a readable file stream |
| 174 | + file_stream, content_type = await get_file_stream_impl(object_name=object_name) |
| 175 | + logger.info(f"Streaming file: object_name={object_name}, content_type={content_type}") |
| 176 | + |
| 177 | + # Use provided filename or extract from object_name |
| 178 | + download_filename = filename |
| 179 | + if not download_filename: |
| 180 | +            # Extract filename from object_name (split("/")[-1] is the whole string when no slash is present) |
| 181 | +            download_filename = object_name.split("/")[-1] |
| 182 | + |
| 183 | + # Build Content-Disposition header with proper encoding for non-ASCII characters |
| 184 | + content_disposition = build_content_disposition_header(download_filename) |
| 185 | + |
| 186 | + return StreamingResponse( |
| 187 | + file_stream, |
| 188 | + media_type=content_type, |
| 189 | + headers={ |
| 190 | + "Content-Disposition": content_disposition |
| 191 | + } |
| 192 | + ) |
| 193 | + else: |
| 194 | + # return file metadata |
| 195 | + return await get_file_url_impl(object_name=object_name, expires=expires) |
| 196 | + except Exception as e: |
| 197 | + logger.error(f"Failed to get file: object_name={object_name}, error={str(e)}") |
| 198 | + raise HTTPException( |
| 199 | + status_code=HTTPStatus.INTERNAL_SERVER_ERROR, |
| 200 | + detail=f"Failed to get file information: {str(e)}" |
| 201 | + ) |
| 202 | + |
| 203 | + |
101 | 205 | @file_management_runtime_router.post("/storage") |
102 | 206 | async def storage_upload_files( |
103 | 207 | files: List[UploadFile] = File(..., description="List of files to upload"), |
@@ -158,43 +262,177 @@ async def get_storage_files( |
158 | 262 | ) |
159 | 263 |
|
160 | 264 |
|
161 | | -@file_management_config_router.get("/storage/{path}/{object_name}") |
162 | | -async def get_storage_file( |
163 | | - object_name: str = PathParam(..., description="File object name"), |
164 | | - download: str = Query("ignore", description="How to get the file"), |
165 | | - expires: int = Query(3600, description="URL validity period (seconds)") |
| 265 | +def _normalize_datamate_download_url(raw_url: str) -> str: |
| 266 | + """ |
| 267 | + Normalize Datamate download URL to ensure it follows /data-management/datasets/{datasetId}/files/{fileId}/download |
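| | + |
| | +    Example (illustrative; query and fragment are stripped): |
| | +        http://host:8080/api/data-management/datasets/42/files/7/preview?x=1 |
| | +        -> http://host:8080/api/data-management/datasets/42/files/7/download |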
| 268 | + """ |
| 269 | + parsed_url = urlparse(raw_url) |
| 270 | + path_segments = [segment for segment in parsed_url.path.split("/") if segment] |
| 271 | + |
| 272 | + if "data-management" not in path_segments: |
| 273 | + raise HTTPException( |
| 274 | + status_code=HTTPStatus.BAD_REQUEST, |
| 275 | + detail="Invalid Datamate URL: missing 'data-management' segment" |
| 276 | + ) |
| 277 | + |
| 278 | + try: |
| 279 | + dm_index = path_segments.index("data-management") |
| 280 | + datasets_index = path_segments.index("datasets", dm_index) |
| 281 | + dataset_id = path_segments[datasets_index + 1] |
| 282 | + files_index = path_segments.index("files", datasets_index) |
| 283 | + file_id = path_segments[files_index + 1] |
| 284 | + except (ValueError, IndexError): |
| 285 | + raise HTTPException( |
| 286 | + status_code=HTTPStatus.BAD_REQUEST, |
| 287 | + detail="Invalid Datamate URL: unable to parse dataset_id or file_id" |
| 288 | + ) |
| 289 | + |
| 290 | + prefix_segments = path_segments[:dm_index] |
| 291 | + prefix_path = "/" + "/".join(prefix_segments) if prefix_segments else "" |
| 292 | + normalized_path = f"{prefix_path}/data-management/datasets/{dataset_id}/files/{file_id}/download" |
| 293 | + |
| 294 | + normalized_url = urlunparse(( |
| 295 | + parsed_url.scheme, |
| 296 | + parsed_url.netloc, |
| 297 | + normalized_path, |
| 298 | + "", |
| 299 | + "", |
| 300 | + "" |
| 301 | + )) |
| 302 | + |
| 303 | + return normalized_url |
| 304 | + |
| 305 | + |
| 306 | +def _build_datamate_url_from_parts(base_url: str, dataset_id: str, file_id: str) -> str: |
| 307 | + """ |
| 308 | + Build Datamate download URL from individual parts |
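| | + |
| | +    Examples (illustrative; "/api" is appended to base_url when missing): |
| | +        ("http://host:8080", "42", "7")     -> http://host:8080/api/data-management/datasets/42/files/7/download |
| | +        ("http://host:8080/api", "42", "7") -> http://host:8080/api/data-management/datasets/42/files/7/download |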
| 309 | + """ |
| 310 | + if not base_url: |
| 311 | + raise HTTPException( |
| 312 | + status_code=HTTPStatus.BAD_REQUEST, |
| 313 | + detail="base_url is required when dataset_id and file_id are provided" |
| 314 | + ) |
| 315 | + |
| 316 | + parsed_base = urlparse(base_url) |
| 317 | + base_prefix = parsed_base.path.rstrip("/") |
| 318 | + |
| 319 | + if base_prefix and not base_prefix.endswith("/api"): |
| 320 | + if base_prefix.endswith("/"): |
| 321 | + base_prefix = f"{base_prefix}api" |
| 322 | + else: |
| 323 | + base_prefix = f"{base_prefix}/api" |
| 324 | + elif not base_prefix: |
| 325 | + base_prefix = "/api" |
| 326 | + |
| 327 | + normalized_path = f"{base_prefix}/data-management/datasets/{dataset_id}/files/{file_id}/download" |
| 328 | + |
| 329 | + return urlunparse(( |
| 330 | + parsed_base.scheme, |
| 331 | + parsed_base.netloc, |
| 332 | + normalized_path, |
| 333 | + "", |
| 334 | + "", |
| 335 | + "" |
| 336 | + )) |
| 337 | + |
| 338 | + |
| 339 | +@file_management_config_router.get("/datamate/download") |
| 340 | +async def download_datamate_file( |
| 341 | + url: Optional[str] = Query(None, description="Datamate file URL to download"), |
| 342 | + base_url: Optional[str] = Query(None, description="Datamate base server URL (e.g., http://host:port or http://host:port/api)"), |
| 343 | + dataset_id: Optional[str] = Query(None, description="Datamate dataset ID"), |
| 344 | + file_id: Optional[str] = Query(None, description="Datamate file ID"), |
| 345 | + filename: Optional[str] = Query(None, description="Optional filename for download"), |
| 346 | + authorization: Optional[str] = Header(None, alias="Authorization") |
166 | 347 | ): |
167 | 348 | """ |
168 | | - Get information, download link, or file stream for a single file |
| 349 | + Download file from Datamate knowledge base via HTTP URL |
169 | 350 |
|
170 | | - - **object_name**: File object name |
171 | | - - **download**: Download mode: ignore (default, return file info), stream (return file stream), redirect (redirect to download URL) |
172 | | - - **expires**: URL validity period in seconds (default 3600) |
| 351 | + - **url**: Full HTTP URL of the file to download (optional) |
| 352 | + - **base_url**: Base server URL (e.g., http://host:port or http://host:port/api) |
| 353 | + - **dataset_id**: Datamate dataset ID |
| 354 | + - **file_id**: Datamate file ID |
| 355 | + - **filename**: Optional filename for the download (extracted automatically if not provided) |
| 356 | +    - **authorization**: Optional authorization header to pass to the target URL |
173 | 357 |
|
174 | | - Returns file information, download link, or file content |
| 358 | + Returns file stream for download |
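| | + |
| | +    Example (illustrative; URL-encode base_url in a real request): |
| | +        GET /file/datamate/download?base_url=http%3A%2F%2Fhost%3A8080&dataset_id=42&file_id=7 |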
175 | 359 | """ |
176 | 360 | try: |
177 | | - if download == "redirect": |
178 | | - # return a redirect download URL |
179 | | - result = await get_file_url_impl(object_name=object_name, expires=expires) |
180 | | - return RedirectResponse(url=result["url"]) |
181 | | - elif download == "stream": |
182 | | - # return a readable file stream |
183 | | - file_stream, content_type = await get_file_stream_impl(object_name=object_name) |
| 361 | + if url: |
| 362 | + logger.info(f"[download_datamate_file] Using full URL: {url}") |
| 363 | + normalized_url = _normalize_datamate_download_url(url) |
| 364 | + elif base_url and dataset_id and file_id: |
| 365 | + logger.info(f"[download_datamate_file] Building URL from parts: base_url={base_url}, dataset_id={dataset_id}, file_id={file_id}") |
| 366 | + normalized_url = _build_datamate_url_from_parts(base_url, dataset_id, file_id) |
| 367 | + else: |
| 368 | + raise HTTPException( |
| 369 | + status_code=HTTPStatus.BAD_REQUEST, |
| 370 | + detail="Either url or (base_url, dataset_id, file_id) must be provided" |
| 371 | + ) |
| 372 | + |
| 373 | + logger.info(f"[download_datamate_file] Normalized download URL: {normalized_url}") |
| 374 | + logger.info(f"[download_datamate_file] Authorization header present: {authorization is not None}") |
| 375 | + |
| 376 | + headers = {} |
| 377 | + if authorization: |
| 378 | + headers["Authorization"] = authorization |
| 379 | + logger.debug(f"[download_datamate_file] Using authorization header: {authorization[:20]}...") |
| 380 | + headers["User-Agent"] = "Nexent-File-Downloader/1.0" |
| 381 | + |
| 382 | + logger.info(f"[download_datamate_file] Request headers: {list(headers.keys())}") |
| 383 | + |
| 384 | + async with httpx.AsyncClient(timeout=30.0) as client: |
| 385 | + response = await client.get(normalized_url, headers=headers, follow_redirects=True) |
| 386 | + logger.info(f"[download_datamate_file] Response status: {response.status_code}") |
| 387 | + |
| 388 | +            if response.status_code == HTTPStatus.NOT_FOUND: |
| 389 | + logger.error(f"[download_datamate_file] File not found at URL: {normalized_url}") |
| 390 | + logger.error(f"[download_datamate_file] Response headers: {dict(response.headers)}") |
| 391 | + raise HTTPException( |
| 392 | + status_code=HTTPStatus.NOT_FOUND, |
| 393 | + detail="File not found. Please verify dataset_id and file_id." |
| 394 | + ) |
| 395 | + |
| 396 | + response.raise_for_status() |
| 397 | + |
| 398 | + content_type = response.headers.get("Content-Type", "application/octet-stream") |
| 399 | + |
| 400 | + download_filename = filename |
| 401 | + if not download_filename: |
| 402 | + content_disposition = response.headers.get("Content-Disposition", "") |
| 403 | + if content_disposition: |
| 404 | + import re |
| 405 | +                    filename_match = re.search(r'filename="?([^";]+)"?', content_disposition)  # stop at '"' or ';' so trailing parameters are not captured |
| 406 | + if filename_match: |
| 407 | + download_filename = filename_match.group(1) |
| 408 | + |
| 409 | + if not download_filename: |
| 410 | + path = unquote(urlparse(normalized_url).path) |
| 411 | + download_filename = path.split('/')[-1] or "download" |
| 412 | + |
| 413 | + # Build Content-Disposition header with proper encoding for non-ASCII characters |
| 414 | + content_disposition = build_content_disposition_header(download_filename) |
| 415 | + |
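| | +            # Note: client.get() has already read the full body into memory |
| | +            # (response.content), so this "stream" only chunks an in-memory |
| | +            # buffer; large downloads would need httpx's streaming API instead. |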
184 | 416 | return StreamingResponse( |
185 | | - file_stream, |
| 417 | + iter([response.content]), |
186 | 418 | media_type=content_type, |
187 | 419 | headers={ |
188 | | - "Content-Disposition": f'inline; filename="{object_name}"' |
| 420 | + "Content-Disposition": content_disposition |
189 | 421 | } |
190 | 422 | ) |
191 | | - else: |
192 | | - # return file metadata |
193 | | - return await get_file_url_impl(object_name=object_name, expires=expires) |
| 423 | + except httpx.HTTPError as e: |
| 424 | + logger.error(f"Failed to download file from URL {url}: {str(e)}") |
| 425 | + raise HTTPException( |
| 426 | + status_code=HTTPStatus.BAD_GATEWAY, |
| 427 | + detail=f"Failed to download file from URL: {str(e)}" |
| 428 | + ) |
| 429 | + except HTTPException: |
| 430 | + raise |
194 | 431 | except Exception as e: |
| 432 | + logger.error(f"Failed to download datamate file: {str(e)}") |
195 | 433 | raise HTTPException( |
196 | 434 | status_code=HTTPStatus.INTERNAL_SERVER_ERROR, |
197 | | - detail=f"Failed to get file information: {str(e)}" |
| 435 | + detail=f"Failed to download file: {str(e)}" |
198 | 436 | ) |
199 | 437 |
|
200 | 438 |
|
|