Skip to content

Commit 9f6444c

Browse files
committed
✨ 1. Knowledge base tracing results support downloading.
2. File download requests are made to the backend interface, instead of directly accessing Minio.
1 parent 23f5dd4 commit 9f6444c

File tree

11 files changed

+1108
-142
lines changed

11 files changed

+1108
-142
lines changed

backend/apps/file_management_app.py

Lines changed: 261 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import logging
22
from http import HTTPStatus
33
from typing import List, Optional
4+
from urllib.parse import urlparse, urlunparse, unquote, quote
45

6+
import httpx
57
from fastapi import APIRouter, Body, File, Form, Header, HTTPException, Path as PathParam, Query, UploadFile
68
from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse
79

@@ -12,6 +14,52 @@
1214

1315
logger = logging.getLogger("file_management_app")
1416

17+
18+
def build_content_disposition_header(filename: str) -> str:
    """
    Build a Content-Disposition header value with proper encoding for
    filenames containing non-ASCII characters.

    Uses the RFC 5987 / RFC 6266 format to support UTF-8 encoded filenames:
    - filename: ASCII-compatible fallback (generic name keeping the extension)
    - filename*: UTF-8 percent-encoded filename (RFC 5987 format)

    Args:
        filename: Original filename (may contain non-ASCII characters)

    Returns:
        Content-Disposition header value
    """
    try:
        # Detect non-ASCII characters with an encode probe (EAFP).
        try:
            filename.encode('ascii')
            has_non_ascii = False
        except UnicodeEncodeError:
            has_non_ascii = True

        if has_non_ascii:
            # RFC 5987 format: filename*=UTF-8''<percent-encoded-name>.
            # safe='' percent-encodes every reserved character, including '/'.
            encoded_filename = quote(filename, safe='')

            # ASCII fallback keeps only the extension so legacy clients that
            # ignore filename* still get a usable name.
            import os
            _, ext = os.path.splitext(filename)
            fallback_name = f"download{ext}" if ext else "download"

            # Header carries both filename (ASCII fallback) and filename* (UTF-8).
            return f'attachment; filename="{fallback_name}"; filename*=UTF-8\'\'{encoded_filename}'
        else:
            # Pure ASCII filename: the simple form is universally supported.
            return f'attachment; filename="{filename}"'
    except Exception as e:
        # Defensive catch-all: header building must never break a download.
        logger.warning(f"Failed to encode filename '{filename}': {e}, using fallback")
        return 'attachment; filename="download"'
62+
1563
# Create API router
1664
file_management_runtime_router = APIRouter(prefix="/file")
1765
file_management_config_router = APIRouter(prefix="/file")
@@ -98,6 +146,62 @@ async def process_files(
98146
)
99147

100148

149+
@file_management_config_router.get("/download/{object_name:path}")
async def get_storage_file(
    object_name: str = PathParam(..., description="File object name"),
    download: str = Query("ignore", description="How to get the file"),
    expires: int = Query(3600, description="URL validity period (seconds)"),
    filename: Optional[str] = Query(None, description="Original filename for download (optional)")
):
    """
    Get information, download link, or file stream for a single file

    - **object_name**: File object name
    - **download**: Download mode: ignore (default, return file info), stream (return file stream), redirect (redirect to download URL)
    - **expires**: URL validity period in seconds (default 3600)
    - **filename**: Original filename for download (optional, if not provided, will use object_name)

    Returns file information, download link, or file content
    """
    try:
        logger.info(f"[get_storage_file] Route matched! object_name={object_name}, download={download}, filename={filename}")
        if download == "redirect":
            # Redirect the client straight to a presigned download URL.
            result = await get_file_url_impl(object_name=object_name, expires=expires)
            return RedirectResponse(url=result["url"])
        elif download == "stream":
            # Proxy the file content through this backend as a stream.
            file_stream, content_type = await get_file_stream_impl(object_name=object_name)
            logger.info(f"Streaming file: object_name={object_name}, content_type={content_type}")

            # Prefer the caller-supplied filename; otherwise fall back to the
            # last path segment of the object name (split handles the
            # no-slash case by returning the whole string).
            download_filename = filename
            if not download_filename:
                download_filename = object_name.split("/")[-1]

            # RFC 5987-aware header so non-ASCII names survive the download.
            content_disposition = build_content_disposition_header(download_filename)

            return StreamingResponse(
                file_stream,
                media_type=content_type,
                headers={
                    "Content-Disposition": content_disposition
                }
            )
        else:
            # Default ("ignore"): return file metadata / presigned URL info.
            return await get_file_url_impl(object_name=object_name, expires=expires)
    except Exception as e:
        logger.error(f"Failed to get file: object_name={object_name}, error={str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            detail=f"Failed to get file information: {str(e)}"
        )
202+
203+
204+
101205
@file_management_runtime_router.post("/storage")
102206
async def storage_upload_files(
103207
files: List[UploadFile] = File(..., description="List of files to upload"),
@@ -158,43 +262,177 @@ async def get_storage_files(
158262
)
159263

160264

161-
@file_management_config_router.get("/storage/{path}/{object_name}")
162-
async def get_storage_file(
163-
object_name: str = PathParam(..., description="File object name"),
164-
download: str = Query("ignore", description="How to get the file"),
165-
expires: int = Query(3600, description="URL validity period (seconds)")
265+
def _normalize_datamate_download_url(raw_url: str) -> str:
266+
"""
267+
Normalize Datamate download URL to ensure it follows /data-management/datasets/{datasetId}/files/{fileId}/download
268+
"""
269+
parsed_url = urlparse(raw_url)
270+
path_segments = [segment for segment in parsed_url.path.split("/") if segment]
271+
272+
if "data-management" not in path_segments:
273+
raise HTTPException(
274+
status_code=HTTPStatus.BAD_REQUEST,
275+
detail="Invalid Datamate URL: missing 'data-management' segment"
276+
)
277+
278+
try:
279+
dm_index = path_segments.index("data-management")
280+
datasets_index = path_segments.index("datasets", dm_index)
281+
dataset_id = path_segments[datasets_index + 1]
282+
files_index = path_segments.index("files", datasets_index)
283+
file_id = path_segments[files_index + 1]
284+
except (ValueError, IndexError):
285+
raise HTTPException(
286+
status_code=HTTPStatus.BAD_REQUEST,
287+
detail="Invalid Datamate URL: unable to parse dataset_id or file_id"
288+
)
289+
290+
prefix_segments = path_segments[:dm_index]
291+
prefix_path = "/" + "/".join(prefix_segments) if prefix_segments else ""
292+
normalized_path = f"{prefix_path}/data-management/datasets/{dataset_id}/files/{file_id}/download"
293+
294+
normalized_url = urlunparse((
295+
parsed_url.scheme,
296+
parsed_url.netloc,
297+
normalized_path,
298+
"",
299+
"",
300+
""
301+
))
302+
303+
return normalized_url
304+
305+
306+
def _build_datamate_url_from_parts(base_url: str, dataset_id: str, file_id: str) -> str:
307+
"""
308+
Build Datamate download URL from individual parts
309+
"""
310+
if not base_url:
311+
raise HTTPException(
312+
status_code=HTTPStatus.BAD_REQUEST,
313+
detail="base_url is required when dataset_id and file_id are provided"
314+
)
315+
316+
parsed_base = urlparse(base_url)
317+
base_prefix = parsed_base.path.rstrip("/")
318+
319+
if base_prefix and not base_prefix.endswith("/api"):
320+
if base_prefix.endswith("/"):
321+
base_prefix = f"{base_prefix}api"
322+
else:
323+
base_prefix = f"{base_prefix}/api"
324+
elif not base_prefix:
325+
base_prefix = "/api"
326+
327+
normalized_path = f"{base_prefix}/data-management/datasets/{dataset_id}/files/{file_id}/download"
328+
329+
return urlunparse((
330+
parsed_base.scheme,
331+
parsed_base.netloc,
332+
normalized_path,
333+
"",
334+
"",
335+
""
336+
))
337+
338+
339+
@file_management_config_router.get("/datamate/download")
async def download_datamate_file(
    url: Optional[str] = Query(None, description="Datamate file URL to download"),
    base_url: Optional[str] = Query(None, description="Datamate base server URL (e.g., http://host:port or http://host:port/api)"),
    dataset_id: Optional[str] = Query(None, description="Datamate dataset ID"),
    file_id: Optional[str] = Query(None, description="Datamate file ID"),
    filename: Optional[str] = Query(None, description="Optional filename for download"),
    authorization: Optional[str] = Header(None, alias="Authorization")
):
    """
    Download file from Datamate knowledge base via HTTP URL

    - **url**: Full HTTP URL of the file to download (optional)
    - **base_url**: Base server URL (e.g., http://host:port or http://host:port/api)
    - **dataset_id**: Datamate dataset ID
    - **file_id**: Datamate file ID
    - **filename**: Optional filename for the download (extracted automatically if not provided)
    - **authorization**: Optional authorization header to pass to the target URL

    Returns file stream for download
    """
    try:
        # Resolve the target URL: either normalize a caller-supplied full URL
        # or build one from (base_url, dataset_id, file_id).
        if url:
            logger.info(f"[download_datamate_file] Using full URL: {url}")
            normalized_url = _normalize_datamate_download_url(url)
        elif base_url and dataset_id and file_id:
            logger.info(f"[download_datamate_file] Building URL from parts: base_url={base_url}, dataset_id={dataset_id}, file_id={file_id}")
            normalized_url = _build_datamate_url_from_parts(base_url, dataset_id, file_id)
        else:
            raise HTTPException(
                status_code=HTTPStatus.BAD_REQUEST,
                detail="Either url or (base_url, dataset_id, file_id) must be provided"
            )

        logger.info(f"[download_datamate_file] Normalized download URL: {normalized_url}")
        logger.info(f"[download_datamate_file] Authorization header present: {authorization is not None}")

        # Forward the caller's Authorization header so protected files resolve.
        headers = {}
        if authorization:
            headers["Authorization"] = authorization
            logger.debug(f"[download_datamate_file] Using authorization header: {authorization[:20]}...")
        headers["User-Agent"] = "Nexent-File-Downloader/1.0"

        logger.info(f"[download_datamate_file] Request headers: {list(headers.keys())}")

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(normalized_url, headers=headers, follow_redirects=True)
            logger.info(f"[download_datamate_file] Response status: {response.status_code}")

            # Translate an upstream 404 into a clear client-facing error.
            if response.status_code == 404:
                logger.error(f"[download_datamate_file] File not found at URL: {normalized_url}")
                logger.error(f"[download_datamate_file] Response headers: {dict(response.headers)}")
                raise HTTPException(
                    status_code=HTTPStatus.NOT_FOUND,
                    detail="File not found. Please verify dataset_id and file_id."
                )

            response.raise_for_status()

            content_type = response.headers.get("Content-Type", "application/octet-stream")

            # Filename resolution order: explicit query parameter, upstream
            # Content-Disposition header, last segment of the URL path.
            download_filename = filename
            if not download_filename:
                upstream_disposition = response.headers.get("Content-Disposition", "")
                if upstream_disposition:
                    import re
                    # NOTE(review): this only matches the plain filename="..."
                    # form, not RFC 5987 filename*= — acceptable for Datamate
                    # responses, confirm if other upstreams are added.
                    filename_match = re.search(r'filename="?(.+?)"?$', upstream_disposition)
                    if filename_match:
                        download_filename = filename_match.group(1)

            if not download_filename:
                path = unquote(urlparse(normalized_url).path)
                download_filename = path.split('/')[-1] or "download"

            # Build Content-Disposition header with proper encoding for non-ASCII characters
            content_disposition = build_content_disposition_header(download_filename)

            # NOTE(review): response.content buffers the whole file in memory
            # before re-streaming; fine for typical documents, revisit for
            # very large files (httpx client.stream would avoid the copy).
            return StreamingResponse(
                iter([response.content]),
                media_type=content_type,
                headers={
                    "Content-Disposition": content_disposition
                }
            )
    except httpx.HTTPError as e:
        # Network / protocol failures talking to Datamate surface as 502.
        logger.error(f"Failed to download file from URL {url}: {str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.BAD_GATEWAY,
            detail=f"Failed to download file from URL: {str(e)}"
        )
    except HTTPException:
        # Re-raise deliberate HTTP errors unchanged.
        raise
    except Exception as e:
        logger.error(f"Failed to download datamate file: {str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            detail=f"Failed to download file: {str(e)}"
        )
199437

200438

0 commit comments

Comments
 (0)