Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 134 additions & 39 deletions backend/apps/file_management_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,19 @@
import re
import base64
from http import HTTPStatus
from typing import List, Optional
from typing import Annotated, List, Optional
from urllib.parse import urlparse, urlunparse, unquote, quote

import httpx
from fastapi import APIRouter, Body, File, Form, Header, HTTPException, Path as PathParam, Query, UploadFile
from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse
from starlette.background import BackgroundTask

from consts.exceptions import FileTooLargeException, NotFoundException, OfficeConversionException, UnsupportedFileTypeException
from consts.exceptions import FileTooLargeException, NotFoundException, UnsupportedFileTypeException
from consts.model import ProcessParams
from services.file_management_service import upload_to_minio, upload_files_impl, \
get_file_url_impl, get_file_stream_impl, delete_file_impl, list_files_impl, \
preview_file_impl
resolve_preview_file, get_preview_stream
from utils.file_management_utils import trigger_data_process

logger = logging.getLogger("file_management_app")
Expand Down Expand Up @@ -578,38 +579,20 @@ async def get_storage_file_batch_urls(
@file_management_config_router.get("/preview/{object_name:path}")
async def preview_file(
object_name: str = PathParam(..., description="File object name to preview"),
filename: Optional[str] = Query(None, description="Original filename for display (optional)")
filename: Annotated[Optional[str], Query(description="Original filename for display (optional)")] = None,
range_header: Annotated[Optional[str], Header(alias="range")] = None,
):
"""
Preview file inline in browser
Preview file inline in browser

- **object_name**: File object name in storage
- **filename**: Original filename for Content-Disposition header (optional)

Returns file stream with Content-Disposition: inline for browser preview

Supports HTTP Range requests (RFC 7233) for partial content delivery.
Returns 206 Partial Content when a valid Range header is present.
"""
try:
# Get file stream from preview service
file_stream, content_type = await preview_file_impl(object_name=object_name)

# Use provided filename or extract from object_name
display_filename = filename
if not display_filename:
display_filename = object_name.split("/")[-1] if "/" in object_name else object_name

# Build Content-Disposition header for inline display
content_disposition = build_content_disposition_header(display_filename, inline=True)

return StreamingResponse(
file_stream,
media_type=content_type,
headers={
"Content-Disposition": content_disposition,
"Cache-Control": "public, max-age=3600",
"ETag": f'"{object_name}"',
}
)

actual_name, content_type, total_size = await resolve_preview_file(object_name=object_name)
except FileTooLargeException as e:
logger.warning(f"[preview_file] File too large: object_name={object_name}, error={str(e)}")
raise HTTPException(
Expand All @@ -625,18 +608,130 @@ async def preview_file(
except UnsupportedFileTypeException as e:
logger.error(f"[preview_file] Unsupported file type: object_name={object_name}, error={str(e)}")
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST,
status_code=HTTPStatus.BAD_REQUEST,
detail=f"File format not supported for preview: {str(e)}"
)
except OfficeConversionException as e:
logger.error(f"[preview_file] Conversion failed: object_name={object_name}, error={str(e)}")
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
detail=f"Failed to preview file: {str(e)}"
)
except Exception as e:
logger.error(f"[preview_file] Unexpected error: object_name={object_name}, error={str(e)}")
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
detail=f"Failed to preview file: {str(e)}"
)
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
detail="Failed to preview file"
)

display_filename = filename or (object_name.split("/")[-1] if "/" in object_name else object_name)
content_disposition = build_content_disposition_header(display_filename, inline=True)

common_headers = {
"Content-Disposition": content_disposition,
"Accept-Ranges": "bytes",
"Cache-Control": "public, max-age=3600",
"ETag": f'"{object_name}"',
}

if total_size == 0:
return StreamingResponse(
iter([]),
status_code=HTTPStatus.OK,
media_type=content_type,
headers={
**common_headers,
"Content-Length": "0",
},
)

# Parse Range header
start, end = None, None
if range_header:
parsed = _parse_range_header(range_header, total_size)
if parsed is None:
return StreamingResponse(
iter([]),
status_code=HTTPStatus.REQUESTED_RANGE_NOT_SATISFIABLE,
headers={"Content-Range": f"bytes */{total_size}"},
)
start, end = parsed

try:
if start is not None:
# 206 Partial Content
stream = get_preview_stream(actual_name, start, end)
return StreamingResponse(
stream.iter_chunks(chunk_size=64 * 1024),
status_code=HTTPStatus.PARTIAL_CONTENT,
media_type=content_type,
background=BackgroundTask(stream.close),
headers={
**common_headers,
"Content-Range": f"bytes {start}-{end}/{total_size}",
"Content-Length": str(end - start + 1),
},
)
else:
# 200 Full Content — no Range header present.
stream = get_preview_stream(actual_name)
return StreamingResponse(
stream.iter_chunks(chunk_size=64 * 1024),
status_code=HTTPStatus.OK,
media_type=content_type,
background=BackgroundTask(stream.close),
headers={
**common_headers,
"Content-Length": str(total_size),
},
)
except NotFoundException as e:
logger.error(f"[preview_file] File not found when streaming: object_name={object_name}, error={str(e)}")
raise HTTPException(status_code=HTTPStatus.NOT_FOUND, detail=f"File not found: {object_name}")
except Exception as e:
logger.error(f"[preview_file] Unexpected error when streaming: object_name={object_name}, error={str(e)}")
raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail="Failed to preview file")


def _parse_range_header(range_header: str, total_size: int) -> Optional[tuple]:
"""
Parse an HTTP Range header and return (start, end) byte offsets (both inclusive).

Supports:
- bytes=start-end
- bytes=start- (to end of file)
- bytes=-suffix (last N bytes)

Returns None if the range is malformed or not satisfiable.
"""
try:
if total_size <= 0:
return None
if not range_header.startswith("bytes="):
return None
range_spec = range_header[6:].strip()
if "-" not in range_spec:
return None
start_str, end_str = range_spec.split("-", 1)
start_str = start_str.strip()
end_str = end_str.strip()

if start_str == "":
# Suffix range: bytes=-N
if not end_str:
return None
suffix = int(end_str)
start = max(0, total_size - suffix)
end = total_size - 1
elif end_str == "":
# Open-ended range: bytes=N-
start = int(start_str)
end = total_size - 1
else:
start = int(start_str)
end = int(end_str)

# Clamp end to last byte (RFC 7233 §2.1 allows end to exceed file size)
end = min(end, total_size - 1)

# Validate bounds
if start < 0 or start >= total_size or end < start:
return None

return start, end
except (ValueError, AttributeError):
return None
6 changes: 6 additions & 0 deletions backend/consts/const.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from enum import Enum
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
Expand Down Expand Up @@ -40,6 +41,11 @@ class VectorDatabaseType(str, Enum):
FILE_PREVIEW_SIZE_LIMIT = 100 * 1024 * 1024 # 100MB
# Limit concurrent Office-to-PDF conversions
MAX_CONCURRENT_CONVERSIONS = 5
# LibreOffice profile directory.
# Taken from the LIBREOFFICE_PROFILE_DIR environment variable; when that
# variable is not set, defaults to ~/.cache/nexent/libreoffice-profile under
# the current user's home directory.
LIBREOFFICE_PROFILE_DIR = os.getenv(
"LIBREOFFICE_PROFILE_DIR",
str(Path.home() / ".cache" / "nexent" / "libreoffice-profile"),
)
# Supported Office file MIME types
OFFICE_MIME_TYPES = [
'application/msword', # .doc
Expand Down
32 changes: 32 additions & 0 deletions backend/database/attachment_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,38 @@ def get_file_stream(object_name: str, bucket: Optional[str] = None) -> Optional[
return None


def get_file_stream_raw(object_name: str, bucket: Optional[str] = None) -> Optional[Any]:
    """
    Fetch the raw stream object for a stored file without buffering its content.

    Args:
        object_name: Object name in MinIO
        bucket: Bucket name; the default bucket is used when not specified

    Returns:
        The raw body stream returned by the storage client, or None when the
        client reports a failure
    """
    ok, payload = minio_client.get_file_stream(object_name, bucket)
    if not ok:
        # On failure the second element carries error details; callers only get None.
        return None
    return payload


def get_file_range(object_name: str, start: int, end: int, bucket: Optional[str] = None) -> Optional[Any]:
    """
    Fetch a byte-range stream for a stored file.

    Args:
        object_name: Object name in MinIO
        start: First byte offset to read (inclusive)
        end: Last byte offset to read (inclusive), matching HTTP Range semantics
        bucket: Bucket name; the default bucket is used when not specified

    Returns:
        The raw body stream returned by the storage client, or None when the
        client reports a failure
    """
    ok, payload = minio_client.get_file_range(object_name, start, end, bucket)
    if not ok:
        # On failure the second element carries error details; callers only get None.
        return None
    return payload


def get_content_type(file_path: str) -> str:
"""
Get content type based on file extension
Expand Down
15 changes: 15 additions & 0 deletions backend/database/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,21 @@ def get_file_stream(self, object_name: str, bucket: Optional[str] = None) -> Tup
"""
return self._storage_client.get_file_stream(object_name, bucket)

def get_file_range(self, object_name: str, start: int, end: int, bucket: Optional[str] = None) -> Tuple[bool, Any]:
    """
    Fetch a byte-range slice of a file by delegating to the storage client.

    Args:
        object_name: Object name
        start: First byte offset (inclusive)
        end: Last byte offset (inclusive), matching HTTP Range semantics
        bucket: Bucket name; the default bucket is used when not specified

    Returns:
        Tuple[bool, Any]: (True, raw_body_stream) on success, (False, error_str) on failure
    """
    storage = self._storage_client
    outcome = storage.get_file_range(object_name, start, end, bucket)
    return outcome

def file_exists(self, object_name: str, bucket: Optional[str] = None) -> bool:
"""
Check if file exists in MinIO
Expand Down
2 changes: 1 addition & 1 deletion backend/services/data_process_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,7 @@ async def convert_office_to_pdf_impl(self, object_name: str, pdf_object_name: st
original_filename = os.path.basename(object_name)
input_path = os.path.join(temp_dir, original_filename)
with open(input_path, 'wb') as f:
while chunk := original_stream.read(8192):
while chunk := original_stream.read(1024 * 1024):
f.write(chunk)

# Step 2: Local conversion using LibreOffice
Expand Down
Loading
Loading