diff --git a/aperag/api/openapi.merged.yaml b/aperag/api/openapi.merged.yaml deleted file mode 100644 index e69de29bb..000000000 diff --git a/aperag/api/openapi.yaml b/aperag/api/openapi.yaml index ecb62d770..f4a3a7862 100644 --- a/aperag/api/openapi.yaml +++ b/aperag/api/openapi.yaml @@ -67,6 +67,8 @@ paths: $ref: './paths/collections.yaml#/document_preview' /collections/{collection_id}/documents/{document_id}/object: $ref: './paths/collections.yaml#/document_object' + /collections/{collection_id}/documents/{document_id}/download: + $ref: './paths/collections.yaml#/document_download' /collections/{collection_id}/documents/upload: $ref: './paths/collections.yaml#/upload_document' /collections/{collection_id}/documents/confirm: diff --git a/aperag/api/paths/collections.yaml b/aperag/api/paths/collections.yaml index 2c39ab020..402bc4ad4 100644 --- a/aperag/api/paths/collections.yaml +++ b/aperag/api/paths/collections.yaml @@ -398,6 +398,94 @@ document_object: schema: $ref: '../components/schemas/common.yaml#/failResponse' +document_download: + get: + summary: Download document file + description: | + Download the original document file. + Returns the file as a streaming response with appropriate Content-Type and Content-Disposition headers. + The file is streamed through the backend to support internal network deployments and maintain access control. + + **Document Lifecycle and Download Availability:** + - UPLOADED: Downloadable (temporary status, auto-deleted after 24 hours if not confirmed) + - PENDING/RUNNING/COMPLETE/FAILED: Downloadable (permanent, not auto-deleted) + - EXPIRED: Not downloadable (file deleted by cleanup task) + - DELETED: Not downloadable (soft-deleted by user) + + **Auto-Cleanup Mechanism:** + A scheduled task runs every 10 minutes to clean up documents in UPLOADED status that are older than 24 hours. + Once confirmed, documents will never be auto-deleted. 
+ operationId: download_document + tags: + - documents + security: + - BearerAuth: [] + parameters: + - name: collection_id + in: path + required: true + schema: + type: string + description: Collection ID + - name: document_id + in: path + required: true + schema: + type: string + description: Document ID + responses: + '200': + description: Document file stream + content: + application/octet-stream: + schema: + type: string + format: binary + headers: + Content-Type: + description: MIME type of the document (e.g., application/pdf, text/plain) + schema: + type: string + Content-Disposition: + description: Attachment header with original filename + schema: + type: string + example: 'attachment; filename="document.pdf"' + Content-Length: + description: Size of the file in bytes + schema: + type: integer + '400': + description: Bad request - document status does not allow download (EXPIRED or DELETED) + content: + application/json: + schema: + $ref: '../components/schemas/common.yaml#/failResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '../components/schemas/common.yaml#/failResponse' + '403': + description: Forbidden - user does not have access to this document + content: + application/json: + schema: + $ref: '../components/schemas/common.yaml#/failResponse' + '404': + description: Document not found or file not found in storage + content: + application/json: + schema: + $ref: '../components/schemas/common.yaml#/failResponse' + '500': + description: Internal server error - failed to download from storage + content: + application/json: + schema: + $ref: '../components/schemas/common.yaml#/failResponse' + rebuild_indexes: post: summary: Rebuild document indexes diff --git a/aperag/schema/view_models.py b/aperag/schema/view_models.py index a1eab58b7..0b5ee735d 100644 --- a/aperag/schema/view_models.py +++ b/aperag/schema/view_models.py @@ -14,7 +14,7 @@ # generated by datamodel-codegen: # filename: openapi.merged.yaml -# 
async def download_document(self, user_id: str, collection_id: str, document_id: str):
    """
    Download the original document file.

    Looks up the non-deleted document matching ``document_id``, ``collection_id``
    and ``user_id``, reads ``object_path`` from its ``doc_metadata`` JSON and
    streams the stored object back to the caller.

    Args:
        user_id: ID of the requesting user; the document row must belong to this user.
        collection_id: ID of the collection the document must belong to.
        document_id: ID of the document to download.

    Returns:
        A StreamingResponse carrying Content-Type, Content-Disposition and
        Content-Length headers.

    Raises:
        DocumentNotFoundException: No matching, non-soft-deleted document row exists.
        HTTPException: 400 if the document status is EXPIRED or DELETED;
            404 if ``doc_metadata`` has no ``object_path`` or the object store
            returns nothing for it; 500 if ``doc_metadata`` is not a JSON object
            or reading from the object store fails.
    """
    # Function-scope import so this block does not depend on the module's import list.
    from urllib.parse import quote

    async def _download_document(session):
        # 1. Verify user has access to the document
        stmt = select(db_models.Document).filter(
            db_models.Document.id == document_id,
            db_models.Document.collection_id == collection_id,
            db_models.Document.user == user_id,
            db_models.Document.gmt_deleted.is_(None),  # Only allow downloading non-deleted documents
        )
        result = await session.execute(stmt)
        document = result.scalars().first()
        if not document:
            raise DocumentNotFoundException(document_id)

        # 2. Check document status - only disallow downloading expired/deleted documents.
        # UPLOADED documents can be downloaded (before confirmation, within 24 hours).
        # Once expired or deleted, files may no longer exist in storage.
        if document.status in (db_models.DocumentStatus.EXPIRED, db_models.DocumentStatus.DELETED):
            raise HTTPException(
                status_code=400, detail=f"Document status is {document.status.value}, cannot download"
            )

        # 3. Get object path from doc_metadata
        try:
            metadata = json.loads(document.doc_metadata) if document.doc_metadata else {}
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in doc_metadata for document {document_id}")
            raise HTTPException(status_code=500, detail="Document metadata is corrupted") from e
        if not isinstance(metadata, dict):
            # Valid JSON that is not an object (e.g. a list) has no "object_path" to read.
            logger.error(f"Invalid JSON in doc_metadata for document {document_id}")
            raise HTTPException(status_code=500, detail="Document metadata is corrupted")
        object_path = metadata.get("object_path")
        if not object_path:
            raise HTTPException(status_code=404, detail="Document file not found in storage")

        # 4. Stream file from object store
        try:
            async_obj_store = get_async_object_store()

            # Get file stream and size
            get_result = await async_obj_store.get(object_path)
            if not get_result:
                raise HTTPException(status_code=404, detail="Document file not found in object store")

            data_stream, file_size = get_result

            # Determine content type from filename
            content_type, _ = mimetypes.guess_type(document.name)
            if content_type is None:
                content_type = "application/octet-stream"

            # Build a header-safe Content-Disposition: a quoted printable-ASCII
            # fallback (quotes and backslashes dropped so the quoted-string cannot
            # be broken) plus an RFC 5987 `filename*` parameter that carries the
            # exact original name percent-encoded as UTF-8.
            fallback_name = "".join(
                ch for ch in document.name if " " <= ch <= "~" and ch not in '"\\'
            ) or "download"
            content_disposition = (
                f'attachment; filename="{fallback_name}"; '
                f"filename*=UTF-8''{quote(document.name, safe='')}"
            )

            # Set headers for file download
            headers = {
                "Content-Type": content_type,
                "Content-Disposition": content_disposition,
                "Content-Length": str(file_size),
            }

            logger.info(
                f"User {user_id} downloading document {document_id} ({document.name}) "
                f"from collection {collection_id}, size: {file_size} bytes"
            )

            return StreamingResponse(data_stream, headers=headers)

        except HTTPException:
            # Let deliberate HTTP errors (the 404 above) through unchanged instead
            # of letting the generic handler below rewrite them as a 500.
            raise
        except Exception as e:
            logger.error(f"Failed to download document {document_id} from path {object_path}: {e}", exc_info=True)
            raise HTTPException(status_code=500, detail="Failed to download document from storage") from e

    # Execute query with proper session management
    return await self.db_ops._execute_query(_download_document)
-269,6 +269,21 @@ async def get_document_view( return await document_service.get_document(str(user.id), collection_id, document_id) +@router.get("/collections/{collection_id}/documents/{document_id}/download", tags=["documents"]) +@audit(resource_type="document", api_name="DownloadDocument") +async def download_document_view( + request: Request, + collection_id: str, + document_id: str, + user: User = Depends(required_user), +): + """ + Download the original document file. + Returns the file as a streaming response with appropriate headers. + """ + return await document_service.download_document(str(user.id), collection_id, document_id) + + @router.delete("/collections/{collection_id}/documents/{document_id}", tags=["documents"]) @audit(resource_type="document", api_name="DeleteDocument") async def delete_document_view( diff --git a/docs/design/document_export_design_zh.md b/docs/design/document_export_design_zh.md index 99018e835..655cdcfe5 100644 --- a/docs/design/document_export_design_zh.md +++ b/docs/design/document_export_design_zh.md @@ -25,8 +25,8 @@ │ └──────────────────┘ └──────────────────┘ │ └─────────┬──────────────────────────┬────────────────────────┘ │ │ - │ GET /documents/{id}/download (同步,流式返回) - │ POST /collections/{id}/export (异步,生成下载链接) + │ GET /collections/{id}/documents/{id}/download (同步,流式返回) + │ POST /collections/{id}/export (异步,生成下载链接) ▼ ▼ ┌─────────────────────────────────────────────────────────────┐ │ View Layer │ @@ -111,7 +111,7 @@ | 场景 | API | 模式 | 说明 | |------|-----|------|------| -| **单个文档下载** | `GET /documents/{id}/download` | 同步流式 | 直接返回文件流 | +| **单个文档下载** | `GET /collections/{collection_id}/documents/{id}/download` | 同步流式 | 直接返回文件流 | | **知识库导出** | `POST /collections/{id}/export` | 异步 | 生成后端下载 URL | ## 核心流程详解 @@ -124,26 +124,32 @@ 用户点击"下载"按钮 │ ▼ -GET /api/v1/documents/{document_id}/download +GET /api/v1/collections/{collection_id}/documents/{document_id}/download │ ▼ 后端处理: │ ├─► 验证用户身份(JWT) │ - ├─► 验证文档访问权限 + ├─► 
验证文档访问权限(user、collection_id 匹配) │ - ├─► 查询 Document 记录 + ├─► 查询 Document 记录(过滤软删除文档) │ - ├─► 从 doc_metadata 获取 object_path + ├─► 检查文档状态(只禁止 EXPIRED/DELETED 状态) + │ ├─ UPLOADED: ✅ 允许下载(上传后 24 小时内) + │ ├─ PENDING/RUNNING/COMPLETE/FAILED: ✅ 允许下载(永久) + │ ├─ EXPIRED: ❌ 禁止(文件已被清理) + │ └─ DELETED: ❌ 禁止(用户已删除) + │ + ├─► 从 doc_metadata JSON 获取 object_path │ ├─► 从对象存储读取文件(流式) - │ └─ 路径:user-{user_id}/{collection_id}/{doc_id}/original.pdf + │ └─ 路径:user-{user_id}/{collection_id}/{doc_id}/original.xxx │ └─► 返回 StreamingResponse - ├─ Content-Type: application/octet-stream - ├─ Content-Disposition: attachment; filename="xxx.pdf" - └─ Transfer-Encoding: chunked (流式传输) + ├─ Content-Type: 根据文件扩展名判断(默认 application/octet-stream) + ├─ Content-Disposition: attachment; filename="原始文件名" + └─ Content-Length: 文件大小(从对象存储获取) │ ▼ 文件通过后端流式传输给客户端 @@ -156,7 +162,7 @@ GET /api/v1/documents/{document_id}/download **请求**: ```http -GET /api/v1/documents/{document_id}/download +GET /api/v1/collections/{collection_id}/documents/{document_id}/download Authorization: Bearer {token} ``` @@ -166,11 +172,14 @@ HTTP/1.1 200 OK Content-Type: application/octet-stream Content-Disposition: attachment; filename="user_manual.pdf" Content-Length: 5242880 -Transfer-Encoding: chunked [文件二进制流] ``` +**说明**: +- 实际实现中不使用 `Transfer-Encoding: chunked` 响应头,而是通过 FastAPI 的 `StreamingResponse` 自动处理流式传输 +- `Content-Length` 会从对象存储获取文件大小后设置 + #### 1.3 关键特性 - **流式读取**:从对象存储按块读取(chunk size = 64KB) @@ -179,6 +188,58 @@ Transfer-Encoding: chunked - **超时控制**:设置合理的读取超时(如 30 分钟) - **权限控制**:每次下载都验证用户权限 - **审计日志**:记录下载操作(用户、时间、文档) +- **状态检查**:只禁止下载 EXPIRED/DELETED 状态的文档 + +#### 1.4 文档生命周期与下载可用性 + +**文档状态说明**: + +| 状态 | 说明 | 可下载 | 自动清理 | 触发条件 | +|------|------|--------|----------|----------| +| `UPLOADED` | 已上传,未确认 | ✅ | 是(24小时后) | 用户上传文件 | +| `PENDING` | 已确认,等待处理 | ✅ | 否 | 用户确认文档 | +| `RUNNING` | 正在处理索引 | ✅ | 否 | 后台任务开始处理 | +| `COMPLETE` | 处理完成 | ✅ | 否 | 索引创建成功 | +| `FAILED` | 处理失败 | ✅ | 否 | 索引创建失败 | +| `EXPIRED` | 已过期 | ❌ | - | 自动清理任务 | 
+| `DELETED` | 已删除 | ❌ | - | 用户删除操作 | + +**自动清理机制**: + +``` +定时任务:每 10 分钟运行一次 +清理目标:UPLOADED 状态 且 创建时间 > 24 小时 的文档 +清理操作: + 1. 删除对象存储中的文件(包括所有相关文件) + 2. 将文档状态更新为 EXPIRED + 3. 记录清理日志 + +配置位置:config/celery.py +任务名称:cleanup_expired_documents_task +执行频率:600 秒(10 分钟) +``` + +**设计理念**: +- ✅ **用户友好**:上传后即可下载预览,无需等待确认 +- ✅ **资源优化**:未确认的临时文件自动清理,节省存储空间 +- ✅ **数据安全**:确认后的文档永久保留,不会被自动删除 +- ✅ **清晰提示**:EXPIRED 状态的文档返回明确错误信息 + +**典型使用流程**: + +``` +1. 用户上传文档 + └─► 状态:UPLOADED(可下载,24小时有效期) + +2. 场景 A:用户及时确认(< 24 小时) + └─► 状态:PENDING → RUNNING → COMPLETE + └─► 可永久下载,不会被清理 ✅ + +3. 场景 B:用户未及时确认(> 24 小时) + └─► 自动清理任务执行 + └─► 状态:EXPIRED(无法下载) ❌ + └─► 用户需要重新上传 +``` ### 场景 2: 知识库导出(异步打包) @@ -617,7 +678,7 @@ Celery 任务超时: | 方法 | 路径 | 说明 | 模式 | |------|------|------|------| -| GET | `/documents/{id}/download` | 下载单个文档 | 同步流式 | +| GET | `/collections/{collection_id}/documents/{id}/download` | 下载单个文档 | 同步流式 | | POST | `/collections/{id}/export` | 知识库导出 | 异步 | | GET | `/export-tasks/{id}` | 查询导出任务状态 | - | | GET | `/export-tasks/{id}/download` | 下载导出结果 | 同步流式 | diff --git a/tests/e2e_test/test_document_download.py b/tests/e2e_test/test_document_download.py new file mode 100644 index 000000000..883c8a0d6 --- /dev/null +++ b/tests/e2e_test/test_document_download.py @@ -0,0 +1,153 @@ +# Copyright 2025 ApeCloud, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +End-to-end tests for document download functionality +""" + +import time +from http import HTTPStatus + + +def test_download_document(client, collection): + """Test downloading a document file""" + # Upload a test document + test_content = b"This is a test document for download testing. Hello ApeRAG!" + files = {"files": ("test_download.txt", test_content, "text/plain")} + upload_resp = client.post(f"/api/v1/collections/{collection['id']}/documents", files=files) + assert upload_resp.status_code == HTTPStatus.OK, upload_resp.text + resp_data = upload_resp.json() + assert len(resp_data["items"]) == 1 + doc_id = resp_data["items"][0]["id"] + + # Wait for document to be processed + max_wait = 30 + interval = 2 + for _ in range(max_wait // interval): + get_resp = client.get(f"/api/v1/collections/{collection['id']}/documents/{doc_id}") + assert get_resp.status_code == HTTPStatus.OK, get_resp.text + data = get_resp.json() + status = data.get("status") + if status in ["COMPLETE", "FAILED", "RUNNING"]: + break + time.sleep(interval) + + # Download the document + download_resp = client.get(f"/api/v1/collections/{collection['id']}/documents/{doc_id}/download") + assert download_resp.status_code == HTTPStatus.OK, download_resp.text + + # Verify response headers + assert "content-type" in download_resp.headers + assert "content-disposition" in download_resp.headers + assert "attachment" in download_resp.headers["content-disposition"] + assert "test_download.txt" in download_resp.headers["content-disposition"] + + # Verify content + downloaded_content = download_resp.content + assert downloaded_content == test_content, "Downloaded content should match uploaded content" + + # Cleanup: Delete document + delete_resp = client.delete(f"/api/v1/collections/{collection['id']}/documents/{doc_id}") + assert delete_resp.status_code == HTTPStatus.OK, delete_resp.text + + +def test_download_nonexistent_document(client, collection): + """Test downloading a non-existent document""" + 
fake_doc_id = "doc_nonexistent12345" + download_resp = client.get(f"/api/v1/collections/{collection['id']}/documents/{fake_doc_id}/download") + assert download_resp.status_code == HTTPStatus.NOT_FOUND, download_resp.text + + +def test_download_deleted_document(client, collection): + """Test downloading a deleted document""" + # Upload a test document + test_content = b"This document will be deleted before download." + files = {"files": ("test_deleted.txt", test_content, "text/plain")} + upload_resp = client.post(f"/api/v1/collections/{collection['id']}/documents", files=files) + assert upload_resp.status_code == HTTPStatus.OK, upload_resp.text + resp_data = upload_resp.json() + assert len(resp_data["items"]) == 1 + doc_id = resp_data["items"][0]["id"] + + # Wait for document to be processed + max_wait = 30 + interval = 2 + for _ in range(max_wait // interval): + get_resp = client.get(f"/api/v1/collections/{collection['id']}/documents/{doc_id}") + assert get_resp.status_code == HTTPStatus.OK, get_resp.text + data = get_resp.json() + status = data.get("status") + if status in ["COMPLETE", "FAILED", "RUNNING"]: + break + time.sleep(interval) + + # Delete the document + delete_resp = client.delete(f"/api/v1/collections/{collection['id']}/documents/{doc_id}") + assert delete_resp.status_code == HTTPStatus.OK, delete_resp.text + + # Try to download the deleted document (should fail) + download_resp = client.get(f"/api/v1/collections/{collection['id']}/documents/{doc_id}/download") + assert download_resp.status_code == HTTPStatus.NOT_FOUND, download_resp.text + + +def test_download_pdf_document(client, collection): + """Test downloading a PDF document with correct content type""" + # Create a minimal PDF content (just for testing headers, not a real PDF) + # In real scenario, you would upload an actual PDF file + test_pdf_content = b"%PDF-1.4\n%Test PDF content\n%%EOF" + files = {"files": ("test_document.pdf", test_pdf_content, "application/pdf")} + upload_resp = 
client.post(f"/api/v1/collections/{collection['id']}/documents", files=files) + assert upload_resp.status_code == HTTPStatus.OK, upload_resp.text + resp_data = upload_resp.json() + assert len(resp_data["items"]) == 1 + doc_id = resp_data["items"][0]["id"] + + # Wait for document to be processed + max_wait = 30 + interval = 2 + for _ in range(max_wait // interval): + get_resp = client.get(f"/api/v1/collections/{collection['id']}/documents/{doc_id}") + assert get_resp.status_code == HTTPStatus.OK, get_resp.text + data = get_resp.json() + status = data.get("status") + if status in ["COMPLETE", "FAILED", "RUNNING"]: + break + time.sleep(interval) + + # Download the document + download_resp = client.get(f"/api/v1/collections/{collection['id']}/documents/{doc_id}/download") + assert download_resp.status_code == HTTPStatus.OK, download_resp.text + + # Verify response headers - content type should be PDF + assert "content-type" in download_resp.headers + assert "pdf" in download_resp.headers["content-type"].lower() + assert "content-disposition" in download_resp.headers + assert "test_document.pdf" in download_resp.headers["content-disposition"] + + # Cleanup: Delete document + delete_resp = client.delete(f"/api/v1/collections/{collection['id']}/documents/{doc_id}") + assert delete_resp.status_code == HTTPStatus.OK, delete_resp.text + + +def test_download_unauthorized_access(client, collection): + """Test downloading a document from another user's collection (should fail)""" + # This test assumes there's a way to create documents under different users + # For now, we just test that the endpoint requires authentication + # by attempting to access with wrong collection_id + fake_collection_id = "col_unauthorized123" + fake_doc_id = "doc_unauthorized123" + + download_resp = client.get(f"/api/v1/collections/{fake_collection_id}/documents/{fake_doc_id}/download") + # Should return 404 (not found) or 403 (forbidden) depending on implementation + assert download_resp.status_code 
in [HTTPStatus.NOT_FOUND, HTTPStatus.FORBIDDEN], download_resp.text diff --git a/web/src/api/api.ts b/web/src/api/api.ts index 12b84ac2c..195194231 100644 --- a/web/src/api/api.ts +++ b/web/src/api/api.ts @@ -17,6 +17,7 @@ export * from './apis/audit-api'; export * from './apis/chat-documents-api'; export * from './apis/default-api'; +export * from './apis/documents-api'; export * from './apis/evaluation-api'; export * from './apis/graph-api'; export * from './apis/llmapi'; diff --git a/web/src/api/apis/documents-api.ts b/web/src/api/apis/documents-api.ts new file mode 100644 index 000000000..e07ee9ede --- /dev/null +++ b/web/src/api/apis/documents-api.ts @@ -0,0 +1,179 @@ +/* tslint:disable */ +/* eslint-disable */ +/** + * ApeRAG API + * ApeRAG API Documentation + * + * The version of the OpenAPI document: 1.0.0 + * + * + * NOTE: This class is auto generated by OpenAPI Generator (https://openapi-generator.tech). + * https://openapi-generator.tech + * Do not edit the class manually. + */ + + +import type { Configuration } from '../configuration'; +import type { AxiosPromise, AxiosInstance, RawAxiosRequestConfig } from 'axios'; +import globalAxios from 'axios'; +// Some imports not used depending on template conditions +// @ts-ignore +import { DUMMY_BASE_URL, assertParamExists, setApiKeyToObject, setBasicAuthToObject, setBearerAuthToObject, setOAuthToObject, setSearchParams, serializeDataIfNeeded, toPathString, createRequestFunction } from '../common'; +// @ts-ignore +import { BASE_PATH, COLLECTION_FORMATS, type RequestArgs, BaseAPI, RequiredError, operationServerMap } from '../base'; +// @ts-ignore +import type { FailResponse } from '../models'; +/** + * DocumentsApi - axios parameter creator + * @export + */ +export const DocumentsApiAxiosParamCreator = function (configuration?: Configuration) { + return { + /** + * Download the original document file. Returns the file as a streaming response with appropriate Content-Type and Content-Disposition headers. 
The file is streamed through the backend to support internal network deployments and maintain access control. **Document Lifecycle and Download Availability:** - UPLOADED: Downloadable (temporary status, auto-deleted after 24 hours if not confirmed) - PENDING/RUNNING/COMPLETE/FAILED: Downloadable (permanent, not auto-deleted) - EXPIRED: Not downloadable (file deleted by cleanup task) - DELETED: Not downloadable (soft-deleted by user) **Auto-Cleanup Mechanism:** A scheduled task runs every 10 minutes to clean up documents in UPLOADED status that are older than 24 hours. Once confirmed, documents will never be auto-deleted. + * @summary Download document file + * @param {string} collectionId Collection ID + * @param {string} documentId Document ID + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + downloadDocument: async (collectionId: string, documentId: string, options: RawAxiosRequestConfig = {}): Promise => { + // verify required parameter 'collectionId' is not null or undefined + assertParamExists('downloadDocument', 'collectionId', collectionId) + // verify required parameter 'documentId' is not null or undefined + assertParamExists('downloadDocument', 'documentId', documentId) + const localVarPath = `/collections/{collection_id}/documents/{document_id}/download` + .replace(`{${"collection_id"}}`, encodeURIComponent(String(collectionId))) + .replace(`{${"document_id"}}`, encodeURIComponent(String(documentId))); + // use dummy base URL string because the URL constructor only accepts absolute URLs. 
+ const localVarUrlObj = new URL(localVarPath, DUMMY_BASE_URL); + let baseOptions; + if (configuration) { + baseOptions = configuration.baseOptions; + } + + const localVarRequestOptions = { method: 'GET', ...baseOptions, ...options}; + const localVarHeaderParameter = {} as any; + const localVarQueryParameter = {} as any; + + // authentication BearerAuth required + // http bearer authentication required + await setBearerAuthToObject(localVarHeaderParameter, configuration) + + + + setSearchParams(localVarUrlObj, localVarQueryParameter); + let headersFromBaseOptions = baseOptions && baseOptions.headers ? baseOptions.headers : {}; + localVarRequestOptions.headers = {...localVarHeaderParameter, ...headersFromBaseOptions, ...options.headers}; + + return { + url: toPathString(localVarUrlObj), + options: localVarRequestOptions, + }; + }, + } +}; + +/** + * DocumentsApi - functional programming interface + * @export + */ +export const DocumentsApiFp = function(configuration?: Configuration) { + const localVarAxiosParamCreator = DocumentsApiAxiosParamCreator(configuration) + return { + /** + * Download the original document file. Returns the file as a streaming response with appropriate Content-Type and Content-Disposition headers. The file is streamed through the backend to support internal network deployments and maintain access control. **Document Lifecycle and Download Availability:** - UPLOADED: Downloadable (temporary status, auto-deleted after 24 hours if not confirmed) - PENDING/RUNNING/COMPLETE/FAILED: Downloadable (permanent, not auto-deleted) - EXPIRED: Not downloadable (file deleted by cleanup task) - DELETED: Not downloadable (soft-deleted by user) **Auto-Cleanup Mechanism:** A scheduled task runs every 10 minutes to clean up documents in UPLOADED status that are older than 24 hours. Once confirmed, documents will never be auto-deleted. 
+ * @summary Download document file + * @param {string} collectionId Collection ID + * @param {string} documentId Document ID + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + async downloadDocument(collectionId: string, documentId: string, options?: RawAxiosRequestConfig): Promise<(axios?: AxiosInstance, basePath?: string) => AxiosPromise> { + const localVarAxiosArgs = await localVarAxiosParamCreator.downloadDocument(collectionId, documentId, options); + const localVarOperationServerIndex = configuration?.serverIndex ?? 0; + const localVarOperationServerBasePath = operationServerMap['DocumentsApi.downloadDocument']?.[localVarOperationServerIndex]?.url; + return (axios, basePath) => createRequestFunction(localVarAxiosArgs, globalAxios, BASE_PATH, configuration)(axios, localVarOperationServerBasePath || basePath); + }, + } +}; + +/** + * DocumentsApi - factory interface + * @export + */ +export const DocumentsApiFactory = function (configuration?: Configuration, basePath?: string, axios?: AxiosInstance) { + const localVarFp = DocumentsApiFp(configuration) + return { + /** + * Download the original document file. Returns the file as a streaming response with appropriate Content-Type and Content-Disposition headers. The file is streamed through the backend to support internal network deployments and maintain access control. **Document Lifecycle and Download Availability:** - UPLOADED: Downloadable (temporary status, auto-deleted after 24 hours if not confirmed) - PENDING/RUNNING/COMPLETE/FAILED: Downloadable (permanent, not auto-deleted) - EXPIRED: Not downloadable (file deleted by cleanup task) - DELETED: Not downloadable (soft-deleted by user) **Auto-Cleanup Mechanism:** A scheduled task runs every 10 minutes to clean up documents in UPLOADED status that are older than 24 hours. Once confirmed, documents will never be auto-deleted. 
+ * @summary Download document file + * @param {DocumentsApiDownloadDocumentRequest} requestParameters Request parameters. + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + downloadDocument(requestParameters: DocumentsApiDownloadDocumentRequest, options?: RawAxiosRequestConfig): AxiosPromise { + return localVarFp.downloadDocument(requestParameters.collectionId, requestParameters.documentId, options).then((request) => request(axios, basePath)); + }, + }; +}; + +/** + * DocumentsApi - interface + * @export + * @interface DocumentsApi + */ +export interface DocumentsApiInterface { + /** + * Download the original document file. Returns the file as a streaming response with appropriate Content-Type and Content-Disposition headers. The file is streamed through the backend to support internal network deployments and maintain access control. **Document Lifecycle and Download Availability:** - UPLOADED: Downloadable (temporary status, auto-deleted after 24 hours if not confirmed) - PENDING/RUNNING/COMPLETE/FAILED: Downloadable (permanent, not auto-deleted) - EXPIRED: Not downloadable (file deleted by cleanup task) - DELETED: Not downloadable (soft-deleted by user) **Auto-Cleanup Mechanism:** A scheduled task runs every 10 minutes to clean up documents in UPLOADED status that are older than 24 hours. Once confirmed, documents will never be auto-deleted. + * @summary Download document file + * @param {DocumentsApiDownloadDocumentRequest} requestParameters Request parameters. + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DocumentsApiInterface + */ + downloadDocument(requestParameters: DocumentsApiDownloadDocumentRequest, options?: RawAxiosRequestConfig): AxiosPromise; + +} + +/** + * Request parameters for downloadDocument operation in DocumentsApi. 
+ * @export + * @interface DocumentsApiDownloadDocumentRequest + */ +export interface DocumentsApiDownloadDocumentRequest { + /** + * Collection ID + * @type {string} + * @memberof DocumentsApiDownloadDocument + */ + readonly collectionId: string + + /** + * Document ID + * @type {string} + * @memberof DocumentsApiDownloadDocument + */ + readonly documentId: string +} + +/** + * DocumentsApi - object-oriented interface + * @export + * @class DocumentsApi + * @extends {BaseAPI} + */ +export class DocumentsApi extends BaseAPI implements DocumentsApiInterface { + /** + * Download the original document file. Returns the file as a streaming response with appropriate Content-Type and Content-Disposition headers. The file is streamed through the backend to support internal network deployments and maintain access control. **Document Lifecycle and Download Availability:** - UPLOADED: Downloadable (temporary status, auto-deleted after 24 hours if not confirmed) - PENDING/RUNNING/COMPLETE/FAILED: Downloadable (permanent, not auto-deleted) - EXPIRED: Not downloadable (file deleted by cleanup task) - DELETED: Not downloadable (soft-deleted by user) **Auto-Cleanup Mechanism:** A scheduled task runs every 10 minutes to clean up documents in UPLOADED status that are older than 24 hours. Once confirmed, documents will never be auto-deleted. + * @summary Download document file + * @param {DocumentsApiDownloadDocumentRequest} requestParameters Request parameters. + * @param {*} [options] Override http request option. 
+ * @throws {RequiredError} + * @memberof DocumentsApi + */ + public downloadDocument(requestParameters: DocumentsApiDownloadDocumentRequest, options?: RawAxiosRequestConfig) { + return DocumentsApiFp(this.configuration).downloadDocument(requestParameters.collectionId, requestParameters.documentId, options).then((request) => request(this.axios, this.basePath)); + } +} + diff --git a/web/src/api/openapi.merged.yaml b/web/src/api/openapi.merged.yaml index 10c12efe1..b95f43b38 100644 --- a/web/src/api/openapi.merged.yaml +++ b/web/src/api/openapi.merged.yaml @@ -1005,6 +1005,93 @@ paths: application/json: schema: $ref: '#/components/schemas/failResponse' + /collections/{collection_id}/documents/{document_id}/download: + get: + summary: Download document file + description: | + Download the original document file. + Returns the file as a streaming response with appropriate Content-Type and Content-Disposition headers. + The file is streamed through the backend to support internal network deployments and maintain access control. + + **Document Lifecycle and Download Availability:** + - UPLOADED: Downloadable (temporary status, auto-deleted after 24 hours if not confirmed) + - PENDING/RUNNING/COMPLETE/FAILED: Downloadable (permanent, not auto-deleted) + - EXPIRED: Not downloadable (file deleted by cleanup task) + - DELETED: Not downloadable (soft-deleted by user) + + **Auto-Cleanup Mechanism:** + A scheduled task runs every 10 minutes to clean up documents in UPLOADED status that are older than 24 hours. + Once confirmed, documents will never be auto-deleted. 
+ operationId: download_document + tags: + - documents + security: + - BearerAuth: [] + parameters: + - name: collection_id + in: path + required: true + schema: + type: string + description: Collection ID + - name: document_id + in: path + required: true + schema: + type: string + description: Document ID + responses: + '200': + description: Document file stream + content: + application/octet-stream: + schema: + type: string + format: binary + headers: + Content-Type: + description: MIME type of the document (e.g., application/pdf, text/plain) + schema: + type: string + Content-Disposition: + description: Attachment header with original filename + schema: + type: string + example: attachment; filename="document.pdf" + Content-Length: + description: Size of the file in bytes + schema: + type: integer + '400': + description: Bad request - document status does not allow download (EXPIRED or DELETED) + content: + application/json: + schema: + $ref: '#/components/schemas/failResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/failResponse' + '403': + description: Forbidden - user does not have access to this document + content: + application/json: + schema: + $ref: '#/components/schemas/failResponse' + '404': + description: Document not found or file not found in storage + content: + application/json: + schema: + $ref: '#/components/schemas/failResponse' + '500': + description: Internal server error - failed to download from storage + content: + application/json: + schema: + $ref: '#/components/schemas/failResponse' /collections/{collection_id}/documents/upload: post: summary: Upload a single document