
Commit f947d55

Register local files (#816)
* Create storage.md
* add stubs for uploading local files
* file remove and download endpoints
* fix pydantic model
* give endpoint a distinct name
* add important parentheses
* run codegen
* hide share button if not Minio storage
* Hide version history tab
* hide version chip if local, include in details
* Update DatasetsService.ts
* ignore version when downloading metadata for local files
* add additional check for v0
* remove logs
* merge and code updates
* codegen
* codegen

Co-authored-by: Chen Wang <[email protected]>
1 parent 708f89a commit f947d55

15 files changed: +419 additions, -94 deletions

backend/app/config.py

Lines changed: 4 additions & 1 deletion
@@ -36,7 +36,10 @@ class Settings(BaseSettings):
     MINIO_EXPIRES: int = 3600  # seconds
     MINIO_SECURE: str = "False"  # http vs https

-    # keycloak server
+    # Files in the listed directories can be added to Clowder without copying them elsewhere
+    LOCAL_WHITELIST: List[str] = []
+
+    # Keycloak server
     auth_base = "http://localhost:8080"
     auth_realm = "clowder"
     auth_client_id = "clowder2-backend"
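The new LOCAL_WHITELIST field is an ordinary settings field, so it can be supplied through the environment. A minimal sketch, not part of this commit, assuming pydantic v1's BaseSettings (which parses complex fields such as List[str] from JSON); the directory names are hypothetical:

```python
# Sketch: overriding LOCAL_WHITELIST via an environment variable.
# pydantic v1's BaseSettings parses a List[str] field from a JSON string.
import os
from typing import List

from pydantic import BaseSettings


class Settings(BaseSettings):
    LOCAL_WHITELIST: List[str] = []


os.environ["LOCAL_WHITELIST"] = '["/data/shared", "/mnt/instruments"]'  # hypothetical paths
print(Settings().LOCAL_WHITELIST)  # ['/data/shared', '/mnt/instruments']
```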

backend/app/models/files.py

Lines changed: 23 additions & 0 deletions
@@ -1,4 +1,5 @@
 from datetime import datetime
+from enum import Enum
 from typing import Optional, List

 from beanie import Document, View, PydanticObjectId

@@ -9,6 +10,16 @@
 from app.models.users import UserOut


+class StorageType(str, Enum):
+    """Depending on the StorageType,the file may need different properties such as local path or URL.
+    Also, some StorageTypes do not support versioning or anonymous sharing."""
+
+    MINIO = "minio"
+    LOCAL = "local"
+    REMOTE = "remote"
+    AWS = "aws"
+
+
 class ContentType(BaseModel):
     """This model describes the content type of any type of file(File or Visualization data) uploaded to Clowder. A typical example is "text/plain" for .txt.
     In Clowder v1 extractors, "text/*" syntax is acceptable for wildcard matches. To support this, the content type is

@@ -42,6 +53,12 @@ class FileIn(FileBase):
     pass


+class LocalFileIn(BaseModel):
+    """Used when adding a file from a local disk."""
+
+    path: str
+
+
 class FileDB(Document, FileBase):
     creator: UserOut
     created: datetime = Field(default_factory=datetime.utcnow)

@@ -54,10 +71,16 @@ class FileDB(Document, FileBase):
     bytes: int = 0
     content_type: ContentType = ContentType()
     thumbnail_id: Optional[PydanticObjectId] = None
+    storage_type: StorageType = StorageType.MINIO
+    storage_path: Optional[str]  # store URL or file path depending on storage_type

     class Settings:
         name = "files"

+    class Config:
+        # required for Enum to properly work
+        use_enum_values = True
+

 class FileDBViewList(View, FileBase):
     id: PydanticObjectId = Field(None, alias="_id")  # necessary for Views
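A small sketch of how these models behave (assuming the module is importable as in the diff): LocalFileIn carries only the path to register, and because FileDB sets use_enum_values, the storage type is persisted as its plain string value.

```python
# Sketch: the request body for registering a local file, and the enum's stored form.
from app.models.files import LocalFileIn, StorageType  # as added in this commit

body = LocalFileIn(path="/data/shared/run42/results.csv")  # hypothetical whitelisted path
print(body.dict())              # {'path': '/data/shared/run42/results.csv'}
print(StorageType.LOCAL.value)  # 'local' -- the string stored on FileDB since use_enum_values is set
```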

backend/app/routers/datasets.py

Lines changed: 61 additions & 2 deletions
@@ -47,15 +47,15 @@
     DatasetDBViewList,
     DatasetStatus,
 )
-from app.models.files import FileOut, FileDB, FileDBViewList
+from app.models.files import FileOut, FileDB, FileDBViewList, LocalFileIn, StorageType
 from app.models.folders import FolderOut, FolderIn, FolderDB, FolderDBViewList
 from app.models.metadata import MetadataDB
 from app.models.pyobjectid import PyObjectId
 from app.models.thumbnails import ThumbnailDB
 from app.models.users import UserOut
 from app.rabbitmq.listeners import submit_dataset_job
 from app.routers.authentication import get_admin
-from app.routers.files import add_file_entry, remove_file_entry
+from app.routers.files import add_file_entry, add_local_file_entry, remove_file_entry
 from app.search.connect import (
     delete_document_by_id,
 )

@@ -446,6 +446,10 @@ async def _delete_nested_folders(parent_folder_id):
     raise HTTPException(status_code=404, detail=f"Dataset {dataset_id} not found")


+# new endpoint for /{dataset_id}/local_files
+# new endpoint for /{dataset_id}/remote_files
+
+
 @router.post("/{dataset_id}/files", response_model=FileOut)
 async def save_file(
     dataset_id: str,

@@ -534,6 +538,61 @@ async def save_files(
     raise HTTPException(status_code=404, detail=f"Dataset {dataset_id} not found")


+@router.post("/{dataset_id}/local_files", response_model=FileOut)
+async def save_local_file(
+    localfile_in: LocalFileIn,
+    dataset_id: str,
+    folder_id: Optional[str] = None,
+    user=Depends(get_current_user),
+    es=Depends(dependencies.get_elasticsearchclient),
+    rabbitmq_client: BlockingChannel = Depends(dependencies.get_rabbitmq),
+    allow: bool = Depends(Authorization("uploader")),
+):
+    if (dataset := await DatasetDB.get(PydanticObjectId(dataset_id))) is not None:
+        if user is None:
+            raise HTTPException(
+                status_code=401, detail=f"User not found. Session might have expired."
+            )
+
+        # Check to make sure file is cleared to proceed
+        cleared = False
+        for wpath in settings.LOCAL_WHITELIST:
+            if localfile_in.path.startswith(wpath):
+                cleared = True
+        if not cleared:
+            raise HTTPException(
+                status_code=500,
+                detail=f"File is not located in a whitelisted directory.",
+            )
+
+        (dirname, filename) = os.path.split(localfile_in.path)
+        new_file = FileDB(
+            name=filename,
+            creator=user,
+            dataset_id=dataset.id,
+            storage_type=StorageType.LOCAL,
+            storage_path=localfile_in.path,
+            bytes=os.path.getsize(localfile_in.path),
+        )
+
+        if folder_id is not None:
+            if (folder := await FolderDB.get(PydanticObjectId(folder_id))) is not None:
+                new_file.folder_id = folder.id
+            else:
+                raise HTTPException(
+                    status_code=404, detail=f"Folder {folder_id} not found"
+                )
+
+        await add_local_file_entry(
+            new_file,
+            user,
+            es,
+            rabbitmq_client,
+        )
+        return new_file.dict()
+    raise HTTPException(status_code=404, detail=f"Dataset {dataset_id} not found")
+
+
 @router.post("/createFromZip", response_model=DatasetOut)
 async def create_dataset_from_zip(
     user=Depends(get_current_user),
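A sketch of calling the new endpoint from a client. The base URL, /api/v2 prefix, bearer token, and IDs below are assumptions rather than part of the commit; the request body matches LocalFileIn (a single "path" field), and the path must sit under one of the LOCAL_WHITELIST directories or the request is rejected.

```python
# Sketch: registering an existing file on the server's disk with a dataset.
import requests

BASE = "http://localhost:8000/api/v2"          # hypothetical deployment URL and prefix
headers = {"Authorization": "Bearer <token>"}  # placeholder token

resp = requests.post(
    f"{BASE}/datasets/<dataset_id>/local_files",       # <dataset_id> is a placeholder
    json={"path": "/data/shared/run42/results.csv"},   # must be under LOCAL_WHITELIST
    headers=headers,
)
print(resp.status_code)  # 200 with a FileOut body; 500 if the path is not whitelisted
```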

backend/app/routers/files.py

Lines changed: 98 additions & 44 deletions
@@ -16,7 +16,7 @@
     File,
     UploadFile,
 )
-from fastapi.responses import StreamingResponse
+from fastapi.responses import StreamingResponse, FileResponse
 from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
 from minio import Minio
 from pika.adapters.blocking_connection import BlockingChannel

@@ -25,15 +25,10 @@
 from app.config import settings
 from app.deps.authorization_deps import FileAuthorization
 from app.keycloak_auth import get_current_user, get_token
-from app.models.files import (
-    FileOut,
-    FileVersion,
-    FileDB,
-    FileVersionDB,
-)
+from app.models.files import FileOut, FileVersion, FileDB, FileVersionDB, StorageType
 from app.models.metadata import MetadataDB
-from app.models.users import UserOut
 from app.models.thumbnails import ThumbnailDB
+from app.models.users import UserOut
 from app.rabbitmq.listeners import submit_file_job, EventListenerJobDB
 from app.routers.feeds import check_feed_listeners
 from app.routers.utils import get_content_type

@@ -155,17 +150,41 @@ async def add_file_entry(
     )


+async def add_local_file_entry(
+    new_file: FileDB,
+    user: UserOut,
+    es: Elasticsearch,
+    rabbitmq_client: BlockingChannel,
+    content_type: Optional[str] = None,
+):
+    """Insert FileDB object into MongoDB (makes Clowder ID). Bytes are not stored in DB and versioning not supported
+    for local files."""
+
+    content_type_obj = get_content_type(new_file.name, content_type)
+    new_file.content_type = content_type_obj
+    await new_file.insert()
+
+    # Add entry to the file index
+    await index_file(es, FileOut(**new_file.dict()))
+
+    # TODO - timing issue here, check_feed_listeners needs to happen asynchronously.
+    time.sleep(1)
+
+    # Submit file job to any qualifying feeds
+    await check_feed_listeners(
+        es,
+        FileOut(**new_file.dict()),
+        user,
+        rabbitmq_client,
+    )
+
+
 # TODO: Move this to MongoDB middle layer
 async def remove_file_entry(
     file_id: Union[str, ObjectId], fs: Minio, es: Elasticsearch
 ):
     """Remove FileDB object into MongoDB, Minio, and associated metadata and version information."""
     # TODO: Deleting individual versions will require updating version_id in mongo, or deleting entire document
-
-    # Check all connection and abort if any one of them is not available
-    if fs is None or es is None:
-        raise HTTPException(status_code=503, detail="Service not available")
-        return
     fs.remove_object(settings.MINIO_BUCKET_NAME, str(file_id))
     # delete from elasticsearch
     delete_document_by_id(es, settings.elasticsearch_index, str(file_id))

@@ -175,6 +194,18 @@ async def remove_file_entry(
     await FileVersionDB.find(FileVersionDB.file_id == ObjectId(file_id)).delete()


+async def remove_local_file_entry(file_id: Union[str, ObjectId], es: Elasticsearch):
+    """Remove FileDB object into MongoDB, Minio, and associated metadata and version information."""
+    # TODO: Deleting individual versions will require updating version_id in mongo, or deleting entire document
+    # delete from elasticsearch
+    delete_document_by_id(es, settings.elasticsearch_index, str(file_id))
+    if (file := await FileDB.get(PydanticObjectId(file_id))) is not None:
+        # TODO: delete from disk - should this be allowed if Clowder didn't originally write the file?
+        # os.path.remove(file.storage_path)
+        await file.delete()
+    await MetadataDB.find(MetadataDB.resource.resource_id == ObjectId(file_id)).delete()
+
+
 @router.put("/{file_id}", response_model=FileOut)
 async def update_file(
     file_id: str,

@@ -275,33 +306,53 @@ async def download_file(
 ):
     # If file exists in MongoDB, download from Minio
     if (file := await FileDB.get(PydanticObjectId(file_id))) is not None:
-        if version is not None:
-            # Version is specified, so get the minio ID from versions table if possible
-            file_vers = await FileVersionDB.find_one(
-                FileVersionDB.file_id == ObjectId(file_id),
-                FileVersionDB.version_num == version,
-            )
-            if file_vers is not None:
-                vers = FileVersion(**file_vers.dict())
-                content = fs.get_object(
-                    settings.MINIO_BUCKET_NAME, file_id, version_id=vers.version_id
+        if file.storage_type == StorageType.MINIO:
+            if version is not None:
+                # Version is specified, so get the minio ID from versions table if possible
+                file_vers = await FileVersionDB.find_one(
+                    FileVersionDB.file_id == ObjectId(file_id),
+                    FileVersionDB.version_num == version,
                 )
+                if file_vers is not None:
+                    vers = FileVersion(**file_vers.dict())
+                    content = fs.get_object(
+                        settings.MINIO_BUCKET_NAME, file_id, version_id=vers.version_id
+                    )
+                else:
+                    raise HTTPException(
+                        status_code=404,
+                        detail=f"File {file_id} version {version} not found",
+                    )
             else:
-                raise HTTPException(
-                    status_code=404,
-                    detail=f"File {file_id} version {version} not found",
-                )
+                # If no version specified, get latest version directly
+                content = fs.get_object(settings.MINIO_BUCKET_NAME, file_id)
+
+            # Get content type & open file stream
+            response = StreamingResponse(
+                content.stream(settings.MINIO_UPLOAD_CHUNK_SIZE)
+            )
+            response.headers["Content-Disposition"] = (
+                "attachment; filename=%s" % file.name
+            )
+
+        elif file.storage_type == StorageType.LOCAL:
+            response = FileResponse(
+                path=file.storage_path,
+                filename=file.name,
+                media_type=file.content_type.content_type,
+            )
+
         else:
-            # If no version specified, get latest version directly
-            content = fs.get_object(settings.MINIO_BUCKET_NAME, file_id)
+            raise HTTPException(
+                status_code=400, detail=f"Unable to download {file_id}."
+            )
+
+        if response:
+            if increment:
+                # Increment download count
+                await file.update(Inc({FileDB.downloads: 1}))
+            return response

-        # Get content type & open file stream
-        response = StreamingResponse(content.stream(settings.MINIO_UPLOAD_CHUNK_SIZE))
-        response.headers["Content-Disposition"] = "attachment; filename=%s" % file.name
-        if increment:
-            # Increment download count
-            await file.update(Inc({FileDB.downloads: 1}))
-        return response
     else:
         raise HTTPException(status_code=404, detail=f"File {file_id} not found")

@@ -365,8 +416,11 @@ async def delete_file(
     es: Elasticsearch = Depends(dependencies.get_elasticsearchclient),
     allow: bool = Depends(FileAuthorization("editor")),
 ):
-    if (await FileDB.get(PydanticObjectId(file_id))) is not None:
-        await remove_file_entry(file_id, fs, es)
+    if (file := await FileDB.get(PydanticObjectId(file_id))) is not None:
+        if file.storage_type == StorageType.LOCAL:
+            await remove_local_file_entry(file_id, es)
+        else:
+            await remove_file_entry(file_id, fs, es)
         return {"deleted": file_id}
     else:
         raise HTTPException(status_code=404, detail=f"File {file_id} not found")

@@ -415,13 +469,13 @@ async def get_file_versions(
     limit: int = 20,
     allow: bool = Depends(FileAuthorization("viewer")),
 ):
-    file = await FileDB.get(PydanticObjectId(file_id))
-    if file is not None:
+    if (file := await FileDB.get(PydanticObjectId(file_id))) is not None:
         mongo_versions = []
-        async for ver in FileVersionDB.find(
-            FileVersionDB.file_id == ObjectId(file_id)
-        ).sort(-FileVersionDB.created).skip(skip).limit(limit):
-            mongo_versions.append(FileVersion(**ver.dict()))
+        if file.storage_type == StorageType.MINIO:
+            async for ver in FileVersionDB.find(
+                FileVersionDB.file_id == ObjectId(file_id)
+            ).sort(-FileVersionDB.created).skip(skip).limit(limit):
+                mongo_versions.append(FileVersion(**ver.dict()))
         return mongo_versions

     raise HTTPException(status_code=404, detail=f"File {file_id} not found")
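On the client side the download URL is unchanged; the branch on storage_type is server-side only, so the same request streams a Minio object or serves the file straight from disk. A minimal sketch, assuming the same hypothetical base URL, token, and route path as above (not confirmed by this diff):

```python
# Sketch: downloading a registered file; the server picks StreamingResponse (Minio)
# or FileResponse (local disk) based on the stored storage_type.
import requests

BASE = "http://localhost:8000/api/v2"          # hypothetical deployment URL and prefix
headers = {"Authorization": "Bearer <token>"}  # placeholder token

resp = requests.get(f"{BASE}/files/<file_id>", headers=headers)  # <file_id> is a placeholder
resp.raise_for_status()
with open("results.csv", "wb") as out:
    out.write(resp.content)
```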

backend/app/routers/metadata_files.py

Lines changed: 4 additions & 4 deletions
@@ -54,7 +54,7 @@ async def _build_metadata_db_obj(
     if version is None:
         # Validate specified version, or use latest by default
         file_version = metadata_in.file_version
-        if file_version is not None:
+        if file_version is not None and file_version > 0:
            if (
                 await FileVersionDB.find_one(
                     FileVersionDB.file_id == file.id,

@@ -117,10 +117,10 @@ async def add_file_metadata(
     """
     if (file := await FileDB.get(PydanticObjectId(file_id))) is not None:
         current_file_version = file.version_num
-        # if metadata does not already specify a file version
-        # change metadata_in file version to match the current file version
+        # if metadata does not already specify a version, change metadata_in version to match current file version
         if metadata_in.file_version is None:
             metadata_in.file_version = current_file_version
+
         # If dataset already has metadata using this definition, don't allow duplication
         definition = metadata_in.definition
         if definition is not None:

@@ -340,7 +340,7 @@ async def get_file_metadata(

     # Validate specified version, or use latest by default
     if not all_versions:
-        if version is not None:
+        if version is not None and version > 0:
             if (
                 await FileVersionDB.find_one(
                     FileVersionDB.file_id == ObjectId(file_id),
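The `> 0` guard exists because locally registered files have no FileVersionDB records (per the commit notes: version is ignored for local files and an additional v0 check was added), so a version of 0 or None should skip the lookup rather than fail. A minimal sketch of the predicate's behavior:

```python
# Sketch of the guard added above: only positive version numbers trigger a
# FileVersionDB lookup; None (unspecified) and 0 (local file, unversioned) skip it.
from typing import Optional


def needs_version_lookup(version: Optional[int]) -> bool:
    return version is not None and version > 0


assert needs_version_lookup(None) is False
assert needs_version_lookup(0) is False
assert needs_version_lookup(2) is True
```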

backend/app/routers/utils.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ def get_content_type(
     filename: str,
     content_type: Optional[str] = None,
 ):
-    """Returns ContentType object given a content_tyoe, also guessed the content_type if none is provided
+    """Returns ContentType object given a content_type, also guessed the content_type if none is provided

     Arguments:
         content_type: content_type of a file to be uploaded
