diff --git a/services/storage/src/simcore_service_storage/modules/db/file_meta_data.py b/services/storage/src/simcore_service_storage/modules/db/file_meta_data.py
index 6b1c0d1da109..fa7f4050e17e 100644
--- a/services/storage/src/simcore_service_storage/modules/db/file_meta_data.py
+++ b/services/storage/src/simcore_service_storage/modules/db/file_meta_data.py
@@ -2,7 +2,7 @@
 import datetime
 from collections.abc import AsyncGenerator
 from pathlib import Path
-from typing import TypeAlias
+from typing import Annotated, TypeAlias
 
 import sqlalchemy as sa
 from models_library.basic_types import SHA256Str
@@ -10,7 +10,7 @@
 from models_library.projects_nodes_io import NodeID, SimcoreS3FileID
 from models_library.users import UserID
 from models_library.utils.fastapi_encoders import jsonable_encoder
-from pydantic import BaseModel
+from pydantic import BaseModel, Field, validate_call
 from simcore_postgres_database.storage_models import file_meta_data
 from simcore_postgres_database.utils_repos import (
     pass_or_acquire_connection,
@@ -35,16 +35,15 @@
 
 
 class _PathsCursorParameters(BaseModel):
+    # NOTE: this is a cursor do not put things that can grow unbounded as this goes then through REST APIs or such
     offset: int
     file_prefix: Path | None
-    project_ids: list[ProjectID] | None
     partial: bool
 
 
 def _init_pagination(
     cursor: GenericCursor | None,
     *,
-    filter_by_project_ids: list[ProjectID] | None,
     filter_by_file_prefix: Path | None,
     is_partial_prefix: bool,
 ) -> _PathsCursorParameters:
@@ -53,7 +52,6 @@ def _init_pagination(
     return _PathsCursorParameters(
         offset=0,
         file_prefix=filter_by_file_prefix,
-        project_ids=filter_by_project_ids,
         partial=is_partial_prefix,
     )
 
@@ -229,11 +227,14 @@ async def try_get_directory(
             return None
         return None
 
+    @validate_call(config={"arbitrary_types_allowed": True})
     async def list_child_paths(
         self,
         *,
         connection: AsyncConnection | None = None,
-        filter_by_project_ids: list[ProjectID] | None,
+        filter_by_project_ids: Annotated[
+            list[ProjectID] | None, Field(max_length=10000)
+        ],
         filter_by_file_prefix: Path | None,
         cursor: GenericCursor | None,
         limit: int,
@@ -241,11 +242,13 @@ async def list_child_paths(
     ) -> tuple[list[PathMetaData], GenericCursor | None, TotalChildren]:
         """returns a list of FileMetaDataAtDB that are one level deep.
         e.g. when no filter is used, these are top level objects
+
+        NOTE: if filter_by_project_ids is huge, this will raise ValidationError and someone needs to fix it!
+        Maybe using a DB join
         """
 
         cursor_params = _init_pagination(
             cursor,
-            filter_by_project_ids=filter_by_project_ids,
             filter_by_file_prefix=filter_by_file_prefix,
             is_partial_prefix=is_partial_prefix,
         )
@@ -278,9 +281,9 @@ async def list_child_paths(
                     file_meta_data.c.file_id.like(search_prefix),
                     (
                         file_meta_data.c.project_id.in_(
-                            [f"{_}" for _ in cursor_params.project_ids]
+                            [f"{_}" for _ in filter_by_project_ids]
                         )
-                        if cursor_params.project_ids
+                        if filter_by_project_ids
                         else True
                     ),
                 )
@@ -303,9 +306,9 @@ async def list_child_paths(
             )
             .where(
                 file_meta_data.c.project_id.in_(
-                    [f"{_}" for _ in cursor_params.project_ids]
+                    [f"{_}" for _ in filter_by_project_ids]
                 )
-                if cursor_params.project_ids
+                if filter_by_project_ids
                 else True
             )
             .cte("ranked_files")
diff --git a/services/storage/src/simcore_service_storage/utils/simcore_s3_dsm_utils.py b/services/storage/src/simcore_service_storage/utils/simcore_s3_dsm_utils.py
index d87d528bcb90..0d3150e10894 100644
--- a/services/storage/src/simcore_service_storage/utils/simcore_s3_dsm_utils.py
+++ b/services/storage/src/simcore_service_storage/utils/simcore_s3_dsm_utils.py
@@ -1,3 +1,4 @@
+import logging
 from contextlib import suppress
 from pathlib import Path
 from typing import TypeAlias
@@ -31,6 +32,8 @@
 from ..modules.db.projects import ProjectRepository
 from .utils import convert_db_to_model
 
+_logger = logging.getLogger(__name__)
+
 
 async def _list_all_files_in_folder(
     *,
@@ -250,6 +253,11 @@ async def list_child_paths_from_s3(
     """
     objects_cursor = None
     if cursor is not None:
+        _logger.debug(
+            "Using cursor for listing child paths in S3 for filter '%s': %s",
+            file_filter,
+            cursor,
+        )
         cursor_params = json_loads(cursor)
         assert cursor_params["file_filter"] == f"{file_filter}"  # nosec
         objects_cursor = cursor_params["objects_next_cursor"]
@@ -277,6 +285,11 @@ async def list_child_paths_from_s3(
     ]
     next_cursor = None
     if objects_next_cursor:
+        _logger.debug(
+            "Next cursor for listing child paths in S3 for filter '%s': %s",
+            file_filter,
+            objects_next_cursor,
+        )
         next_cursor = json_dumps(
             {
                 "file_filter": f"{file_filter}",
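
For context, not part of the patch itself: the first file relies on pydantic's validate_call so that the Annotated[..., Field(max_length=10000)] constraint on filter_by_project_ids is enforced at call time instead of growing the pagination cursor. Below is a minimal sketch of that pattern, assuming pydantic v2. The names fetch_paths and filter_by_ids are illustrative only; the arbitrary_types_allowed config used in the patch is needed there only because of non-pydantic argument types such as AsyncConnection.

from typing import Annotated

from pydantic import Field, ValidationError, validate_call


@validate_call
def fetch_paths(
    *, filter_by_ids: Annotated[list[int] | None, Field(max_length=3)]
) -> int:
    # validate_call checks the arguments before the body runs,
    # so an oversized list never reaches the query-building code
    return len(filter_by_ids or [])


print(fetch_paths(filter_by_ids=[1, 2]))  # -> 2
try:
    fetch_paths(filter_by_ids=[1, 2, 3, 4])  # exceeds max_length=3
except ValidationError as exc:
    print(f"rejected with {exc.error_count()} validation error(s)")

An oversized argument list is rejected with a ValidationError before the function body (and hence the database query) runs, which is the failure mode the NOTE added to the docstring warns about.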