Commit 6c3da0b

feat: add S3 search_files method
1 parent 8e85054 commit 6c3da0b

2 files changed: +257 -1 lines changed

services/storage/src/simcore_service_storage/simcore_s3_dsm.py

Lines changed: 132 additions & 1 deletion
```diff
@@ -1,9 +1,10 @@
 import contextlib
 import datetime
+import fnmatch
 import logging
 import tempfile
 import urllib.parse
-from collections.abc import Coroutine
+from collections.abc import AsyncGenerator, Coroutine
 from contextlib import suppress
 from dataclasses import dataclass
 from pathlib import Path
@@ -959,6 +960,136 @@ async def search_owned_files(
             resolved_fmds.append(convert_db_to_model(updated_fmd))
         return resolved_fmds
 
+    def _create_file_metadata_from_s3_object(
+        self, s3_obj: S3MetaData, user_id: UserID
+    ) -> FileMetaData | None:
+        """Create FileMetaData from S3 object, return None if invalid."""
+        try:
+            return FileMetaData.from_simcore_node(
+                user_id=user_id,
+                file_id=TypeAdapter(SimcoreS3FileID).validate_python(s3_obj.object_key),
+                bucket=self.simcore_bucket_name,
+                location_id=self.get_location_id(),
+                location_name=self.get_location_name(),
+                sha256_checksum=None,
+                file_size=s3_obj.size,
+                last_modified=s3_obj.last_modified,
+                entity_tag=s3_obj.e_tag,
+            )
+        except (ValidationError, ValueError):
+            return None
+
+    async def _process_s3_page_results(
+        self,
+        current_page_results: list[FileMetaData],
+    ) -> list[FileMetaData]:
+        current_page_results.sort(
+            key=lambda x: x.last_modified
+            or datetime.datetime.min.replace(tzinfo=datetime.UTC),
+            reverse=True,
+        )
+
+        result_project_ids = list(
+            {
+                result.project_id
+                for result in current_page_results
+                if result.project_id is not None
+            }
+        )
+
+        if result_project_ids:
+            current_page_results = await _add_frontend_needed_data(
+                get_db_engine(self.app),
+                project_ids=result_project_ids,
+                data=current_page_results,
+            )
+
+        return current_page_results
+
+    async def _search_project_s3_files(
+        self,
+        proj_id: ProjectID,
+        filename_pattern: str,
+        user_id: UserID,
+        items_per_page: NonNegativeInt,
+    ) -> AsyncGenerator[list[FileMetaData], None]:
+        """Search S3 files in a specific project and yield results page by page."""
+        s3_client = get_s3_client(self.app)
+        min_parts_for_valid_s3_object = 2
+        current_page_results: list[FileMetaData] = []
+
+        try:
+            async for s3_objects in s3_client.list_objects_paginated(
+                bucket=self.simcore_bucket_name,
+                prefix=f"{proj_id}/",
+                items_per_page=items_per_page * 2,
+            ):
+                for s3_obj in s3_objects:
+                    filename = Path(s3_obj.object_key).name
+
+                    if (
+                        fnmatch.fnmatch(filename, filename_pattern)
+                        and len(s3_obj.object_key.split("/"))
+                        >= min_parts_for_valid_s3_object
+                    ):
+                        file_meta = self._create_file_metadata_from_s3_object(
+                            s3_obj, user_id
+                        )
+                        if file_meta:
+                            current_page_results.append(file_meta)
+
+                    if len(current_page_results) >= items_per_page:
+                        processed_results = await self._process_s3_page_results(
+                            current_page_results
+                        )
+                        yield processed_results
+                        current_page_results = []
+
+            if current_page_results:
+                processed_results = await self._process_s3_page_results(
+                    current_page_results
+                )
+                yield processed_results
+
+        except S3KeyNotFoundError:
+            with log_context(
+                _logger, logging.DEBUG, f"Failed to search S3 for project {proj_id}"
+            ):
+                return
+
+    async def search_files(
+        self,
+        user_id: UserID,
+        *,
+        filename_pattern: str,
+        project_id: ProjectID | None = None,
+        items_per_page: NonNegativeInt = 100,
+    ) -> AsyncGenerator[list[FileMetaData], None]:
+        """
+        Search for files in S3 using a wildcard pattern for filenames.
+        Returns results as an async generator that yields pages of results.
+
+        Args:
+            user_id: The user requesting the search
+            filename_pattern: Wildcard pattern for filename matching (e.g., "*.txt", "test_*.json")
+            project_id: Optional project ID to limit search to specific project
+            items_per_page: Number of items to return per page
+
+        Yields:
+            List of FileMetaData objects for each page
+        """
+        # Validate access rights
+        accessible_projects_ids = await get_accessible_project_ids(
+            get_db_engine(self.app), user_id=user_id, project_id=project_id
+        )
+
+        # Search each accessible project
+        for proj_id in accessible_projects_ids:
+            async for page_results in self._search_project_s3_files(
+                proj_id, filename_pattern, user_id, items_per_page
+            ):
+                yield page_results
+
     async def create_soft_link(
         self, user_id: int, target_file_id: StorageFileID, link_file_id: StorageFileID
     ) -> FileMetaData:
```
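The per-object filter in `_search_project_s3_files` is worth spelling out: the wildcard is matched against the key's basename only, and the key must additionally have at least two `/`-separated parts, since valid simcore keys start with a project ID. A standalone restatement of that predicate (the example object keys below are invented for illustration):

```python
import fnmatch
from pathlib import Path

# Mirrors the `min_parts_for_valid_s3_object` constant in the diff above.
MIN_PARTS_FOR_VALID_S3_OBJECT = 2


def matches(object_key: str, filename_pattern: str) -> bool:
    """Illustrative restatement of the filter: shell-style wildcard match on
    the key's basename, plus a minimum key depth ("<project_id>/<file...>")."""
    filename = Path(object_key).name
    return (
        fnmatch.fnmatch(filename, filename_pattern)
        and len(object_key.split("/")) >= MIN_PARTS_FOR_VALID_S3_OBJECT
    )


assert matches("4a1b/node-0/report.txt", "*.txt")
assert matches("4a1b/data_file.csv", "data_*")
assert not matches("report.txt", "*.txt")  # top-level key: too shallow
assert not matches("4a1b/notes.md", "*.txt")  # basename does not match
```

Because the basename is extracted before matching, a pattern like `*.txt` finds files at any depth of the project tree; note also that `fnmatch.fnmatch` is case-sensitive on POSIX systems.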

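Since `search_files` is an async generator, callers consume it with `async for` rather than awaiting a single list. A minimal consumption sketch, assuming a `SimcoreS3DataManager` instance `dsm` plus `user_id` and `project_id` obtained from the application context:

```python
async def collect_txt_files(dsm, user_id, project_id):
    """Drain the paged generator into a flat list of FileMetaData."""
    found = []
    async for page in dsm.search_files(
        user_id=user_id,
        filename_pattern="*.txt",
        project_id=project_id,  # pass None to search all accessible projects
        items_per_page=50,
    ):
        # Each yielded page is sorted newest-first by last_modified and,
        # where project IDs are present, enriched with frontend display data.
        found.extend(page)
    return found
```

Yielding pages keeps memory bounded for projects with many objects; the `items_per_page * 2` over-fetch on the underlying S3 listing is a heuristic that compensates for objects the wildcard filter will discard.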
services/storage/tests/unit/test_simcore_s3_dsm.py

Lines changed: 125 additions & 0 deletions
```diff
@@ -168,6 +168,131 @@ async def test_upload_and_search(
         assert file.file_name in {"file1", "file2"}
 
 
+@pytest.mark.parametrize(
+    "location_id",
+    [SimcoreS3DataManager.get_location_id()],
+    ids=[SimcoreS3DataManager.get_location_name()],
+    indirect=True,
+)
+async def test_search_files(
+    simcore_s3_dsm: SimcoreS3DataManager,
+    upload_file: Callable[..., Awaitable[tuple[Path, SimcoreS3FileID]]],
+    file_size: ByteSize,
+    user_id: UserID,
+    project_id: ProjectID,
+    faker: Faker,
+):
+    # Upload files with different patterns
+    test_files = [
+        ("test_file1.txt", "*.txt"),
+        ("test_file2.txt", "*.txt"),
+        ("document.pdf", "*.pdf"),
+        ("data_file.csv", "data_*.csv"),
+        ("backup_file.bak", "backup_*"),
+        ("config.json", "*.json"),
+        ("temp_data.tmp", "temp_*"),
+    ]
+
+    uploaded_files = []
+    for file_name, _ in test_files:
+        checksum: SHA256Str = TypeAdapter(SHA256Str).validate_python(faker.sha256())
+        _, file_id = await upload_file(file_size, file_name, sha256_checksum=checksum)
+        uploaded_files.append((file_name, file_id, checksum))
+
+    # Test 1: Search for all .txt files
+    txt_results = []
+    async for page in simcore_s3_dsm.search_files(
+        user_id=user_id,
+        filename_pattern="*.txt",
+        project_id=project_id,
+        items_per_page=10,
+    ):
+        txt_results.extend(page)
+
+    # Should find 2 txt files
+    assert len(txt_results) == 2
+    txt_names = {file.file_name for file in txt_results}
+    assert txt_names == {"test_file1.txt", "test_file2.txt"}
+
+    # Test 2: Search with specific prefix pattern
+    data_results = []
+    async for page in simcore_s3_dsm.search_files(
+        user_id=user_id,
+        filename_pattern="data_*",
+        project_id=project_id,
+        items_per_page=10,
+    ):
+        data_results.extend(page)
+
+    # Should find 1 data file
+    assert len(data_results) == 1
+    assert data_results[0].file_name == "data_file.csv"
+
+    # Test 3: Search with pattern that matches multiple extensions
+    temp_results = []
+    async for page in simcore_s3_dsm.search_files(
+        user_id=user_id,
+        filename_pattern="temp_*",
+        project_id=project_id,
+        items_per_page=10,
+    ):
+        temp_results.extend(page)
+
+    # Should find 1 temp file
+    assert len(temp_results) == 1
+    assert temp_results[0].file_name == "temp_data.tmp"
+
+    # Test 4: Search with pattern that doesn't match anything
+    no_match_results = []
+    async for page in simcore_s3_dsm.search_files(
+        user_id=user_id,
+        filename_pattern="nonexistent_*",
+        project_id=project_id,
+        items_per_page=10,
+    ):
+        no_match_results.extend(page)
+
+    assert len(no_match_results) == 0
+
+    # Test 5: Search without project_id restriction (all accessible projects)
+    all_results = []
+    async for page in simcore_s3_dsm.search_files(
+        user_id=user_id,
+        filename_pattern="*",
+        items_per_page=10,
+    ):
+        all_results.extend(page)
+
+    # Should find at least our uploaded files
+    assert len(all_results) >= len(test_files)
+
+    # Verify that each result has expected FileMetaData structure
+    for file_meta in all_results:
+        assert isinstance(file_meta, FileMetaData)
+        assert file_meta.file_name is not None
+        assert file_meta.file_id is not None
+        assert file_meta.user_id == user_id
+        assert file_meta.project_id is not None
+
+    # Test 6: Test pagination with small page size
+    paginated_results = []
+    page_count = 0
+    async for page in simcore_s3_dsm.search_files(
+        user_id=user_id,
+        filename_pattern="*",
+        project_id=project_id,
+        items_per_page=2,  # Small page size to test pagination
+    ):
+        paginated_results.extend(page)
+        page_count += 1
+        # Each page should have at most 2 items
+        assert len(page) <= 2
+
+    # Should have multiple pages and all our files
+    assert page_count >= 4  # At least 7 files / 2 per page = 4 pages
+    assert len(paginated_results) == len(test_files)
+
+
 @pytest.fixture
 async def paths_for_export(
     random_project_with_files: Callable[
```
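The pagination assertions in Test 6 follow from the buffer-and-flush pattern inside `_search_project_s3_files`: matches accumulate until the buffer reaches `items_per_page`, at which point it is yielded and reset, with one final flush for the remainder. Stripped of the S3 specifics, the pattern reduces to this generic sketch (not code from the commit):

```python
import asyncio
from collections.abc import AsyncGenerator, AsyncIterable


async def paginate(
    items: AsyncIterable[int], page_size: int
) -> AsyncGenerator[list[int], None]:
    """Yield full pages of `page_size` items; flush the short remainder last."""
    buffer: list[int] = []
    async for item in items:
        buffer.append(item)
        if len(buffer) >= page_size:
            yield buffer
            buffer = []
    if buffer:
        yield buffer


async def _demo() -> None:
    async def seven() -> AsyncGenerator[int, None]:
        for i in range(7):
            yield i

    pages = [page async for page in paginate(seven(), 2)]
    # 7 items at 2 per page -> ceil(7 / 2) == 4 pages, each of at most 2 items,
    # matching the test's `page_count >= 4` and `len(page) <= 2` assertions.
    assert [len(p) for p in pages] == [2, 2, 2, 1]


if __name__ == "__main__":
    asyncio.run(_demo())
```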
