Skip to content

Commit 1c816d3

Browse files
authored
Bugfix/clean storage meta data table (ITISFoundation#2415)
* new entrypoint to trigger synchronisation of the table with S3
1 parent d6ea9ea commit 1c816d3

File tree

15 files changed

+442
-23
lines changed

15 files changed

+442
-23
lines changed

api/specs/storage/openapi.yaml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,32 @@ paths:
9797
default:
9898
$ref: "#/components/responses/DefaultErrorResponse"
9999

100+
/locations/{location_id}:sync:
101+
post:
102+
summary: Manually triggers the synchronisation of the file meta data table in the database
103+
operationId: synchronise_meta_data_table
104+
parameters:
105+
- name: location_id
106+
in: path
107+
required: true
108+
schema:
109+
type: string
110+
- name: dry_run
111+
in: query
112+
required: false
113+
schema:
114+
type: boolean
115+
default: true
116+
responses:
117+
"200":
118+
description: An object containing added, changed and removed paths
119+
content:
120+
application/json:
121+
schema:
122+
$ref: "#/components/schemas/TableSynchronisationEnveloped"
123+
default:
124+
$ref: "#/components/responses/DefaultErrorResponse"
125+
100126
/locations/{location_id}/datasets:
101127
get:
102128
summary: Lists all dataset's metadata
@@ -628,6 +654,28 @@ components:
628654
key1: value1
629655
key2: value2
630656

657+
TableSynchronisationEnveloped:
658+
type: object
659+
required:
660+
- data
661+
- error
662+
properties:
663+
data:
664+
$ref: "#/components/schemas/TableSynchronisation"
665+
error:
666+
nullable: true
667+
default: null
668+
669+
TableSynchronisation:
670+
type: object
671+
required:
672+
- removed
673+
properties:
674+
removed:
675+
type: array
676+
items:
677+
type: string
678+
631679
FileLocationArrayEnveloped:
632680
type: object
633681
required:

api/specs/webserver/components/schemas/locations.yaml

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ FileLocationEnveloped:
44
- data
55
properties:
66
data:
7-
$ref: '#/FileLocation'
7+
$ref: "#/FileLocation"
88
error:
99
nullable: true
1010
default: null
@@ -17,10 +17,32 @@ FileLocation:
1717
id:
1818
type: number
1919
example:
20-
filename: 'simcore.s3'
20+
filename: "simcore.s3"
2121
id: 0
2222

2323
FileLocationArray:
2424
type: array
2525
items:
26-
$ref: '#/FileLocation'
26+
$ref: "#/FileLocation"
27+
28+
TableSynchronisationEnveloped:
29+
type: object
30+
required:
31+
- data
32+
- error
33+
properties:
34+
data:
35+
$ref: "#/TableSynchronisation"
36+
error:
37+
nullable: true
38+
default: null
39+
40+
TableSynchronisation:
41+
type: object
42+
required:
43+
- removed
44+
properties:
45+
removed:
46+
type: array
47+
items:
48+
type: string

api/specs/webserver/openapi-storage.yaml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,34 @@ paths:
1515
default:
1616
$ref: "#/components/responses/DefaultErrorResponse"
1717

18+
/storage/locations/{location_id}:sync:
19+
post:
20+
summary: Manually triggers the synchronisation of the file meta data table in the database
21+
tags:
22+
- storage
23+
operationId: synchronise_meta_data_table
24+
parameters:
25+
- name: location_id
26+
in: path
27+
required: true
28+
schema:
29+
type: string
30+
- name: dry_run
31+
in: query
32+
required: false
33+
schema:
34+
type: boolean
35+
default: true
36+
responses:
37+
"200":
38+
description: An object containing added, changed and removed paths
39+
content:
40+
application/json:
41+
schema:
42+
$ref: "./components/schemas/locations.yaml#/TableSynchronisationEnveloped"
43+
default:
44+
$ref: "#/components/responses/DefaultErrorResponse"
45+
1846
/storage/locations/{location_id}/datasets:
1947
get:
2048
summary: Get datasets metadata

api/specs/webserver/openapi.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,9 @@ paths:
139139
/storage/locations:
140140
$ref: "./openapi-storage.yaml#/paths/~1storage~1locations"
141141

142+
/storage/locations/{location_id}:sync:
143+
$ref: "./openapi-storage.yaml#/paths/~1storage~1locations~1{location_id}:sync"
144+
142145
/storage/locations/{location_id}/files/metadata:
143146
$ref: "./openapi-storage.yaml#/paths/~1storage~1locations~1{location_id}~1files~1metadata"
144147

services/storage/src/simcore_service_storage/api/v0/openapi.yaml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,31 @@ paths:
9595
$ref: '#/components/schemas/FileLocationArrayEnveloped'
9696
default:
9797
$ref: '#/components/responses/DefaultErrorResponse'
98+
'/locations/{location_id}:sync':
99+
post:
100+
summary: Manually triggers the synchronisation of the file meta data table in the database
101+
operationId: synchronise_meta_data_table
102+
parameters:
103+
- name: location_id
104+
in: path
105+
required: true
106+
schema:
107+
type: string
108+
- name: dry_run
109+
in: query
110+
required: false
111+
schema:
112+
type: boolean
113+
default: true
114+
responses:
115+
'200':
116+
description: 'An object containing added, changed and removed paths'
117+
content:
118+
application/json:
119+
schema:
120+
$ref: '#/components/schemas/TableSynchronisationEnveloped'
121+
default:
122+
$ref: '#/components/responses/DefaultErrorResponse'
98123
'/locations/{location_id}/datasets':
99124
get:
100125
summary: Lists all dataset's metadata
@@ -597,6 +622,26 @@ components:
597622
body_value:
598623
key1: value1
599624
key2: value2
625+
TableSynchronisationEnveloped:
626+
type: object
627+
required:
628+
- data
629+
- error
630+
properties:
631+
data:
632+
$ref: '#/components/schemas/TableSynchronisation'
633+
error:
634+
nullable: true
635+
default: null
636+
TableSynchronisation:
637+
type: object
638+
required:
639+
- removed
640+
properties:
641+
removed:
642+
type: array
643+
items:
644+
type: string
600645
FileLocationArrayEnveloped:
601646
type: object
602647
required:

services/storage/src/simcore_service_storage/dsm.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,3 +1009,42 @@ async def create_soft_link(
10091009
result = await conn.execute(stmt)
10101010
link = to_meta_data_extended(await result.first())
10111011
return link
1012+
1013+
async def synchronise_meta_data_table(
1014+
self, location: str, dry_run: bool
1015+
) -> Dict[str, Any]:
1016+
sync_results = {"removed": []}
1017+
if location == SIMCORE_S3_STR:
1018+
# NOTE: only valid for Simcore, since datcore data is not in the database table
1019+
# let's get all the files in the table
1020+
logger.warning(
1021+
"synchronisation of database/s3 storage started, this will take some time..."
1022+
)
1023+
async with self.engine.acquire() as conn, self._create_client_context() as s3_client:
1024+
number_of_rows_in_db = await conn.scalar(file_meta_data.count())
1025+
logger.warning(
1026+
"total number of entries to check %d",
1027+
number_of_rows_in_db,
1028+
)
1029+
1030+
async for row in conn.execute(file_meta_data.select()):
1031+
s3_key = row.object_name # type: ignore
1032+
1033+
# now check if the file exists in S3
1034+
try:
1035+
await s3_client.get_object(
1036+
Bucket=self.simcore_bucket_name,
1037+
Key=s3_key,
1038+
)
1039+
except s3_client.exceptions.NoSuchKey:
1040+
# this file does not exist
1041+
sync_results["removed"].append(s3_key)
1042+
1043+
if not dry_run:
1044+
await conn.execute(
1045+
file_meta_data.delete().where(
1046+
file_meta_data.c.object_name.in_(sync_results["removed"])
1047+
)
1048+
)
1049+
1050+
return sync_results

services/storage/src/simcore_service_storage/handlers.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import json
22
import logging
33
from contextlib import contextmanager
4-
from typing import Dict
4+
from typing import Any, Dict
55

66
import attr
77
from aiohttp import web
@@ -219,6 +219,24 @@ async def get_file_metadata(request: web.Request):
219219
}
220220

221221

222+
@routes.post(f"/{api_vtag}/locations/{{location_id}}:sync") # type: ignore
223+
async def synchronise_meta_data_table(request: web.Request) -> Dict[str, Any]:
224+
params, query, *_ = await extract_and_validate(request)
225+
assert query["dry_run"] is not None # nosec
226+
assert params["location_id"] # nosec
227+
228+
with handle_storage_errors():
229+
location_id = params["location_id"]
230+
dry_run = query["dry_run"]
231+
dsm = await _prepare_storage_manager(params, query, request)
232+
location = dsm.location_from_id(location_id)
233+
data_changed: Dict[str, Any] = await dsm.synchronise_meta_data_table(
234+
location, dry_run
235+
)
236+
237+
return {"error": None, "data": data_changed}
238+
239+
222240
# DISABLED: @routes.patch(f"/{api_vtag}/locations/{{location_id}}/files/{{fileId}}/metadata") # type: ignore
223241
async def update_file_meta_data(request: web.Request):
224242
params, query, body = await extract_and_validate(request)

services/storage/tests/conftest.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from concurrent.futures import ThreadPoolExecutor
1414
from pathlib import Path
1515
from random import randrange
16-
from typing import Dict, Iterator, Tuple
16+
from typing import Any, Dict, Iterator, Tuple
1717

1818
import dotenv
1919
import pytest
@@ -188,7 +188,7 @@ async def postgres_engine(loop, postgres_service_url):
188188

189189

190190
@pytest.fixture(scope="session")
191-
def minio_service(docker_services, docker_ip):
191+
def minio_service(docker_services, docker_ip) -> Dict[str, Any]:
192192

193193
# Build URL to service listening on random port.
194194
url = "http://%s:%d/" % (
@@ -215,7 +215,7 @@ def minio_service(docker_services, docker_ip):
215215

216216

217217
@pytest.fixture(scope="module")
218-
def s3_client(minio_service):
218+
def s3_client(minio_service: Dict[str, Any]) -> MinioClientWrapper:
219219

220220
s3_client = MinioClientWrapper(
221221
endpoint=minio_service["endpoint"],

services/storage/tests/unit/test_dsm.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,15 @@
1616
import uuid
1717
from pathlib import Path
1818
from shutil import copyfile
19+
from typing import Any, Dict, Tuple
1920

2021
import attr
2122
import pytest
2223
import tests.utils
2324
from simcore_service_storage.constants import DATCORE_STR, SIMCORE_S3_ID, SIMCORE_S3_STR
25+
from simcore_service_storage.dsm import DataStorageManager
2426
from simcore_service_storage.models import FileMetaData
27+
from simcore_service_storage.s3wrapper.s3_client import MinioClientWrapper
2528
from tests.utils import BUCKET_NAME, USER_ID, has_datcore_tokens
2629

2730

@@ -592,6 +595,46 @@ async def test_dsm_list_datasets_s3(dsm_fixture, dsm_mockup_complete_db):
592595
assert any("Kember" in d.display_name for d in datasets)
593596

594597

598+
async def test_sync_table_meta_data(
599+
dsm_fixture: DataStorageManager,
600+
dsm_mockup_complete_db: Tuple[Dict[str, str], Dict[str, str]],
601+
s3_client: MinioClientWrapper,
602+
):
603+
dsm_fixture.has_project_db = True
604+
605+
expected_removed_files = []
606+
# the list should be empty on start
607+
list_changes: Dict[str, Any] = await dsm_fixture.synchronise_meta_data_table(
608+
location=SIMCORE_S3_STR, dry_run=True
609+
)
610+
assert "removed" in list_changes
611+
assert list_changes["removed"] == expected_removed_files
612+
613+
# now remove the files
614+
for file_entry in dsm_mockup_complete_db:
615+
s3_key = f"{file_entry['project_id']}/{file_entry['node_id']}/{file_entry['filename']}"
616+
s3_client.remove_objects(BUCKET_NAME, [s3_key])
617+
expected_removed_files.append(s3_key)
618+
619+
# the list should now contain the removed entries
620+
list_changes: Dict[str, Any] = await dsm_fixture.synchronise_meta_data_table(
621+
location=SIMCORE_S3_STR, dry_run=True
622+
)
623+
assert "removed" in list_changes
624+
assert list_changes["removed"] == expected_removed_files
625+
626+
# now effectively call the function should really remove the files
627+
list_changes: Dict[str, Any] = await dsm_fixture.synchronise_meta_data_table(
628+
location=SIMCORE_S3_STR, dry_run=False
629+
)
630+
# listing again will show an empty list again
631+
list_changes: Dict[str, Any] = await dsm_fixture.synchronise_meta_data_table(
632+
location=SIMCORE_S3_STR, dry_run=True
633+
)
634+
assert "removed" in list_changes
635+
assert list_changes["removed"] == []
636+
637+
595638
@pytest.mark.skipif(not has_datcore_tokens(), reason="no datcore tokens")
596639
async def test_dsm_list_datasets_datcore(dsm_fixture, datcore_structured_testbucket):
597640
datasets = await dsm_fixture.list_datasets(user_id=USER_ID, location=DATCORE_STR)

services/storage/tests/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
log = logging.getLogger(__name__)
2020

2121

22-
DATABASE = "aio_login_tests"
22+
DATABASE = "test"
2323
USER = "admin"
2424
PASS = "admin"
2525

0 commit comments

Comments
 (0)