Skip to content

Commit 2a52c7c

Browse files
authored
Migrate metadata (#1192)
* dataset metadata is working * register migration extractor and successfully migrate machine metadata
1 parent 86c8493 commit 2a52c7c

File tree

3 files changed

+230
-56
lines changed

3 files changed

+230
-56
lines changed

backend/app/models/metadata.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"int": int,
2121
"float": float,
2222
"str": str,
23+
"string": str,
2324
"TextField": str,
2425
"bool": bool,
2526
# TODO figure out how to parse "yyyymmdd hh:mm:ssssssz" into datetime object

scripts/migration/migrate.py

Lines changed: 208 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44
import requests
55
from dotenv import dotenv_values
66

7+
from scripts.migration.migrate_metadata_definitions import (
8+
check_metadata_definition_exists,
9+
get_clowder_v1_metadata_definitions,
10+
post_metadata_definition,
11+
)
12+
713
# Configuration and Constants
814
DEFAULT_PASSWORD = "Password123&"
915

@@ -63,33 +69,33 @@ def generate_user_api_key(user, password=DEFAULT_PASSWORD):
6369

6470
def get_clowder_v1_users():
    """Fetch the user list from the Clowder v1 API.

    The request is sent with ``superAdmin=true`` and without TLS
    certificate verification.

    Returns:
        The decoded JSON body of the ``/api/users`` response.
    """
    users_url = f"{CLOWDER_V1}/api/users?superAdmin=true"
    return requests.get(users_url, headers=base_headers_v1, verify=False).json()
6975

7076

7177
def get_clowder_v1_user_datasets(user_id):
    """Return the Clowder v1 datasets authored by ``user_id``.

    All datasets are requested in a single call (``limit=0``) and then
    filtered client-side on their ``authorId`` field.
    """
    # TODO what about pagination
    datasets_url = f"{CLOWDER_V1}/api/datasets?limit=0&superAdmin=true"
    listing = requests.get(datasets_url, headers=clowder_headers_v1, verify=False)

    owned = []
    for candidate in listing.json():
        if candidate["authorId"] == user_id:
            owned.append(candidate)
    return owned
7783

7884

7985
def get_clowder_v1_user_spaces(user_v1):
    """Return the Clowder v1 spaces whose ``creator`` is ``user_v1["id"]``.

    Every space is fetched from the v1 API and filtered client-side.
    """
    spaces_url = f"{CLOWDER_V1}/api/spaces?superAdmin=true"
    listing = requests.get(spaces_url, headers=clowder_headers_v1, verify=False)

    created = []
    for candidate in listing.json():
        if candidate["creator"] == user_v1["id"]:
            created.append(candidate)
    return created
8389

8490

8591
def get_clowder_v1_user_spaces_members(space_id):
    """Fetch the users of the Clowder v1 space identified by ``space_id``.

    Returns:
        The decoded JSON body of the ``/api/spaces/{space_id}/users`` response.
    """
    members_url = f"{CLOWDER_V1}/api/spaces/{space_id}/users?superAdmin=true"
    return requests.get(members_url, headers=clowder_headers_v1, verify=False).json()
8995

9096

9197
def get_clowder_v2_space_datasets(space_id):
    """Fetch the datasets that belong to the space identified by ``space_id``.

    Note: despite the ``v2`` in its name, this function queries the
    Clowder v1 API (``CLOWDER_V1``) with the v1 headers.

    Returns:
        The decoded JSON body of the ``/api/spaces/{space_id}/datasets`` response.
    """
    space_datasets_url = f"{CLOWDER_V1}/api/spaces/{space_id}/datasets?superAdmin=true"
    return requests.get(
        space_datasets_url, headers=clowder_headers_v1, verify=False
    ).json()
95101

@@ -265,32 +271,180 @@ def download_and_upload_file(file, all_dataset_folders, dataset_v2_id, headers_v
265271
dataset_file_upload_endpoint = f"{CLOWDER_V2}/api/v2/datasets/{dataset_v2_id}/files"
266272
if matching_folder:
267273
dataset_file_upload_endpoint += f"Multiple?folder_id={matching_folder['id']}"
268-
file_exists = os.path.exists(filename)
269-
# with open(filename, "rb") as file_data:
270274
response = requests.post(
271-
dataset_file_upload_endpoint, headers=headers_v2, files={"file": open(filename, "rb")}
275+
dataset_file_upload_endpoint,
276+
headers=headers_v2,
277+
files={"file": open(filename, "rb")},
272278
)
273279

274-
if response.status_code == 200:
275-
print(f"Uploaded file: {filename} to dataset {dataset_v2_id}")
276-
277280
# Clean up the local file after upload
278281
try:
279282
os.remove(filename)
280283
except Exception as e:
281284
print(f"Could not delete locally downloaded file: {filename}")
282285
print(e)
283-
print(f"Completed upload for file: {filename}")
286+
287+
if response.status_code == 200:
288+
print(f"Uploaded file: {filename} to dataset {dataset_v2_id}")
289+
return response.json().get("id")
290+
else:
291+
print(f"Failed to upload file: {filename} to dataset {dataset_v2_id}")
292+
293+
return None
294+
295+
296+
def add_file_metadata(file_v1, file_v2_id, headers_v1, headers_v2):
    """Copy the metadata of a Clowder v1 file onto its Clowder v2 counterpart.

    Args:
        file_v1: v1 file record; only its ``"id"`` is read.
        file_v2_id: id of the already-uploaded file in Clowder v2.
        headers_v1: request headers used against the v1 API.
        headers_v2: request headers used against the v2 API.

    For every v1 metadata entry that has a ``"content"`` mapping, each key is
    checked against the v2 metadata definitions. Keys with a definition are
    posted as user metadata; otherwise the entry is posted once as machine
    metadata attributed to the "migration" listener. Outcomes are printed;
    nothing is returned.
    """
    # Get metadata from Clowder V1
    source_url = f"{CLOWDER_V1}/api/files/{file_v1['id']}/metadata.jsonld?superAdmin=true"
    v1_entries = requests.get(source_url, headers=headers_v1).json()

    # The V2 target only depends on the file, so it is built once up front.
    v2_metadata_endpoint = f"{CLOWDER_V2}/api/v2/files/{file_v2_id}/metadata"

    for metadata in v1_entries:
        if "content" not in metadata:
            continue

        for key, _ in metadata["content"].items():
            # If a metadata definition exists for this key, post as user
            # metadata; otherwise fall back to machine metadata.
            if check_metadata_definition_exists(CLOWDER_V2, key, headers=headers_v2):
                user_payload = {
                    "definition": key,
                    "content": metadata["content"],
                }
                response = requests.post(
                    v2_metadata_endpoint,
                    json=user_payload,
                    headers=headers_v2,
                )
                if response.status_code == 200:
                    print(f"Successfully posted file metadata to V2: {key}")
                else:
                    print(f"Failed to post file metadata to V2: {response.text}")
            elif "agent" in metadata and "listener" not in metadata:
                # NOTE(review): nesting reconstructed from a flattened diff;
                # confirm the POST and break belong inside this branch.
                metadata["listener"] = {
                    "name": "migration",
                    "version": "1",
                    "description": "Migration of metadata from Clowder v1 to Clowder v2",
                }
                response = requests.post(
                    v2_metadata_endpoint, json=metadata, headers=headers_v2
                )
                if response.status_code == 200:
                    print("Successfully posted file machine metadata to V2")
                else:
                    print(f"Failed to post file metadata to V2: {response.text}")
                break  # machine metadata no need to iterate through all the keys
346+
347+
348+
def add_dataset_metadata(dataset_v1, dataset_v2_id, headers_v1, headers_v2):
    """Copy the metadata of a Clowder v1 dataset onto its Clowder v2 counterpart.

    Args:
        dataset_v1: v1 dataset record; only its ``"id"`` is read.
        dataset_v2_id: id of the already-created dataset in Clowder v2.
        headers_v1: request headers used against the v1 API.
        headers_v2: request headers used against the v2 API.

    For every v1 metadata entry that has a ``"content"`` mapping, each key is
    checked against the v2 metadata definitions. Keys with a definition are
    posted as user metadata; otherwise the entry is posted once as machine
    metadata attributed to the "migration" listener. Outcomes are printed;
    nothing is returned.
    """
    # Get metadata from Clowder V1
    source_url = (
        f"{CLOWDER_V1}/api/datasets/{dataset_v1['id']}/metadata.jsonld?superAdmin=true"
    )
    v1_entries = requests.get(source_url, headers=headers_v1).json()

    # The V2 target only depends on the dataset, so it is built once up front.
    v2_metadata_endpoint = f"{CLOWDER_V2}/api/v2/datasets/{dataset_v2_id}/metadata"

    for metadata in v1_entries:
        if "content" not in metadata:
            continue

        for key, _ in metadata["content"].items():
            # If a metadata definition exists for this key, post as user
            # metadata; otherwise fall back to machine metadata.
            if check_metadata_definition_exists(CLOWDER_V2, key, headers=headers_v2):
                user_payload = {
                    "definition": key,
                    "content": metadata["content"],
                }
                response = requests.post(
                    v2_metadata_endpoint,
                    json=user_payload,
                    headers=headers_v2,
                )
                if response.status_code == 200:
                    print(f"Successfully posted dataset metadata to V2: {key}")
                else:
                    print(f"Failed to post dataset metadata to V2: {response.text}")
            elif "agent" in metadata and "listener" not in metadata:
                # NOTE(review): nesting reconstructed from a flattened diff;
                # confirm the POST and break belong inside this branch.
                metadata["listener"] = {
                    "name": "migration",
                    "version": "1",
                    "description": "Migration of metadata from Clowder v1 to Clowder v2",
                }
                response = requests.post(
                    v2_metadata_endpoint, json=metadata, headers=headers_v2
                )
                if response.status_code == 200:
                    print("Successfully posted dataset machine metadata to V2")
                else:
                    print(f"Failed to post dataset metadata to V2: {response.text}")
                break  # machine metadata no need to iterate through all the keys
400+
401+
402+
def register_migration_extractor():
    """Register the migration extractor in Clowder v2.

    The v2 listener search is queried with the extractor name first. If the
    search succeeds, reports a positive ``total_count`` and contains a hit
    whose ``name`` matches exactly, the function prints a notice and returns
    without posting. Otherwise the extractor description is posted to the v2
    extractors endpoint and the outcome is printed. Nothing is returned.
    """
    migration_extractor = {
        "name": "migration",
        "description": "Migration of metadata from Clowder v1 to Clowder v2",
        "version": "1",
        "author": "Clowder Devs",
    }

    # check if migration extractor already exists
    search_endpoint = f"{CLOWDER_V2}/api/v2/listeners/search"
    search_params = {"text": migration_extractor["name"]}
    search_response = requests.get(
        search_endpoint, headers=clowder_headers_v2, params=search_params
    )

    if search_response.status_code == 200:
        # Decode the body once and reuse it for both the count and the hits.
        search_data = search_response.json()
        if search_data.get("metadata", {}).get("total_count", 0) > 0:
            # Only an exact name match counts as "already registered".
            for existing_extractor in search_data.get("data", []):
                if existing_extractor.get("name") == migration_extractor["name"]:
                    print(
                        f"Extractor {migration_extractor['name']} already exists in Clowder v2."
                    )
                    return

    endpoint = f"{CLOWDER_V2}/api/v2/extractors"
    response = requests.post(
        endpoint, json=migration_extractor, headers=clowder_headers_v2
    )

    if response.status_code == 200:
        print("Successfully registered migration extractor in Clowder v2.")
    else:
        print(
            f"Failed to register migration extractor in Clowder v2. Status code: {response.status_code}"
        )
284440

285441

286442
def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
287443
"""Process user resources from Clowder v1 to Clowder v2."""
288444
user_v1_datasets = get_clowder_v1_user_datasets(user_id=user_v1["id"])
289445
user_v2_api_key = create_local_user(user_v1)
290446
USER_MAP[user_v1["id"]] = user_v2_api_key
291-
base_user_headers_v2 = {
292-
"x-api-key": user_v2_api_key
293-
}
447+
base_user_headers_v2 = {"x-api-key": user_v2_api_key}
294448
user_headers_v2 = {
295449
"x-api-key": user_v2_api_key,
296450
"content-type": "application/json",
@@ -301,6 +455,7 @@ def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
301455
print(f"Creating dataset in v2: {dataset['id']} - {dataset['name']}")
302456
dataset_v2_id = create_v2_dataset(dataset, user_headers_v2)
303457
DATASET_MAP[dataset["id"]] = dataset_v2_id
458+
add_dataset_metadata(dataset, dataset_v2_id, base_headers_v1, user_headers_v2)
304459
add_dataset_folders(dataset, dataset_v2_id, user_headers_v2)
305460
print("Created folders in the new dataset")
306461

@@ -316,34 +471,33 @@ def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
316471
files_result = files_response.json()
317472

318473
for file in files_result:
319-
download_and_upload_file(
474+
file_v2_id = download_and_upload_file(
320475
file, all_dataset_folders, dataset_v2_id, base_user_headers_v2
321476
)
477+
if file_v2_id is not None:
478+
add_file_metadata(file, file_v2_id, clowder_headers_v1, user_headers_v2)
479+
322480
return [USER_MAP, DATASET_MAP]
323481

324482

325483
if __name__ == "__main__":
326-
# users_v1 = get_clowder_v1_users()
484+
##############################################################################################################
485+
# migrate metadata definition
486+
v1_md_definitions = get_clowder_v1_metadata_definitions(CLOWDER_V1, base_headers_v1)
487+
posted_ids = []
488+
for v1_md in v1_md_definitions:
489+
definition_id = post_metadata_definition(v1_md, CLOWDER_V2, clowder_headers_v2)
490+
if definition_id:
491+
posted_ids.append(definition_id)
492+
493+
##############################################################################################################
494+
# Register the migration extractor in Clowder v2
495+
register_migration_extractor()
496+
497+
##############################################################################################################
498+
# migrate users and resources
327499
USER_MAP = {}
328500
DATASET_MAP = {}
329-
users_v1 = [
330-
{
331-
"@context": {
332-
"firstName": "http://schema.org/Person/givenName",
333-
"lastName": "http://schema.org/Person/familyName",
334-
"email": "http://schema.org/Person/email",
335-
"affiliation": "http://schema.org/Person/affiliation",
336-
},
337-
"id": "576313ce1407b25fe19fc381",
338-
"firstName": "Chen",
339-
"lastName": "Wang",
340-
"fullName": "Chen Wang",
341-
"email": "[email protected]",
342-
"avatar": "http://www.gravatar.com/avatar/2f97a52f2214949c4172d7fb796f173e?d=404",
343-
"profile": {},
344-
"identityProvider": "Chen Wang ([email protected]) [Local Account]",
345-
}
346-
]
347501
users_v1 = get_clowder_v1_users()
348502
for user_v1 in users_v1:
349503
if (
@@ -357,21 +511,23 @@ def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
357511
else:
358512
print(f"Skipping user {user_v1['email']} as it is not a local account.")
359513

360-
print("Now migrating spaces.")
361-
for user_v1 in users_v1:
362-
print(f"Migrating spaces of user {user_v1['email']}")
363-
user_v1_spaces = get_clowder_v1_user_spaces(user_v1)
364-
user_v2_api_key = USER_MAP[user_v1["id"]]
365-
for space in user_v1_spaces:
366-
group_id = create_v2_group(space, headers={"X-API-key": user_v2_api_key})
367-
add_v1_space_members_to_v2_group(
368-
space, group_id, headers={"X-API-key": user_v2_api_key}
369-
)
370-
space_datasets = get_clowder_v2_space_datasets(space["id"])
371-
for space_dataset in space_datasets:
372-
dataset_v2_id = DATASET_MAP[space_dataset["id"]]
373-
share_dataset_with_group(
374-
group_id, space, headers={"X-API-key": user_v2_api_key}
375-
)
376-
print(f"Migrated spaces of user {user_v1['email']}")
514+
##############################################################################################################
515+
# migrate spaces
516+
# print("Now migrating spaces.")
517+
# for user_v1 in users_v1:
518+
# print(f"Migrating spaces of user {user_v1['email']}")
519+
# user_v1_spaces = get_clowder_v1_user_spaces(user_v1)
520+
# user_v2_api_key = USER_MAP[user_v1["id"]]
521+
# for space in user_v1_spaces:
522+
# group_id = create_v2_group(space, headers={"X-API-key": user_v2_api_key})
523+
# add_v1_space_members_to_v2_group(
524+
# space, group_id, headers={"X-API-key": user_v2_api_key}
525+
# )
526+
# space_datasets = get_clowder_v2_space_datasets(space["id"])
527+
# for space_dataset in space_datasets:
528+
# dataset_v2_id = DATASET_MAP[space_dataset["id"]]
529+
# share_dataset_with_group(
530+
# group_id, space, headers={"X-API-key": user_v2_api_key}
531+
# )
532+
# print(f"Migrated spaces of user {user_v1['email']}")
377533
print("Migration complete.")

0 commit comments

Comments
 (0)