Commit 73609d2

get top level collections
these will become datasets
1 parent dbe3016 commit 73609d2

File tree

1 file changed: +130 -2 lines changed
scripts/migration/migrate.py

Lines changed: 130 additions & 2 deletions
@@ -173,6 +173,22 @@ def get_clowder_v1_user_collections(headers, user_v1):
     response = requests.get(endpoint, headers=headers)
     return [col for col in response.json() if col["authorId"] == user_v1["id"]]
 
+
+def get_clowder_v1_user_collections_top_level(headers, user_v1):
+    top_level_collections = []
+    endpoint = f"{CLOWDER_V1}/api/collections/topLevelCollections"
+    response = requests.get(endpoint, headers=headers)
+    response_json = response.json()
+    for col in response_json:
+        # v1 returns the author as a serialized string like "MiniUser(<id>, ...)",
+        # so the id has to be parsed out of the text. removeprefix drops the
+        # literal prefix; lstrip('MiniUser(') would strip a character *set*
+        # and could eat leading characters of the id itself.
+        author = col["author"]
+        author_id = author.removeprefix("MiniUser(")
+        author_id = author_id[:author_id.index(",")]
+        if author_id == user_v1["id"]:
+            top_level_collections.append(col)
+    return top_level_collections
+
+
 # TODO this is too slow, we need to optimize it
 def get_clowder_v1_dataset_collections(headers, user_v1, dataset_id):
     matching_collections = []
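
A note on the author parsing in get_clowder_v1_user_collections_top_level above: the v1 topLevelCollections response appears to serialize each collection's author as a Scala-style string such as "MiniUser(<id>, ...)" rather than a JSON object, which is why the id is carved out of the text. Below is a minimal, self-contained sketch of that extraction (Python 3.9+ for str.removeprefix); the sample value and its field order are assumptions for illustration, not taken from a real v1 payload.

    # Minimal sketch of the author-id extraction used above. The sample
    # string below is hypothetical; the real v1 payload may order the
    # MiniUser fields differently.
    def extract_author_id(author: str) -> str:
        # drop the literal "MiniUser(" prefix, then keep everything up to
        # the first comma (assumed to be the id field)
        author_id = author.removeprefix("MiniUser(")
        return author_id[:author_id.index(",")]

    sample = "MiniUser(5d1e2b3c4f5a6b7c8d9e0f1a, Jane Doe, jane@example.org)"
    assert extract_author_id(sample) == "5d1e2b3c4f5a6b7c8d9e0f1a"
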
@@ -769,6 +785,117 @@ def build_collection_space_metadata_for_v1_dataset(dataset, user_v1, headers):
     print(f"Got space and collection metadata from dataset {dataset_id}")
     return metadata
 
+
+def process_user_and_resources_collections(user_v1, USER_MAP, DATASET_MAP, COLLECTIONS_MAP):
+    """Process user resources and top-level collections from Clowder v1 to Clowder v2."""
+
+    # get the user's datasets and create the corresponding v2 user
+    user_v1_datasets = get_clowder_v1_user_datasets(user_id=user_v1["id"])
+    # NOTE: only the first two datasets are processed (appears to be a temporary cap)
+    user_v1_datasets = user_v1_datasets[:2]
+    user_v2_api_key = create_local_user(user_v1)
+    USER_MAP[user_v1["id"]] = user_v2_api_key
+    base_user_headers_v2 = {"x-api-key": user_v2_api_key}
+    user_headers_v2 = {
+        "x-api-key": user_v2_api_key,
+        "content-type": "application/json",
+        "accept": "application/json",
+    }
+
+    # get the user's top-level collections
+    user_v1_collections = get_clowder_v1_user_collections_top_level(
+        headers=clowder_headers_v1, user_v1=user_v1
+    )
+    print(f"Got {len(user_v1_collections)} user collections at the top level")
+
+    for dataset in user_v1_datasets:
+        print(f"Creating dataset in v2: {dataset['id']} - {dataset['name']}")
+        dataset_v1_id = dataset["id"]
+        dataset_v1_spaces = dataset["spaces"]
+        MIGRATE_DATASET = True
+        # debug output of the active filter configuration
+        print(toml_exclude_dataset_ids)
+        print(toml_space_ids)
+        print(toml_exclude_space_ids)
+        # Skip datasets on the explicit exclude list
+        if dataset_v1_id in toml_exclude_dataset_ids:
+            print(f"Skipping dataset {dataset_v1_id} as it is in the exclude list.")
+            MIGRATE_DATASET = False
+        # If an allow-list of spaces is configured, the dataset must be in one of them
+        if toml_space_ids is not None and len(toml_space_ids) > 0:
+            if not any(space_id in dataset_v1_spaces for space_id in toml_space_ids):
+                print(
+                    f"Skipping dataset {dataset_v1_id} as it is not in the specified spaces."
+                )
+                MIGRATE_DATASET = False
+        # Skip datasets that sit in an excluded space
+        if toml_exclude_space_ids is not None and len(toml_exclude_space_ids) > 0:
+            if any(space_id in dataset_v1_spaces for space_id in toml_exclude_space_ids):
+                print(
+                    f"Skipping dataset {dataset_v1_id} as it is in the excluded spaces."
+                )
+                MIGRATE_DATASET = False
+        if MIGRATE_DATASET:
+            dataset_v2_id = create_v2_dataset(dataset, user_headers_v2)
+            DATASET_MAP[dataset["id"]] = dataset_v2_id
+            add_dataset_metadata(dataset, dataset_v2_id, base_headers_v1, user_headers_v2)
+            add_dataset_folders(dataset, dataset_v2_id, user_headers_v2)
+            print("Created folders in the new dataset")
+
+            all_dataset_folders = get_folder_and_subfolders(dataset_v2_id, user_headers_v2)
+
+            # Retrieve files for the dataset in Clowder v1
+            dataset_files_endpoint = (
+                f"{CLOWDER_V1}/api/datasets/{dataset['id']}/files?superAdmin=true"
+            )
+            files_response = requests.get(
+                dataset_files_endpoint, headers=clowder_headers_v1, verify=False
+            )
+            files_result = files_response.json()
+
+            for file in files_result:
+                file_v2_id = download_and_upload_file(
+                    file, all_dataset_folders, dataset_v2_id, base_user_headers_v2
+                )
+                if file_v2_id is not None:
+                    add_file_metadata(file, file_v2_id, clowder_headers_v1, user_headers_v2)
+
+            # post the collection hierarchy as metadata; default to an empty
+            # dict so the payload below is still defined if the build fails
+            collection_space_metadata_dict = {}
+            try:
+                collection_space_metadata_dict = build_collection_space_metadata_for_v1_dataset(
+                    dataset=dataset, user_v1=user_v1, headers=clowder_headers_v1
+                )
+            except Exception as e:
+                print(e)
+            migration_extractor_collection_metadata = {
+                "listener": {
+                    "name": "migration",
+                    "version": "1",
+                    "description": "Migration of metadata from Clowder v1 to Clowder v2",
+                },
+                "context_url": "https://clowder.ncsa.illinois.edu/contexts/metadata.jsonld",
+                "content": collection_space_metadata_dict,
+                "contents": collection_space_metadata_dict,
+            }
+            v2_metadata_endpoint = f"{CLOWDER_V2}/api/v2/datasets/{dataset_v2_id}/metadata"
+            response = requests.post(
+                v2_metadata_endpoint,
+                json=migration_extractor_collection_metadata,
+                headers=clowder_headers_v2,
+            )
+            if response.status_code == 200:
+                print("Successfully added collection info as metadata in v2.")
+            else:
+                print(
+                    f"Failed to add collection info as metadata in Clowder v2. Status code: {response.status_code}"
+                )
+        else:
+            print(f"Skipping dataset {dataset_v1_id} as it does not meet the criteria.")
+
+    return [USER_MAP, DATASET_MAP]
+
 
 def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
     """Process user resources from Clowder v1 to Clowder v2."""
@@ -891,6 +1018,7 @@ def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
 # migrate users and resources
 USER_MAP = {}
 DATASET_MAP = {}
+COLLECTIONS_MAP = {}
 users_v1 = get_clowder_v1_users()
 # TODO filter if toml users
 if toml_users is not None and len(toml_users) > 0:
@@ -905,8 +1033,8 @@ def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
             "[Local Account]" in user_v1["identityProvider"]
             and user_v1["email"] != admin_user["email"]
         ):
-            [USER_MAP, DATASET_MAP] = process_user_and_resources(
-                user_v1, USER_MAP, DATASET_MAP
+            [USER_MAP, DATASET_MAP] = process_user_and_resources_collections(
+                user_v1, USER_MAP, DATASET_MAP, COLLECTIONS_MAP
             )
             print(f"Migrated user {user_v1['email']} and associated resources.")
         else:
