Skip to content

Commit 4e68f21

Browse files
tcnicholddey2longshuicy
authored
1187 migrate collections and spaces as v2 metadata. (#1190)
* match other branch * adding new collection metadata * str not float * getting collections for a dataset in v1, fixing metadata for collections * posts collection name and id * adding routes for getting collections * making a method like the one in v1 for self and ancestors. it will be easier to build a collection hierarchy from this * sample json for mdata * posts collection name and id * building the data for collections * something works now * matching with other branch * methods for migrating collections as metadata * need to post it as metadata * change name * adding the metadata for collections * adding context url and right endpoint * getting spaces as well as collections * change name * renaming method * created v2 license based on v1 license details (#1193) Co-authored-by: Chen Wang <[email protected]> * removing print statements * better error logging --------- Co-authored-by: Dipannita <[email protected]> Co-authored-by: Chen Wang <[email protected]>
1 parent 2a52c7c commit 4e68f21

File tree

2 files changed

+339
-1
lines changed

2 files changed

+339
-1
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"name" : "Collection",
3+
"description" : "Collection information from v1",
4+
"required_for_items": {
5+
"datasets": false,
6+
"files": false
7+
},
8+
"context" : [
9+
{
10+
"collection_name" : "https://schema.org/colname",
11+
"collection_id" : "https://schema.org/colid",
12+
"parent_collection_name": "https://schema.org/parentcolname",
13+
"parent_collection_id": "https://schema.org/parentcolid"
14+
}
15+
],
16+
"fields" : [
17+
{
18+
"name" : "collection_name",
19+
"list" : false,
20+
"widgetType": "TextField",
21+
"config": {
22+
"type" : "str"
23+
},
24+
"required" : false
25+
},
26+
{
27+
"name" : "collection_id",
28+
"list" : false,
29+
"widgetType": "TextField",
30+
"config": {
31+
"type" : "str"
32+
},
33+
"required" : false
34+
},
35+
{
36+
"name" : "parent_collection_name",
37+
"list" : false,
38+
"widgetType": "TextField",
39+
"config": {
40+
"type" : "str"
41+
},
42+
"required" : false
43+
},
44+
{
45+
"name" : "parent_collection_id",
46+
"list" : false,
47+
"widgetType": "TextField",
48+
"config": {
49+
"type" : "str"
50+
},
51+
"required" : false
52+
}
53+
]
54+
}

scripts/migration/migrate.py

Lines changed: 285 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,102 @@ def add_v1_space_members_to_v2_group(space, group_id, headers):
117117
)
118118

119119

120+
def get_clowder_v1_user_collections(headers, user_v1):
    """Return the Clowder v1 collections authored by ``user_v1``.

    Requests ``/api/collections`` from the v1 instance and keeps only the
    entries whose ``authorId`` equals ``user_v1["id"]``.
    """
    response = requests.get(f"{CLOWDER_V1}/api/collections", headers=headers)
    owned_collections = []
    for candidate in response.json():
        if candidate["authorId"] == user_v1["id"]:
            owned_collections.append(candidate)
    return owned_collections
124+
125+
126+
def get_clowder_v1_dataset_collections(headers, user_v1, dataset_id):
    """Return the Clowder v1 collections that contain ``dataset_id``.

    Lists collections through ``/api/collections/allCollections`` and, for
    each one, fetches its datasets to test membership.

    Args:
        headers: HTTP headers sent with every v1 request.
        user_v1: Not used by this function; kept so existing callers that
            pass it keep working.
        dataset_id: Id of the v1 dataset to look for.

    Returns:
        A list of collection dicts as returned by v1. Each collection
        appears at most once, even if the dataset is listed repeatedly.
    """
    matching_collections = []
    # NOTE(review): get_clowder_v1_parent_collection_ids queries this same
    # endpoint with "?limit=0&showAll=true"; confirm that the default paging
    # used here cannot hide collections that contain the dataset.
    endpoint = f"{CLOWDER_V1}/api/collections/allCollections"
    response = requests.get(endpoint, headers=headers)
    for collection in response.json():
        collection_id = collection["id"]
        collection_dataset_endpoint = (
            f"{CLOWDER_V1}/api/collections/{collection_id}/datasets"
        )
        try:
            datasets = requests.get(
                collection_dataset_endpoint, headers=headers
            ).json()
            # any() stops at the first hit, so a collection is recorded once.
            if any(ds["id"] == dataset_id for ds in datasets):
                matching_collections.append(collection)
        except Exception as e:
            # Best effort: one unreadable collection must not abort the
            # whole migration, but report which collection was skipped.
            print(f"Exception while reading datasets of collection {collection_id}:", e)
    return matching_collections
147+
148+
149+
def get_clowder_v1_collection(collection_id, headers):
    """Fetch one collection from Clowder v1 and return its decoded JSON body."""
    return requests.get(
        f"{CLOWDER_V1}/api/collections/{collection_id}", headers=headers
    ).json()
153+
154+
155+
def get_clowder_v1_collections(collection_ids, headers):
    """Fetch several Clowder v1 collections, one request per id.

    Returns the decoded JSON bodies in the same order as ``collection_ids``.
    """
    return [
        requests.get(
            f"{CLOWDER_V1}/api/collections/{one_id}", headers=headers
        ).json()
        for one_id in collection_ids
    ]
162+
163+
164+
def get_clowder_v1_collection_self_and_ancestors(
    collection_id, self_and_ancestors, headers
):
    """Collect the ids of a v1 collection and all of its ancestors.

    Each visited collection is fetched from ``/api/collections/<id>``; its
    ``parent_collection_ids`` field is read as a string of the form
    ``"List(id1, id2)"`` and every parent is visited in turn.

    Args:
        collection_id: Id of the collection to start from.
        self_and_ancestors: List that receives the ids. It is extended in
            place (ids already present are not duplicated) and returned.
        headers: HTTP headers sent with every v1 request.

    Returns:
        The same ``self_and_ancestors`` list, holding the starting id, its
        direct parents, then the more distant ancestors.
    """
    # Ids whose parents were already expanded. Guards against endless
    # recursion on a cyclic parent graph and against re-fetching a shared
    # ancestor once per path that leads to it.
    visited = set()

    def _visit(current_id):
        if current_id in visited:
            return
        visited.add(current_id)

        endpoint = f"{CLOWDER_V1}/api/collections/{current_id}"
        collection = requests.get(endpoint, headers=headers).json()
        if collection["id"] not in self_and_ancestors:
            self_and_ancestors.append(collection["id"])

        # Remove the "List(" ... ")" wrapper as a prefix/suffix.
        # str.lstrip("List(") would strip a *set of characters* instead.
        parents_entry = collection["parent_collection_ids"].strip()
        if parents_entry.startswith("List("):
            parents_entry = parents_entry[len("List("):]
        if parents_entry.endswith(")"):
            parents_entry = parents_entry[:-1]
        parent_ids = [
            part.strip() for part in parents_entry.split(",") if part.strip()
        ]

        # Record all direct parents first, then walk further up, so closer
        # ancestors come before more distant ones.
        for parent_id in parent_ids:
            if parent_id not in self_and_ancestors:
                self_and_ancestors.append(parent_id)
        for parent_id in parent_ids:
            _visit(parent_id)

    _visit(collection_id)
    return self_and_ancestors
196+
197+
198+
def get_clowder_v1_parent_collection_ids(current_collection_id, headers):
    """Return the ids of v1 collections that list the given one as a child.

    Fetches all collections (``limit=0&showAll=true``) and inspects each
    collection's ``child_collection_ids`` field, read as a string of the
    form ``"List(id1, id2)"``.

    Args:
        current_collection_id: Id of the collection whose parents are wanted.
        headers: HTTP headers sent with the v1 request.

    Returns:
        A list of parent collection ids, each at most once.
    """
    parents = []
    all_collections_v1_endpoint = (
        f"{CLOWDER_V1}/api/collections/allCollections?limit=0&showAll=true"
    )
    response = requests.get(all_collections_v1_endpoint, headers=headers)
    for collection in response.json():
        # Remove the "List(" ... ")" wrapper as a prefix/suffix.
        # str.lstrip("List(") would strip a *set of characters* instead.
        children_entry = collection["child_collection_ids"].strip()
        if children_entry.startswith("List("):
            children_entry = children_entry[len("List("):]
        if children_entry.endswith(")"):
            children_entry = children_entry[:-1]
        # Entries are separated by ", "; without stripping, every id after
        # the first keeps a leading space and can never compare equal.
        child_ids = {part.strip() for part in children_entry.split(",")}
        if current_collection_id in child_ids:
            parents.append(collection["id"])
    return parents
214+
215+
120216
def create_local_user(user_v1):
121217
"""Create a local user in Clowder v2 if they don't already exist, and generate an API key."""
122218
# Search for the user by email
@@ -169,10 +265,74 @@ def create_admin_user():
169265
return generate_user_api_key(admin_user, admin_user["password"])
170266

171267

268+
def add_dataset_license(v1_license, headers):
    """Create appropriate license (standard/custom) based on v1 license details.

    Args:
        v1_license: License dict from the v1 dataset. ``license_type`` picks
            the branch: ``"license2"`` is mapped to a Creative Commons id
            from the ``ccAllowCommercial`` / ``ccAllowDerivative`` /
            ``ccRequireShareAlike`` flags, ``"license3"`` to the public
            domain id, and anything else is posted to v2 as a custom license
            built from ``license_text``, ``license_url`` and ``holders``.
        headers: HTTP headers for the v2 request (custom licenses only).

    Returns:
        The v2 license id: a standard id string, or the ``id`` returned by
        the v2 ``/api/v2/licenses`` endpoint for a custom license.
    """
    license_type = v1_license["license_type"]

    # standard licenses
    if license_type == "license2":
        # Build the id from its parts so that every flag combination yields
        # a well-formed "CC BY..." id. Share-alike only applies when
        # derivatives are allowed, so it is ignored otherwise.
        license_id = "CC BY"
        if not v1_license["ccAllowCommercial"]:
            license_id += "-NC"
        if not v1_license["ccAllowDerivative"]:
            license_id += "-ND"
        elif v1_license["ccRequireShareAlike"]:
            license_id += "-SA"
        return license_id

    if license_type == "license3":
        # NOTE(review): spelled with the letter "O", not a zero; confirm it
        # matches the standard license id Clowder v2 expects.
        return "CCO Public Domain Dedication"

    # custom license
    license_body = {
        "name": v1_license["license_text"],
        "url": v1_license["license_url"],
        "holders": v1_license["holders"],
    }
    if not license_body["url"]:
        # No URL recorded in v1 (empty or missing value): use a generic one.
        license_body["url"] = "https://dbpedia.org/page/All_rights_reserved"
    license_v2_endpoint = f"{CLOWDER_V2}/api/v2/licenses?"
    response = requests.post(
        license_v2_endpoint, headers=headers, json=license_body
    )
    created_license = response.json()
    print(created_license)
    return created_license["id"]
327+
328+
172329
def create_v2_dataset(dataset, headers):
173330
"""Create a dataset in Clowder v2."""
174331
# TODO: GET correct license
175-
dataset_in_v2_endpoint = f"{CLOWDER_V2}/api/v2/datasets?license_id=CC BY"
332+
print("Creating dataset license in Clowder v2.")
333+
v2_license_id = add_dataset_license(dataset["license"], headers)
334+
335+
dataset_in_v2_endpoint = f"{CLOWDER_V2}/api/v2/datasets?license_id={v2_license_id}"
176336
dataset_example = {
177337
"name": dataset["name"],
178338
"description": dataset["description"],
@@ -439,6 +599,101 @@ def register_migration_extractor():
439599
)
440600

441601

602+
def add_children(collection_hierarchy_json, remaining_collections):
    """Attach the next level of collections to an existing hierarchy level.

    Args:
        collection_hierarchy_json: Entries (dicts with at least ``"id"``) of
            the hierarchy level built so far.
        remaining_collections: v1 collection dicts not placed yet. Each has
            ``"id"``, ``"name"`` and ``"parent_collection_ids"``; an entry
            counts as a parent when its id is contained in that field.

    Returns:
        A tuple ``(new_json, new_remaining_collections)``. ``new_json`` has
        one ``{"id", "name", "parents"}`` entry per collection with at least
        one parent in ``collection_hierarchy_json``;
        ``new_remaining_collections`` holds those that matched none.
    """
    new_json = []
    new_remaining_collections = []
    for collection in remaining_collections:
        collection_parents = collection["parent_collection_ids"]
        current_collection_parents = [
            entry
            for entry in collection_hierarchy_json
            if entry["id"] in collection_parents
        ]
        if current_collection_parents:
            new_json.append(
                {
                    "id": collection["id"],
                    "name": collection["name"],
                    "parents": current_collection_parents,
                }
            )
        else:
            # No parent on this level; leave it for a later pass.
            new_remaining_collections.append(collection)
    return new_json, new_remaining_collections
622+
623+
624+
def build_collection_hierarchy(collection_id, headers):
    """Build the hierarchy entries for a v1 collection and its ancestors.

    Fetches the collection and all of its ancestors, seeds the hierarchy
    with the root collections (those with an empty ``parent_collection_ids``)
    and then calls ``add_children`` repeatedly to attach each next level.

    Args:
        collection_id: Id of the v1 collection to start from.
        headers: HTTP headers sent with every v1 request.

    Returns:
        The list of ``{"name", "id", "parents"}`` entries of the last level
        that could be built; each entry nests its parents' entries.
    """
    self_and_ancestors = get_clowder_v1_collection_self_and_ancestors(
        collection_id=collection_id, self_and_ancestors=[], headers=headers
    )
    # Use the caller's headers rather than a module-level global so that
    # both lookups are made with the same credentials.
    self_and_ancestors_collections = get_clowder_v1_collections(
        self_and_ancestors, headers=headers
    )
    children = []
    remaining_collections = []
    for col in self_and_ancestors_collections:
        # Remove the "List(" ... ")" wrapper as a prefix/suffix.
        # str.lstrip("List(") would strip a *set of characters* instead.
        parent_collection_ids = col["parent_collection_ids"].strip()
        if parent_collection_ids.startswith("List("):
            parent_collection_ids = parent_collection_ids[len("List("):]
        if parent_collection_ids.endswith(")"):
            parent_collection_ids = parent_collection_ids[:-1]
        if parent_collection_ids.strip() == "":
            root_col_entry = {"name": col["name"], "id": col["id"], "parents": []}
            children.append(root_col_entry)
        else:
            remaining_collections.append(col)

    while remaining_collections:
        next_level, still_remaining = add_children(children, remaining_collections)
        if not next_level:
            # Nothing could be attached (cyclic or unreachable parents):
            # stop instead of looping forever on the same input.
            print(
                f"Could not place {len(still_remaining)} collection(s) in the hierarchy of collection {collection_id}"
            )
            break
        children, remaining_collections = next_level, still_remaining
    return children
649+
650+
651+
def build_collection_metadata_for_v1_dataset(dataset_id, user_v1, headers):
    """Return the v1 collections that contain the given dataset.

    Delegates directly to ``get_clowder_v1_dataset_collections``.
    """
    return get_clowder_v1_dataset_collections(
        headers=headers, user_v1=user_v1, dataset_id=dataset_id
    )
656+
657+
658+
def build_collection_space_metadata_for_v1_dataset(dataset, user_v1, headers):
    """Gather the v1 spaces and collection hierarchies of a dataset.

    Args:
        dataset: v1 dataset dict; ``"id"`` and ``"spaces"`` (an iterable of
            space ids) are read.
        user_v1: v1 user dict, passed through to the collection lookup.
        headers: HTTP headers sent with every v1 request.

    Returns:
        ``{"spaces": [...], "collections": [...]}``. Each space entry holds
        ``"id"``, ``"name"`` and, when the v1 response has it, ``"creator"``.
        The collection entries are the concatenated results of
        ``build_collection_hierarchy`` for every collection that contains
        the dataset.
    """
    dataset_id = dataset["id"]
    dataset_collections = get_clowder_v1_dataset_collections(
        headers=headers, user_v1=user_v1, dataset_id=dataset_id
    )

    space_entries = []
    for space_id in dataset["spaces"]:
        space_endpoint = f"{CLOWDER_V1}/api/spaces/{space_id}"
        space = requests.get(space_endpoint, headers=headers).json()
        try:
            space_entry = {"id": space["id"], "name": space["name"]}
            # "creator" is optional: include it only when v1 returned it.
            if "creator" in space:
                space_entry["creator"] = space["creator"]
        except (KeyError, TypeError) as e:
            # Response is not a usable space object; skip this space but
            # keep migrating the others.
            print(f"Error in getting space entry for space {space_id}")
            print(e)
            continue
        space_entries.append(space_entry)

    collection_data = []
    for collection in dataset_collections:
        collection_data.extend(
            build_collection_hierarchy(
                collection_id=collection["id"], headers=headers
            )
        )

    metadata = {"spaces": space_entries, "collections": collection_data}
    print(f"Got space and collection metadata from dataset {dataset_id}")
    return metadata
695+
696+
442697
def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
443698
"""Process user resources from Clowder v1 to Clowder v2."""
444699
user_v1_datasets = get_clowder_v1_user_datasets(user_id=user_v1["id"])
@@ -476,6 +731,35 @@ def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
476731
)
477732
if file_v2_id is not None:
478733
add_file_metadata(file, file_v2_id, clowder_headers_v1, user_headers_v2)
734+
# posting the collection hierarchy as metadata
735+
collection_space_metadata_dict = build_collection_space_metadata_for_v1_dataset(
736+
dataset=dataset, user_v1=user_v1, headers=clowder_headers_v1
737+
)
738+
migration_extractor_collection_metadata = {
739+
"listener": {
740+
"name": "migration",
741+
"version": "1",
742+
"description": "Migration of metadata from Clowder v1 to Clowder v2",
743+
},
744+
"context_url": "https://clowder.ncsa.illinois.edu/contexts/metadata.jsonld",
745+
"content": collection_space_metadata_dict,
746+
"contents": collection_space_metadata_dict,
747+
}
748+
v2_metadata_endpoint = f"{CLOWDER_V2}/api/v2/datasets/{dataset_v2_id}/metadata"
749+
response = requests.post(
750+
v2_metadata_endpoint,
751+
json=migration_extractor_collection_metadata,
752+
headers=clowder_headers_v2,
753+
)
754+
if response.status_code == 200:
755+
print("Successfully added collection info as metadata in v2.")
756+
else:
757+
print(
758+
f"Failed to add collection info as metadata in Clowder v2. Status code: {response.status_code}"
759+
)
760+
761+
if file_v2_id is not None:
762+
add_file_metadata(file, file_v2_id, clowder_headers_v1, user_headers_v2)
479763

480764
return [USER_MAP, DATASET_MAP]
481765

0 commit comments

Comments
 (0)