
Commit b600cc9

new classes (to be consolidated later)
To make it easier to map datasets to collections and their parent hierarchy, we now generate a JSON file that helps with the lookup. This will be run BEFORE the migration script.
1 parent 21629c7 commit b600cc9
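
For reference, the generated collections_datasets.json maps each v1 collection id to the list of dataset ids it contains, and the lookup helper in this commit inverts that into a dataset-to-collection map. A minimal sketch of the expected shape (the ids are hypothetical placeholders):

    # collections_datasets.json, roughly:
    # {"collectionA": ["dataset1", "dataset2"], "collectionB": ["dataset3"]}
    data = {"collectionA": ["dataset1", "dataset2"], "collectionB": ["dataset3"]}
    dataset_to_collection = {
        ds: coll for coll, datasets in data.items() for ds in datasets
    }
    assert dataset_to_collection["dataset2"] == "collectionA"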

File tree

4 files changed: +275 additions, -23 deletions
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+import os
+from datetime import datetime
+import json
+import requests
+from dotenv import dotenv_values
+
+try:
+    import tomllib  # Python 3.11+
+except ImportError:
+    import tomli as tomllib
+
+
+DEFAULT_PASSWORD = "Password123&"
+
+# Get the current timestamp
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+COLLECTIONS_FILE = "collections_datasets.json"
+
+def get_dataset_collections_map():
+    print("Getting collections and datasets from Clowder v1...")
+
+    with open(COLLECTIONS_FILE, "r") as jf:
+        data = json.load(jf)
+    print(f"Loaded {len(data)} collections from {COLLECTIONS_FILE}")
+    dataset_to_collection = {}
+
+    # Invert collection -> [dataset ids] into dataset id -> collection id
+    for collection, datasets in data.items():
+        for dataset in datasets:
+            dataset_to_collection[dataset] = collection
+    return dataset_to_collection
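
A hedged usage sketch for the helper above, assuming collections_datasets.json is already present in the working directory (the dataset id below is a hypothetical placeholder):

    mapping = get_dataset_collections_map()
    # Which collection does this dataset belong to?
    dataset_id = "hypothetical-dataset-id"
    print(mapping.get(dataset_id, "not found in any collection"))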
Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
+import os
+from datetime import datetime
+
+import requests
+from dotenv import dotenv_values
+
+try:
+    import tomllib  # Python 3.11+
+except ImportError:
+    import tomli as tomllib
+
+
+from scripts.migration.migrate_metadata_definitions import (
+    check_metadata_definition_exists,
+    get_clowder_v1_metadata_definitions,
+    post_metadata_definition,
+)
+
+# Configuration and Constants
+DEFAULT_PASSWORD = "Password123&"
+
+# Get the current timestamp
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+OUTPUT_FILE = "collections_ids.txt"
+
+# Load environment variables
+path_to_env = os.path.join(os.getcwd(), "scripts", "migration", ".env")
+config = dotenv_values(dotenv_path=path_to_env)
+
+CLOWDER_V1 = config["CLOWDER_V1"]
+ADMIN_KEY_V1 = config["ADMIN_KEY_V1"]
+CLOWDER_V2 = config["CLOWDER_V2"]
+ADMIN_KEY_V2 = config["ADMIN_KEY_V2"]
+
+if not CLOWDER_V1 or not ADMIN_KEY_V1 or not CLOWDER_V2 or not ADMIN_KEY_V2:
+    print("MISSING SOME ENVIRONMENT VARIABLES")
+else:
+    print("WE HAVE THEM ALL")
+
+base_headers_v1 = {"X-API-key": ADMIN_KEY_V1}
+base_headers_v2 = {"X-API-key": ADMIN_KEY_V2}
+
+clowder_headers_v1 = {
+    **base_headers_v1,
+    "Content-type": "application/json",
+    "accept": "application/json",
+}
+
+clowder_headers_v2 = {
+    **base_headers_v2,
+    "Content-type": "application/json",
+    "accept": "application/json",
+}
+
+admin_user = {
+    "email": "[email protected]",
+    "password": "admin",
+    "first_name": "admin",
+    "last_name": "admin",
+}
+
+def get_clowder_v1_top_level_collections(headers):
+    endpoint = f"{CLOWDER_V1}/api/collections/topLevelCollections?superAdmin=true"
+    response = requests.get(endpoint, headers=headers)
+    user_collections = response.json()
+    return user_collections
+
+def get_collection_v1_descendants(headers, collection_id):
+    descendant_ids = []
+
+    collection_endpoint = f"{CLOWDER_V1}/api/collections/{collection_id}"
+    response = requests.get(collection_endpoint, headers=headers, verify=False)
+    collection_json = response.json()
+    print(collection_json["child_collection_ids"])
+    if int(collection_json["childCollectionsCount"]) > 0:
+        # child_collection_ids is a "List(id1, id2)"-style string:
+        # drop the "List(" prefix and ")" suffix, then split on ", "
+        child_collections_ids = collection_json["child_collection_ids"]
+        descendant_ids = child_collections_ids[5:-1].split(', ')
+        for i in range(0, len(descendant_ids)):
+            id = descendant_ids[i]
+            descendent_endpoint = f"{CLOWDER_V1}/api/collections/{id}"
+            descendent_response = requests.get(descendent_endpoint, headers=headers, verify=False)
+            descendent_json = descendent_response.json()
+            if int(descendent_json["childCollectionsCount"]) > 0:
+                sub_descendants = get_collection_v1_descendants(headers, id)
+                descendant_ids.extend(sub_descendants)
+    return descendant_ids
+
+def get_dataset_ids_in_v1_collection(headers, collection_id):
+    dataset_ids = []
+    collection_endpoint = f"{CLOWDER_V1}/api/collections/{collection_id}/datasets"
+    response = requests.get(collection_endpoint, headers=headers, verify=False)
+    datasets_json = response.json()
+    for dataset in datasets_json:
+        dataset_ids.append(dataset["id"])
+    return dataset_ids
+
+if __name__ == "__main__":
+    top_level_collections = get_clowder_v1_top_level_collections(clowder_headers_v1)
+    all_v1_collections = []
+    for collection in top_level_collections:
+        print(f"Getting descendants for collection {collection['name']} ({collection['id']})")
+        all_v1_collections.append(collection["id"])
+        if int(collection["childCollectionsCount"]) > 0:
+            descendant_ids = get_collection_v1_descendants(clowder_headers_v1, collection["id"])
+            all_v1_collections.extend(descendant_ids)
+            print(f"Added descendants for collection {collection['name']} ({collection['id']})")
+
+    print(f"TOTAL V1 COLLECTIONS TO MIGRATE: {len(all_v1_collections)}")
+
+    with open(OUTPUT_FILE, "w") as outfile:
+        for v1_collection in all_v1_collections:
+            outfile.write(v1_collection + "\n")
+    print(f"Done. Collection ids written to {OUTPUT_FILE}")
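
One detail worth flagging in get_collection_v1_descendants above: child_collection_ids appears to come back from the v1 API as a "List(...)"-style string rather than a JSON array, which is why the code slices off the first five characters ("List(") and the trailing ")" before splitting. A small illustration under that assumption (the ids are hypothetical):

    raw = "List(68a34b28e4b0cc7386c091a4, 68a34b28e4b0cc7386c091a5)"
    ids = raw[5:-1].split(', ')
    print(ids)  # ['68a34b28e4b0cc7386c091a4', '68a34b28e4b0cc7386c091a5']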
Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
+import os
+from datetime import datetime
+import json
+import requests
+from dotenv import dotenv_values
+
+try:
+    import tomllib  # Python 3.11+
+except ImportError:
+    import tomli as tomllib
+
+
+DEFAULT_PASSWORD = "Password123&"
+
+# Get the current timestamp
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+COLLECTIONS_FILE = "collections_ids.txt"
+
+# Load environment variables
+path_to_env = os.path.join(os.getcwd(), "scripts", "migration", ".env")
+config = dotenv_values(dotenv_path=path_to_env)
+
+CLOWDER_V1 = config["CLOWDER_V1"]
+ADMIN_KEY_V1 = config["ADMIN_KEY_V1"]
+CLOWDER_V2 = config["CLOWDER_V2"]
+ADMIN_KEY_V2 = config["ADMIN_KEY_V2"]
+
+if not CLOWDER_V1 or not ADMIN_KEY_V1 or not CLOWDER_V2 or not ADMIN_KEY_V2:
+    print("MISSING SOME ENVIRONMENT VARIABLES")
+else:
+    print("WE HAVE THEM ALL")
+
+base_headers_v1 = {"X-API-key": ADMIN_KEY_V1}
+base_headers_v2 = {"X-API-key": ADMIN_KEY_V2}
+
+clowder_headers_v1 = {
+    **base_headers_v1,
+    "Content-type": "application/json",
+    "accept": "application/json",
+}
+
+clowder_headers_v2 = {
+    **base_headers_v2,
+    "Content-type": "application/json",
+    "accept": "application/json",
+}
+
+admin_user = {
+    "email": "[email protected]",
+    "password": "admin",
+    "first_name": "admin",
+    "last_name": "admin",
+}
+
+def get_collections_datasets(headers, collection_id):
+    collection_dataset_endpoint = (
+        f"{CLOWDER_V1}/api/collections/{collection_id}/datasets?superAdmin=true"
+    )
+    collection_dataset_response = requests.get(
+        collection_dataset_endpoint, headers=headers
+    )
+    collection_dataset_json = collection_dataset_response.json()
+    return collection_dataset_json
+
+
+if __name__ == "__main__":
+    print("Getting collections and datasets from Clowder v1...")
+
+    collection_ids = []
+    if os.path.exists(COLLECTIONS_FILE):
+        print(f"{COLLECTIONS_FILE} exists")
+    else:
+        print(f"{COLLECTIONS_FILE} does not exist")
+
+    with open(COLLECTIONS_FILE, "r") as infile:
+        for line in infile.readlines():
+            collection_ids.append(line.rstrip('\n'))
+    print(f"Found {len(collection_ids)} collections in {COLLECTIONS_FILE}")
+    collection_dataset_dict = dict()
+    for id in collection_ids:
+        print(f"Getting datasets for collection id {id}...")
+        datasets = get_collections_datasets(clowder_headers_v1, id)
+        if len(datasets) > 0:
+            dataset_ids = []
+            for ds in datasets:
+                dataset_ids.append(ds["id"])
+            collection_dataset_dict[id] = dataset_ids
+
+    json_file = "collections_datasets.json"
+    with open(json_file, "w") as jf:
+        json.dump(collection_dataset_dict, jf)
+    print(f"Dumped collection-to-dataset map to {json_file}")
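
Taken together, the intended run order appears to be: dump collection ids to collections_ids.txt (previous file), build collections_datasets.json from them (this file), and only then run migrate.py. A quick sanity check of the generated JSON, assuming the file names used above:

    import json

    with open("collections_datasets.json", "r") as jf:
        mapping = json.load(jf)
    print(f"{len(mapping)} collections have at least one dataset")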

scripts/migration/migrate.py

Lines changed: 31 additions & 23 deletions
@@ -156,10 +156,13 @@ def get_collection_v1_descendants(headers, collection_id):
         child_collections_ids = collection_json["child_collection_ids"]
         descendant_ids = child_collections_ids[5:-1].split(', ')
         for id in descendant_ids:
-            sub_descendants = get_collection_v1_descendants(headers, id)
-            descendant_ids.extend(sub_descendants)
+            descendent_endpoint = f"{CLOWDER_V1}/api/collections/{id}"
+            descendent_response = requests.get(descendent_endpoint, headers=headers, verify=False)
+            descendent_json = descendent_response.json()
+            if int(descendent_json["childCollectionsCount"]) > 0:
+                sub_descendants = get_collection_v1_descendants(headers, id)
+                descendant_ids.extend(sub_descendants)
     return descendant_ids
-    print('we got collection')
 
 def get_clowder_v1_user_collections(headers, user_v1):
     endpoint = f"{CLOWDER_V1}/api/collections"
@@ -169,23 +172,10 @@ def get_clowder_v1_user_collections(headers, user_v1):
 # TODO this is too slow, we need to optimize it
 def get_clowder_v1_dataset_collections(headers, user_v1, dataset_id):
     matching_collections = []
-    endpoint1 = f"{CLOWDER_V1}/api/collections/rootCollections?superAdmin=true"
-    # use this one below
-    endpint2 = f"{CLOWDER_V1}/api/collections/topLevelCollections?superAdmin=true"
-    response = requests.get(endpoint1, headers=headers)
-    response2 = requests.get(endpint2, headers=headers)
+    endpoint = f"{CLOWDER_V1}/api/collections/topLevelCollections?superAdmin=true"
+    response = requests.get(endpoint, headers=headers)
     user_collections = response.json()
-    user_collections_ids = []
-    user_collections_ids_2 = []
-    user_collections_2 = response2.json()
-    for collection in user_collections_2:
-        id = collection['id']
-        descendants = get_collection_v1_descendants(headers, id)
-        # test_descendants = get_collection_v1_descendants(headers, "68a34b28e4b0cc7386c091a4")
-        # TODO check here if the dataset is in a descendant
-        print('got descendants')
-    for collection in user_collections:
-        user_collections_ids.append(collection['id'])
+
     for collection in user_collections:
         collection_id = collection["id"]
         collection_dataset_endpoint = (
@@ -198,9 +188,24 @@ def get_clowder_v1_dataset_collections(headers, user_v1, dataset_id):
             datasets = dataset_response.json()
             for ds in datasets:
                 if ds["id"] == dataset_id:
-                    matching_collections.append(collection)
+                    if collection not in matching_collections:
+                        matching_collections.append(collection)
         except Exception as e:
             print("Exception", e)
+        if int(collection["childCollectionsCount"]) > 0:
+            collection_descendants = get_collection_v1_descendants(headers, collection_id)
+            for descendant in collection_descendants:
+                collection_dataset_endpoint = (
+                    f"{CLOWDER_V1}/api/collections/{descendant}/datasets?superAdmin=true"
+                )
+                collection_dataset_response = requests.get(
+                    collection_dataset_endpoint, headers=headers
+                )
+                collection_dataset_json = collection_dataset_response.json()
+                for ds in collection_dataset_json:
+                    if ds['id'] == dataset_id:
+                        if descendant not in matching_collections:
+                            matching_collections.append(descendant)
     return matching_collections
 

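One caveat in the hunk above: the top-level loop appends full collection objects to matching_collections, while the new descendant branch appends bare id strings, so callers receive a mixed list. If downstream code only needs ids, a hypothetical normalization could look like:

    def to_collection_id(entry):
        # Accepts either a v1 collection dict (assumed to carry an "id" key,
        # as elsewhere in these scripts) or a bare id string.
        return entry["id"] if isinstance(entry, dict) else entry

    matching_ids = [to_collection_id(c) for c in matching_collections]
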
@@ -826,9 +831,12 @@ def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
             if file_v2_id is not None:
                 add_file_metadata(file, file_v2_id, clowder_headers_v1, user_headers_v2)
             # posting the collection hierarchy as metadata
-            collection_space_metadata_dict = build_collection_space_metadata_for_v1_dataset(
-                dataset=dataset, user_v1=user_v1, headers=clowder_headers_v1
-            )
+            try:
+                collection_space_metadata_dict = build_collection_space_metadata_for_v1_dataset(
+                    dataset=dataset, user_v1=user_v1, headers=clowder_headers_v1
+                )
+            except Exception as e:
+                print(e)
             migration_extractor_collection_metadata = {
                 "listener": {
                     "name": "migration",

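A note on the new try/except: if build_collection_space_metadata_for_v1_dataset raises, collection_space_metadata_dict is never bound, and any later read of it would presumably fail with a NameError. A defensive variant (a sketch only, keeping the commit's names):

    collection_space_metadata_dict = {}  # safe default if the lookup fails
    try:
        collection_space_metadata_dict = build_collection_space_metadata_for_v1_dataset(
            dataset=dataset, user_v1=user_v1, headers=clowder_headers_v1
        )
    except Exception as e:
        print(e)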