
Commit c45d8f0

worked for one collection with datasets and folders
1 parent c76e75e

1 file changed: scripts/migration/migrate.py (47 additions, 30 deletions)
@@ -197,7 +197,7 @@ def process_collection_descendants(collection, headers_v1, base_headers_v2, headers_v2, v2_parent_id, v2_parent_type, v2_dataset_id):
     child_col_json = child_col_response.json()
     dataset_json = dataset_response.json()

-    print(f"Got child collections and datasets")
+    # the below handles creating folders for child collections
     for child in child_col_json:
         if v2_parent_type == "dataset":
             print(f"Add folder to the dataset")
@@ -211,20 +211,14 @@ def process_collection_descendants(collection, headers_v1, base_headers_v2, headers_v2, v2_parent_id, v2_parent_type, v2_dataset_id):
             new_folder = create_folder_if_not_exists_or_get(folder_name, v2_parent_id, v2_dataset_id, headers_v2)
             process_collection_descendants(child, headers_v1, base_headers_v2, headers_v2, new_folder['id'], 'folder', v2_dataset_id)

+    # this handles uploading the datasets of the collection as folders
     for dataset in dataset_json:
         if v2_parent_type == "dataset":
-            print(f"Parent is a dataset")
-            new_folder = create_folder_if_not_exists_or_get(dataset["name"], v2_parent_id, v2_dataset_id, headers_v2)
-            print(f"Now we need to add the sub folders of this dataset")
-            # TODO get DATASET FOLDERS HERE FROM v1
-            process_dataset_folders(dataset, headers_v1, headers_v2, new_folder['id'], v2_dataset_id)
-            process_dataset_files(dataset, headers_v1, base_headers_v2, 'folder', new_folder['id'], v2_dataset_id)
+            new_folder = create_folder_if_not_exists_or_get(dataset["name"], v2_parent_id, v2_parent_type, v2_dataset_id, headers_v2)
+            process_dataset_files_and_folders(dataset, headers_v1, base_headers_v2, 'folder', new_folder['id'], v2_dataset_id, new_folder)
         else:
-            print(f"Parent is a folder")
-            new_folder = create_folder_if_not_exists_or_get(dataset["name"], v2_parent_id, v2_dataset_id, headers_v2)
-            # TODO GET DATASET FOLDERS HERE FROM v1
-            process_dataset_folders(dataset, headers_v1, headers_v2, new_folder['id'], v2_dataset_id)
-            process_dataset_files(dataset, headers_v1, base_headers_v2, 'folder', new_folder['id'], v2_dataset_id)
+            new_folder = create_folder_if_not_exists_or_get(dataset["name"], v2_parent_id, v2_parent_type, v2_dataset_id, headers_v2)
+            process_dataset_files_and_folders(dataset, headers_v1, base_headers_v2, 'folder', new_folder['id'], v2_dataset_id, new_folder)

@@ -240,7 +234,8 @@ def get_v1_dataset_folders(dataset, headers_v1, headers_v2, parent_type, parent_id):
     folder_json = folder_response.json()
     return folder_json

-def process_dataset_files(dataset, headers_v1, headers_v2, parent_type, parent_id, dataset_v2_id):
+# processes a dataset: adds its folders and uploads its files
+def process_dataset_files_and_folders(dataset, headers_v1, headers_v2, parent_type, parent_id, dataset_v2_id, dataset_v2_folder):
     dataset_v1_folders = get_v1_dataset_folders(dataset, headers_v1, headers_v2, parent_type, parent_id)

     for folder_v1 in dataset_v1_folders:
@@ -255,24 +250,19 @@ def process_dataset_files(dataset, headers_v1, headers_v2, parent_type, parent_id, dataset_v2_id):
     files_endpoint = f"{CLOWDER_V1}/api/datasets/{dataset['id']}/files"
     files_response = requests.get(files_endpoint, headers=headers_v1)
     files_json = files_response.json()
-    # TODO WORK HERE
+    # go through files and upload them to the correct folder if they have one
    for file in files_json:
         if 'folders' in file:
-            print(f"This file is in a folder")
-            current_file_folder_name = file['folders']['name']
-            matching_folder = None
             for folder_v2 in all_v2_dataset_folders:
                 if folder_v2['name'] == file['folders']['name']:
                     print(f"Upload this file to a folder")
                     matching_folder = folder_v2
                     download_and_upload_file_to_matching_folder(file, dataset_v2_id, base_headers_v2, matching_folder)
         else:
-            print(f"This file is not in a folder")
-            # TODO upload it to the folder
             if parent_type == "dataset":
                 print(f"Upload to a dataset")
             if parent_type == "folder":
-                print(f"Upload to a folder")
+                download_and_upload_file_to_matching_folder(file, dataset_v2_id, base_headers_v2, dataset_v2_folder)
     print(f"Got dataset files")

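The loop above pairs each v1 file with a v2 folder by name, and files without a 'folders' entry now fall through to the dataset's own migration folder. A sketch of that lookup, assuming, as the script appears to, that a v1 file carries at most one folder under file['folders'] and that folder names are unique within a dataset (find_target_folder is a hypothetical helper):

def find_target_folder(file, v2_folders, dataset_folder):
    """Pick the v2 folder a migrated file should land in."""
    v1_folder = file.get("folders")  # absent for files at the dataset's top level
    if v1_folder:
        for folder_v2 in v2_folders:
            if folder_v2["name"] == v1_folder["name"]:
                return folder_v2  # name match wins
        return None  # no v2 counterpart was created
    return dataset_folder  # top-level files go to the dataset's folder

file_in_folder = {"name": "a.csv", "folders": {"name": "raw"}}
v2_folders = [{"id": "f1", "name": "raw"}, {"id": "f2", "name": "processed"}]
print(find_target_folder(file_in_folder, v2_folders, {"id": "f0"})["id"])  # f1
print(find_target_folder({"name": "b.csv"}, v2_folders, {"id": "f0"})["id"])  # f0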
@@ -295,7 +285,10 @@ def create_v2_dataset_from_collection(collection, user_v1, headers_v1, headers_v2, base_headers_v2):
     new_dataset_json = response.json()
     v2_dataset_id = new_dataset_json["id"]

-    process_collection_descendants(collection, headers_v1, base_headers_v2, headers_v2, new_dataset_json["id"], "dataset", v2_dataset_id)
+    process_collection_descendants(collection=collection, headers_v1=headers_v1,
+                                   base_headers_v2=base_headers_v2, headers_v2=headers_v2,
+                                   v2_parent_id=new_dataset_json["id"],
+                                   v2_parent_type="dataset", v2_dataset_id=v2_dataset_id)

     return response.json()["id"]

@@ -560,7 +553,7 @@ def add_folder_hierarchy_to_migration_folder(folder_hierarchy, dataset_v2, folder_id_v2, headers):
     current_parent = folder_id_v2
     for part in hierarchy_parts:
         result = create_folder_if_not_exists_or_get(
-            part, current_parent, dataset_v2, headers
+            part, current_parent, 'folder', dataset_v2, headers
         )
         if result:
             current_parent = result["id"]
@@ -579,13 +572,16 @@ def add_folder_hierarchy(folder_hierarchy, dataset_v2, headers):
        current_parent = result["id"]


-def create_folder_if_not_exists_or_get(folder, parent, dataset_v2, headers):
+def create_folder_if_not_exists_or_get(folder, parent, parent_type, dataset_v2, headers):
     """Create a folder if it does not exist or return the existing folder."""
     # current_folders = get_folder_and_subfolders(dataset_v2, headers)
     current_all_folders = get_all_folder_and_subfolders(dataset_v2, headers)
-    folder_data = (
-        {"name": folder, "parent_folder": parent} if parent else {"name": folder}
-    )
+    if parent_type == 'folder':
+        folder_data = (
+            {"name": folder, "parent_folder": parent} if parent else {"name": folder}
+        )
+    else:
+        folder_data = {"name": folder}

     for existing_folder in current_all_folders:
         if existing_folder["name"] == folder:
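The new parent_type argument encodes a payload rule: only a folder nested under another folder carries a parent_folder field, while a folder created directly under the dataset omits it. A minimal sketch of that rule, assuming this is how the v2 folder endpoint used here distinguishes the two cases (build_folder_payload is hypothetical):

def build_folder_payload(name, parent, parent_type):
    """Build the folder-creation body, attaching a parent only for nested folders."""
    if parent_type == 'folder' and parent:
        return {"name": name, "parent_folder": parent}
    return {"name": name}  # dataset-level folder: no parent_folder field

print(build_folder_payload("raw", "abc123", "folder"))   # {'name': 'raw', 'parent_folder': 'abc123'}
print(build_folder_payload("raw", "def456", "dataset"))  # {'name': 'raw'}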
@@ -838,6 +834,7 @@ def download_and_upload_file_to_matching_folder(file, dataset_v2_id, headers_v2,
     )

     # Clean up the local file after upload
+    print(f"Type response {type(response)}")
     try:
         os.remove(filename)
     except Exception as e:
@@ -846,7 +843,11 @@ def download_and_upload_file_to_matching_folder(file, dataset_v2_id, headers_v2,

     if response.status_code == 200:
         print(f"Uploaded file: {filename} to dataset {dataset_v2_id}")
-        return response.json().get("id")
+        response_json = response.json()
+        if type(response_json) == dict:
+            return response.json().get("id")
+        elif type(response_json) == list:
+            return response_json[0].get("id")
     else:
         print(f"Failed to upload file: {filename} to dataset {dataset_v2_id}")

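The debug print added above suggests the upload endpoint sometimes returns a single JSON object and sometimes a one-element list, and the new branch handles both shapes. A defensive version of that extraction (extract_file_id is a hypothetical helper); isinstance is the more idiomatic test than comparing type(x) == dict, and it also tolerates an empty list:

def extract_file_id(response_json):
    """Pull the uploaded file id out of either response shape."""
    if isinstance(response_json, dict):
        return response_json.get("id")
    if isinstance(response_json, list) and response_json:
        return response_json[0].get("id")
    return None  # unexpected shape or empty list

print(extract_file_id({"id": "f1"}))    # f1
print(extract_file_id([{"id": "f2"}]))  # f2
print(extract_file_id([]))              # None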
@@ -1249,9 +1250,25 @@ def process_user_and_resources_collections(user_v1, USER_MAP, DATASET_MAP, COLLE

     print(f"Got {len(user_v1_collections)} user collections in the top level")

-    for top_level_col in user_v1_collections:
-        dataset_v2 = create_v2_dataset_from_collection(top_level_col, user_v1, clowder_headers_v1, user_headers_v2, base_headers_v2)
-        print('did this')
+    # filter the collections by space
+    migrate_top_level_collections = []
+    for col in user_v1_collections:
+        collection_spaces = col["spaces"]
+        collection_spaces = collection_spaces.lstrip('List(')
+        collection_spaces = collection_spaces.rstrip(')')
+        collection_spaces = collection_spaces.split(',')
+        for space in collection_spaces:
+            if space in toml_space_ids:
+                migrate_top_level_collections.append(col)
+                break
+
+    # create datasets from the top level collections
+    for top_level_col in migrate_top_level_collections:
+        dataset_v2 = create_v2_dataset_from_collection(collection=top_level_col, user_v1=user_v1,
+                                                       headers_v1=clowder_headers_v1, headers_v2=user_headers_v2,
+                                                       base_headers_v2=base_headers_v2)
+        print(f"Created dataset in v2 from collection: {top_level_col['id']} - {top_level_col['name']}")
+        COLLETIONS_MAP[top_level_col["id"]] = dataset_v2

     for dataset in user_v1_datasets:
         print(f"Creating dataset in v2: {dataset['id']} - {dataset['name']}")
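One caveat in the space filter above: str.lstrip('List(') strips any leading run of those characters rather than the literal prefix (an id beginning with 'L', 'i', 's', 't', or '(' would be eaten), and .split(',') leaves a leading space on every id after the first when the field is serialized as 'List(a, b)', which would then fail the membership test against toml_space_ids. A parsing sketch that avoids both, assuming Python 3.9+ for removeprefix/removesuffix:

def parse_spaces(spaces: str) -> list:
    """Turn a 'List(id1, id2)'-serialized spaces field into a clean list of ids."""
    inner = spaces.removeprefix("List(").removesuffix(")")
    return [s.strip() for s in inner.split(",") if s.strip()]

print(parse_spaces("List(space1, space2)"))  # ['space1', 'space2']
print(parse_spaces("List()"))                # []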
