@@ -188,7 +188,7 @@ def get_clowder_v1_user_collections_top_level(headers, user_v1):
         top_level_collections.append(col)
     return top_level_collections
 
-def process_collection_descendants(collection, headers_v1,headers_v2, v2_parent_id, v2_parent_type, v2_dataset_id):
+def process_collection_descendants(collection, headers_v1, base_headers_v2, headers_v2, v2_parent_id, v2_parent_type, v2_dataset_id):
     child_collections_endpoint = f"{CLOWDER_V1}/api/collections/{collection['id']}/getChildCollections"
     datasets_endpoint = f"{CLOWDER_V1}/api/collections/{collection['id']}/datasets"
 
@@ -203,13 +203,13 @@ def process_collection_descendants(collection, headers_v1,headers_v2, v2_parent_
             print(f"Add folder to the dataset")
             folder_name = child["name"]
             new_folder = create_folder_if_not_exists_or_get(folder_name, None, v2_dataset_id, headers_v2)
-            process_collection_descendants(child, headers_v1,headers_v2, new_folder['id'], 'folder', v2_dataset_id)
+            process_collection_descendants(child, headers_v1, base_headers_v2, headers_v2, new_folder['id'], 'folder', v2_dataset_id)
         else:
             print(f"parent was a folder")
             print(f"Add folder to the dataset")
             folder_name = child["name"]
             new_folder = create_folder_if_not_exists_or_get(folder_name, v2_parent_id, v2_dataset_id, headers_v2)
-            process_collection_descendants(child, headers_v1, headers_v2, new_folder['id'], 'folder', v2_dataset_id)
+            process_collection_descendants(child, headers_v1, base_headers_v2, headers_v2, new_folder['id'], 'folder', v2_dataset_id)
 
     for dataset in dataset_json:
         if v2_parent_type == "dataset":
@@ -218,13 +218,13 @@ def process_collection_descendants(collection, headers_v1,headers_v2, v2_parent_
             print(f"Now we need to add the sub folders of this dataset")
             # TODO get DATASET FOLDERS HERE FROM v1
             process_dataset_folders(dataset, headers_v1, headers_v2, new_folder['id'], v2_dataset_id)
-            process_dataset_files(dataset, headers_v1, headers_v2, 'folder', new_folder['id'], v2_dataset_id)
+            process_dataset_files(dataset, headers_v1, base_headers_v2, 'folder', new_folder['id'], v2_dataset_id)
         else:
             print(f"Parent is a folder")
             new_folder = create_folder_if_not_exists_or_get(dataset["name"], v2_parent_id, v2_dataset_id, headers_v2)
             # TODO GET DATASET FOLDERS HERE FROM v1
             process_dataset_folders(dataset, headers_v1, headers_v2, new_folder['id'], v2_dataset_id)
-            process_dataset_files(dataset, headers_v1, headers_v2, 'folder', new_folder['id'], v2_dataset_id)
+            process_dataset_files(dataset, headers_v1, base_headers_v2, 'folder', new_folder['id'], v2_dataset_id)
 
 
 
@@ -265,7 +265,7 @@ def process_dataset_files(dataset, headers_v1, headers_v2, parent_type, parent_i
                 if folder_v2['name'] == file['folders']['name']:
                     print(f"Upload this file to a folder")
                     matching_folder = folder_v2
-            download_and_upload_file_to_folder_id(file, matching_folder, dataset_v2_id, headers_v2)
+            download_and_upload_file_to_matching_folder(file, dataset_v2_id, base_headers_v2, matching_folder)
         else:
             print(f"This file is not in a folder")
             # TODO upload it to the folder
@@ -277,7 +277,7 @@ def process_dataset_files(dataset, headers_v1, headers_v2, parent_type, parent_i
 
 
 
-def create_v2_dataset_from_collection(collection, user_v1, headers_v1, headers_v2):
+def create_v2_dataset_from_collection(collection, user_v1, headers_v1, headers_v2, base_headers_v2):
     # create the dataset
     collection_name = collection["name"]
     collection_description = collection["description"]
@@ -295,7 +295,7 @@ def create_v2_dataset_from_collection(collection, user_v1, headers_v1, headers_v
     new_dataset_json = response.json()
     v2_dataset_id = new_dataset_json["id"]
 
-    process_collection_descendants(collection, headers_v1, headers_v2, new_dataset_json["id"], "dataset", v2_dataset_id)
+    process_collection_descendants(collection, headers_v1, base_headers_v2, headers_v2, new_dataset_json["id"], "dataset", v2_dataset_id)
 
     return response.json()["id"]
 
@@ -685,20 +685,26 @@ def download_and_upload_file_to_folder_id(file, folder_v2, dataset_v2_id, header
     file_exists = os.path.exists(filename)
     # Upload the file to Clowder v2
     dataset_file_upload_endpoint = f"{CLOWDER_V2}/api/v2/datasets/{dataset_v2_id}/files"
-    if folder_v2 is not None:
-        dataset_file_upload_endpoint += f"?folder_id={folder_v2['id']}"
-    response = requests.post(
-        dataset_file_upload_endpoint,
-        headers=headers_v2,
-        files={"file": open(filename, "rb")},
-    )
+    if folder_v2:
+        dataset_file_upload_endpoint += f"Multiple?folder_id={folder_v2['id']}"
+        response = requests.post(
+            dataset_file_upload_endpoint,
+            headers=headers_v2,
+            files=[("files", (filename, open(filename, "rb")))],
+        )
+    else:
+        response = requests.post(
+            dataset_file_upload_endpoint,
+            headers=headers_v2,
+            files={"file": open(filename, "rb")},
+        )
 
     # Clean up the local file after upload
-    # try:
-    #     os.remove(filename)
-    # except Exception as e:
-    #     print(f"Could not delete locally downloaded file: {filename}")
-    #     print(e)
+    try:
+        os.remove(filename)
+    except Exception as e:
+        print(f"Could not delete locally downloaded file: {filename}")
+        print(e)
 
     if response.status_code == 200:
         print(f"Uploaded file: {filename} to dataset {dataset_v2_id}")
@@ -796,6 +802,56 @@ def download_and_upload_file_to_folder_id(file, folder_v2, dataset_v2_id, header
 #     return None
 
 
+def download_and_upload_file_to_matching_folder(file, dataset_v2_id, headers_v2, matching_folder=None):
+    """Download a file from Clowder v1 and upload it to Clowder v2."""
+    filename = file["filename"]
+    file_id = file["id"]
+
+    # Download the file from Clowder v1
+    v1_download_url = f"{CLOWDER_V1}/api/files/{file_id}?superAdmin=true"
+    print(f"Downloading file: {filename}")
+    download_response = requests.get(v1_download_url, headers=clowder_headers_v1)
+
+    with open(filename, "wb") as f:
+        f.write(download_response.content)
+
+    # Upload the file to Clowder v2
+    dataset_file_upload_endpoint = f"{CLOWDER_V2}/api/v2/datasets/{dataset_v2_id}/files"
+    if matching_folder:
+        dataset_file_upload_endpoint += f"Multiple?folder_id={matching_folder['id']}"
+
+        # DEBUG: Print the exact request details
+        print(f"DEBUG: URL: {dataset_file_upload_endpoint}")
+        print(f"DEBUG: Headers: {headers_v2}")
+        print(f"DEBUG: Folder ID: {matching_folder['id']}")
+
+        response = requests.post(
+            dataset_file_upload_endpoint,
+            headers=headers_v2,
+            files=[("files", (filename, open(filename, "rb")))],
+        )
+    else:
+        response = requests.post(
+            dataset_file_upload_endpoint,
+            headers=headers_v2,
+            files={"file": open(filename, "rb")},
+        )
+
+    # Clean up the local file after upload
+    try:
+        os.remove(filename)
+    except Exception as e:
+        print(f"Could not delete locally downloaded file: {filename}")
+        print(e)
+
+    if response.status_code == 200:
+        print(f"Uploaded file: {filename} to dataset {dataset_v2_id}")
+        return response.json().get("id")
+    else:
+        print(f"Failed to upload file: {filename} to dataset {dataset_v2_id}")
+
+    return None
+
 
 def download_and_upload_file(file, all_dataset_folders, dataset_v2_id, headers_v2):
     """Download a file from Clowder v1 and upload it to Clowder v2."""
@@ -1194,7 +1250,7 @@ def process_user_and_resources_collections(user_v1, USER_MAP, DATASET_MAP, COLLE
     print(f"Got {len(user_v1_collections)} user collections in the top level")
 
     for top_level_col in user_v1_collections:
-        dataset_v2 = create_v2_dataset_from_collection(top_level_col, user_v1, clowder_headers_v1, user_headers_v2)
+        dataset_v2 = create_v2_dataset_from_collection(top_level_col, user_v1, clowder_headers_v1, user_headers_v2, base_headers_v2)
         print('did this')
 
     for dataset in user_v1_datasets:
@@ -1245,6 +1301,21 @@ def process_user_and_resources_collections(user_v1, USER_MAP, DATASET_MAP, COLLE
         files_result = files_response.json()
 
         for file in files_result:
+            file_folder = file.get("folders", None)
+            matching_folder = None
+            if file_folder:
+                matching_folder = next(
+                    (
+                        folder
+                        for folder in all_dataset_folders
+                        if folder["name"] == file_folder["name"]
+                    ),
+                    None,
+                )
+            print('did we get matching folder?')
+            file_v2_id = download_and_upload_file_to_matching_folder(
+                file, dataset_v2_id, base_user_headers_v2, matching_folder
+            )
             file_v2_id = download_and_upload_file(
                 file, all_dataset_folders, dataset_v2_id, base_user_headers_v2
             )
@@ -1344,8 +1415,23 @@ def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
             dataset_files_endpoint, headers=clowder_headers_v1, verify=False
         )
         files_result = files_response.json()
-
+        # TODO test folder here
         for file in files_result:
+            file_folder = file.get("folders", None)
+            matching_folder = None
+            if file_folder:
+                matching_folder = next(
+                    (
+                        folder
+                        for folder in all_dataset_folders
+                        if folder["name"] == file_folder["name"]
+                    ),
+                    None,
+                )
+            print('did we get matching folder?')
+            file_v2_id = download_and_upload_file_to_matching_folder(
+                file, dataset_v2_id, base_user_headers_v2, matching_folder
+            )
             file_v2_id = download_and_upload_file(
                 file, all_dataset_folders, dataset_v2_id, base_user_headers_v2
             )
@@ -1419,12 +1505,12 @@ def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
             "[Local Account]" in user_v1["identityProvider"]
             and user_v1["email"] != admin_user["email"]
         ):
-            [USER_MAP, DATASET_MAP] = process_user_and_resources(
-                user_v1, USER_MAP, DATASET_MAP
-            )
-            # [USER_MAP, DATASET_MAP] = process_user_and_resources_collections(
-            #     user_v1, USER_MAP, DATASET_MAP, COLLECTIONS_MAP
+            # [USER_MAP, DATASET_MAP] = process_user_and_resources(
+            #     user_v1, USER_MAP, DATASET_MAP
             # )
+            [USER_MAP, DATASET_MAP] = process_user_and_resources_collections(
+                user_v1, USER_MAP, DATASET_MAP, COLLECTIONS_MAP
+            )
             print(f"Migrated user {user_v1['email']} and associated resources.")
         else:
             print(f"Skipping user {user_v1['email']} as it is not a local account.")