7373from pymongo import DESCENDING
7474from rocrate .model .person import Person
7575from rocrate .rocrate import ROCrate
76+ from starlette .concurrency import run_in_threadpool
7677
# Module-level FastAPI router for the endpoints defined in this file, and the
# HTTP Bearer-token security scheme used to authenticate requests.
# NOTE(review): the `APIRouter` / `HTTPBearer` imports are above this view —
# presumably `fastapi` / `fastapi.security`; confirm against the full file.
router = APIRouter ()
security = HTTPBearer ()
@@ -1192,16 +1193,26 @@ async def download_dataset(
11921193 bag_info_path = os .path .join (current_temp_dir , "bag-info.txt" )
11931194 tagmanifest_path = os .path .join (current_temp_dir , "tagmanifest-md5.txt" )
11941195
1195- with open (manifest_path , "w" ) as f :
1196- pass # Create empty file so no errors later if the dataset is empty
1197-
1198- with open (bagit_path , "w" ) as f :
1199- f .write ("Bag-Software-Agent: clowder.ncsa.illinois.edu" + "\n " )
1200- f .write ("Bagging-Date: " + str (datetime .datetime .now ()) + "\n " )
1196+ await run_in_threadpool (lambda : open (manifest_path , "w" ).close ())
1197+ await run_in_threadpool (lambda : open (manifest_path , "w" ).close ())
1198+ await run_in_threadpool (
1199+ lambda : open (bagit_path , "w" ).write (
1200+ "Bag-Software-Agent: clowder.ncsa.illinois.edu"
1201+ + "\n "
1202+ + "Bagging-Date: "
1203+ + str (datetime .datetime .now ())
1204+ + "\n "
1205+ )
1206+ )
12011207
1202- with open (bag_info_path , "w" ) as f :
1203- f .write ("BagIt-Version: 0.97" + "\n " )
1204- f .write ("Tag-File-Character-Encoding: UTF-8" + "\n " )
1208+ await run_in_threadpool (
1209+ lambda : open (bag_info_path , "w" ).write (
1210+ "BagIt-Version: 0.97"
1211+ + "\n "
1212+ + "Tag-File-Character-Encoding: UTF-8"
1213+ + "\n "
1214+ )
1215+ )
12051216
12061217 # Write dataset metadata if found
12071218 metadata = await MetadataDB .find (
@@ -1214,6 +1225,10 @@ async def download_dataset(
12141225 metadata_content = json_util .dumps (metadata )
12151226 with open (datasetmetadata_path , "w" ) as f :
12161227 f .write (metadata_content )
1228+ await run_in_threadpool (
1229+ lambda : open (datasetmetadata_path , "w" ).write (metadata_content )
1230+ )
1231+
12171232 crate .add_file (
12181233 datasetmetadata_path ,
12191234 dest_path = "metadata/_dataset_metadata.json" ,
@@ -1236,16 +1251,20 @@ async def download_dataset(
12361251 hierarchy = await _get_folder_hierarchy (file .folder_id , "" )
12371252 dest_folder = os .path .join (current_temp_dir , hierarchy .lstrip ("/" ))
12381253 if not os .path .isdir (dest_folder ):
1239- os .makedirs ( dest_folder , exist_ok = True )
1254+ await run_in_threadpool ( os .makedirs , dest_folder , exist_ok = True )
12401255 file_name = hierarchy + file_name
12411256 current_file_path = os .path .join (current_temp_dir , file_name .lstrip ("/" ))
12421257
12431258 content = fs .get_object (settings .MINIO_BUCKET_NAME , bytes_file_id )
12441259 file_md5_hash = hashlib .md5 (content .data ).hexdigest ()
1245- with open (current_file_path , "wb" ) as f1 :
1246- f1 .write (content .data )
1247- with open (manifest_path , "a" ) as mpf :
1248- mpf .write (file_md5_hash + " " + file_name + "\n " )
1260+ await run_in_threadpool (
1261+ lambda : open (current_file_path , "wb" ).write (content .data )
1262+ )
1263+ await run_in_threadpool (
1264+ lambda : open (manifest_path , "a" ).write (
1265+ file_md5_hash + " " + file_name + "\n "
1266+ )
1267+ )
12491268 crate .add_file (
12501269 current_file_path ,
12511270 dest_path = "data/" + file_name ,
@@ -1266,23 +1285,43 @@ async def download_dataset(
12661285 current_temp_dir , metadata_filename
12671286 )
12681287 metadata_content = json_util .dumps (metadata )
1269- with open (metadata_filename_temp_path , "w" ) as f :
1270- f .write (metadata_content )
1288+ await run_in_threadpool (
1289+ lambda : open (metadata_filename_temp_path , "w" ).write (
1290+ metadata_content
1291+ )
1292+ )
12711293 crate .add_file (
12721294 metadata_filename_temp_path ,
12731295 dest_path = "metadata/" + metadata_filename ,
12741296 properties = {"name" : metadata_filename },
12751297 )
12761298
12771299 bag_size_kb = bag_size / 1024
1278-
1279- with open (bagit_path , "a" ) as f :
1280- f .write ("Bag-Size: " + str (bag_size_kb ) + " kB" + "\n " )
1281- f .write ("Payload-Oxum: " + str (bag_size ) + "." + str (file_count ) + "\n " )
1282- f .write ("Internal-Sender-Identifier: " + dataset_id + "\n " )
1283- f .write ("Internal-Sender-Description: " + dataset .description + "\n " )
1284- f .write ("Contact-Name: " + user_full_name + "\n " )
1285- f .write ("Contact-Email: " + user .email + "\n " )
1300+ await run_in_threadpool (
1301+ lambda : open (bagit_path , "a" ).write (
1302+ "Bag-Size: "
1303+ + str (bag_size_kb )
1304+ + " kB"
1305+ + "\n "
1306+ + "Payload-Oxum: "
1307+ + str (bag_size )
1308+ + "."
1309+ + str (file_count )
1310+ + "\n "
1311+ + "Internal-Sender-Identifier: "
1312+ + dataset_id
1313+ + "\n "
1314+ + "Internal-Sender-Description: "
1315+ + dataset .description
1316+ + "\n "
1317+ + "Contact-Name: "
1318+ + user_full_name
1319+ + "\n "
1320+ + "Contact-Email: "
1321+ + user .email
1322+ + "\n "
1323+ )
1324+ )
12861325 crate .add_file (
12871326 bagit_path , dest_path = "bagit.txt" , properties = {"name" : "bagit.txt" }
12881327 )
@@ -1296,14 +1335,33 @@ async def download_dataset(
12961335 )
12971336
12981337 # Generate tag manifest file
1299- manifest_md5_hash = hashlib .md5 (open (manifest_path , "rb" ).read ()).hexdigest ()
1300- bagit_md5_hash = hashlib .md5 (open (bagit_path , "rb" ).read ()).hexdigest ()
1301- bag_info_md5_hash = hashlib .md5 (open (bag_info_path , "rb" ).read ()).hexdigest ()
1302-
1303- with open (tagmanifest_path , "w" ) as f :
1304- f .write (bagit_md5_hash + " " + "bagit.txt" + "\n " )
1305- f .write (manifest_md5_hash + " " + "manifest-md5.txt" + "\n " )
1306- f .write (bag_info_md5_hash + " " + "bag-info.txt" + "\n " )
1338+ manifest_md5_hash = await run_in_threadpool (
1339+ lambda : hashlib .md5 (open (manifest_path , "rb" ).read ()).hexdigest ()
1340+ )
1341+ bagit_md5_hash = await run_in_threadpool (
1342+ lambda : hashlib .md5 (open (bagit_path , "rb" ).read ()).hexdigest ()
1343+ )
1344+ bag_info_md5_hash = await run_in_threadpool (
1345+ lambda : hashlib .md5 (open (bag_info_path , "rb" ).read ()).hexdigest ()
1346+ )
1347+
1348+ await run_in_threadpool (
1349+ lambda : open (tagmanifest_path , "w" ).write (
1350+ bagit_md5_hash
1351+ + " "
1352+ + "bagit.txt"
1353+ + "\n "
1354+ + manifest_md5_hash
1355+ + " "
1356+ + "manifest-md5.txt"
1357+ + "\n "
1358+ + bag_info_md5_hash
1359+ + " "
1360+ + "bag-info.txt"
1361+ + "\n "
1362+ )
1363+ )
1364+
13071365 crate .add_file (
13081366 tagmanifest_path ,
13091367 dest_path = "tagmanifest-md5.txt" ,
@@ -1317,13 +1375,16 @@ async def download_dataset(
13171375 )
13181376 zip_name = dataset .name + version_name + ".zip"
13191377 path_to_zip = os .path .join (current_temp_dir , zip_name )
1320- crate .write_zip (path_to_zip )
1321- f = open (path_to_zip , "rb" , buffering = 0 )
1322- zip_bytes = f .read ()
1378+
1379+ await run_in_threadpool (crate .write_zip , path_to_zip ) # takes the most time?
1380+
1381+ f = await run_in_threadpool (open , path_to_zip , "rb" , 0 )
1382+ zip_bytes = await run_in_threadpool (f .read )
13231383 stream = io .BytesIO (zip_bytes )
1324- f .close ()
1384+ await run_in_threadpool (f .close )
1385+
13251386 try :
1326- shutil .rmtree ( current_temp_dir )
1387+ await run_in_threadpool ( shutil .rmtree , current_temp_dir )
13271388 except Exception as e :
13281389 print ("could not delete file" )
13291390 print (e )
0 commit comments