@@ -158,7 +158,9 @@ def _replace_aws_with_gcp_buckets(dataframe, column_name):
         replacements = {
             r"s3://idc-open-data-two/": r"s3://idc-open-idc1/",
             r"s3://idc-open-data-cr/": r"s3://idc-open-cr/",
-            r"s3://idc-open-data/": r"s3://public-datasets-idc/",
+            # as of IDC v20, we use a new bucket that has the same name as AWS
+            # for `idc-open-data` - no need to replace
+            # r"s3://idc-open-data/": r"s3://public-datasets-idc/",
         }
 
         # Function to apply replacements
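For context, the remaining mapping is applied URL-by-URL to a dataframe column. Below is a minimal sketch of such a replacement pass, assuming a pandas `str.replace` with regex patterns; the function name is hypothetical and the actual helper in the package may be implemented differently.

# Hypothetical sketch of applying the bucket replacements; not the package's own helper.
import pandas as pd

def apply_bucket_replacements(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    replacements = {
        r"s3://idc-open-data-two/": r"s3://idc-open-idc1/",
        r"s3://idc-open-data-cr/": r"s3://idc-open-cr/",
        # s3://idc-open-data/ is left untouched: as of IDC v20 the GCP bucket
        # shares the AWS bucket name, so no rewrite is needed.
    }
    for pattern, replacement in replacements.items():
        df[column_name] = df[column_name].str.replace(pattern, replacement, regex=True)
    return df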
@@ -310,6 +312,15 @@ def _check_create_directory(download_dir):
 
         return str(download_dir.resolve())
 
+    def _check_disk_size_and_warn(self, download_dir, disk_size_needed):
+        disk_free_space_MB = psutil.disk_usage(download_dir).free / (1000 * 1000)
+        logger.info("Disk size needed: " + self._format_size(disk_size_needed))
+        logger.info("Disk size available: " + self._format_size(disk_free_space_MB))
+        if disk_free_space_MB < disk_size_needed:
+            logger.error("Not enough free space on disk to download the files.")
+            return False
+        return True
+
     def fetch_index(self, index_name) -> None:
         """
         Downloads requested index and adds this index joined with the main index as respective class attribute.
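The new helper's contract is simple: sizes are compared in decimal megabytes (via `psutil.disk_usage`), and a `False` return tells the caller to abort the download. A small standalone sketch of that contract follows; the function name is hypothetical and not part of the package.

# Hypothetical sketch mirroring _check_disk_size_and_warn's behavior; not the package API.
import psutil

def has_enough_space(download_dir: str, disk_size_needed_MB: float) -> bool:
    # psutil reports free space in bytes; convert to decimal MB to match the
    # size accounting used by the downloader.
    free_MB = psutil.disk_usage(download_dir).free / (1000 * 1000)
    return free_MB >= disk_size_needed_MB

# Callers are expected to bail out on False rather than raise:
if not has_enough_space(".", 512.0):
    print("Not enough free space on disk to download the files.")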
@@ -977,44 +988,57 @@ def _validate_update_manifest_and_get_download_size(
         else:
             logger.info("All of the identifiers from manifest have been resolved!")
 
+        # `idc-open-data` bucket is present in both AWS and GCP, this is why we skip checking endpoint
+        # for the URLs that contain `idc-open-data`
+        provider_specific_urls = merged_df[
+            ~merged_df["s3_url"].str.contains("/idc-open-data/")
+        ]
+
         if validate_manifest:
             # Check if there is more than one endpoint
-            if len(merged_df["endpoint"].unique()) > 1:
+            if len(provider_specific_urls["endpoint"].unique()) > 1:
+                logger.error("A mix of endpoint s3_urls encountered!")
+                for endpoint in merged_df["endpoint"].unique():
+                    sample_s3_url = merged_df[
+                        merged_df["endpoint"] == endpoint
+                    ].s3_url.values[0]
+                    logger.error(f" Endpoint {endpoint} s3_url {sample_s3_url}")
                 raise ValueError(
                     "Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. "
                 )
-
-            if (
-                len(merged_df["endpoint"].unique()) == 1
-                and merged_df["endpoint"].values[0] == "aws"
-            ):
+            elif provider_specific_urls.empty:
+                # if all URLs are from idc-open-data, default to AWS
                 endpoint_to_use = aws_endpoint_url
-
-            if (
-                len(merged_df["endpoint"].unique()) == 1
-                and merged_df["endpoint"].values[0] == "unknown"
-            ):
-                cmd = [
-                    self.s5cmdPath,
-                    "--no-sign-request",
-                    "--endpoint-url",
-                    gcp_endpoint_url,
-                    "ls",
-                    merged_df.s3_url.values[0],
-                ]
-                process = subprocess.run(
-                    cmd, capture_output=True, text=True, check=False
-                )
-                if process.stderr and process.stdout.startswith("ERROR"):
-                    logger.debug(
-                        "Folder not available in GCP. Manifest appears to be invalid."
+            else:  # provider_specific_urls["endpoint"].unique()) == 1
+                if provider_specific_urls["endpoint"].values[0] == "aws":
+                    logging.debug("Detected AWS as the endpoint to use")
+                    endpoint_to_use = aws_endpoint_url
+                else:  # unknown / gcp
+                    logging.debug("Will use GCS endpoint")
+                    cmd = [
+                        self.s5cmdPath,
+                        "--no-sign-request",
+                        "--endpoint-url",
+                        gcp_endpoint_url,
+                        "ls",
+                        merged_df.s3_url.values[0],
+                    ]
+                    process = subprocess.run(
+                        cmd, capture_output=True, text=True, check=False
                     )
-                if validate_manifest:
-                    raise ValueError
-                else:
-                    endpoint_to_use = gcp_endpoint_url
+                    if process.stderr and process.stdout.startswith("ERROR"):
+                        logger.debug(
+                            "Folder not available in GCP. Manifest appears to be invalid."
+                        )
+                        if validate_manifest:
+                            raise ValueError
+                    else:
+                        endpoint_to_use = gcp_endpoint_url
 
-        elif merged_df["endpoint"].values[0] == "aws":
+        elif (
+            provider_specific_urls.empty
+            or provider_specific_urls["endpoint"].values[0] == "aws"
+        ):
             endpoint_to_use = aws_endpoint_url
         else:
             # TODO: here we assume that the endpoint is GCP; we could check at least the first URL to be sure,
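The endpoint-selection rule introduced above can be summarized as: ignore `idc-open-data` URLs (the bucket name is identical on AWS and GCP, so those URLs cannot disambiguate the provider), require the remaining URLs to agree on one endpoint, default to AWS when nothing remains, and fall through to GCS validation for unknown endpoints. A simplified sketch follows, assuming `merged_df` carries `s3_url` and `endpoint` columns; the endpoint constants and function name are placeholders, not the package's own.

# Hedged sketch of the endpoint-selection logic; constants and name are hypothetical.
import pandas as pd

AWS_ENDPOINT = "https://s3.amazonaws.com"
GCP_ENDPOINT = "https://storage.googleapis.com"

def pick_endpoint(merged_df: pd.DataFrame) -> str:
    # idc-open-data exists under the same name in both clouds, so those URLs
    # are excluded from the provider check.
    provider_specific = merged_df[~merged_df["s3_url"].str.contains("/idc-open-data/")]
    if provider_specific.empty:
        return AWS_ENDPOINT  # all URLs are provider-agnostic; default to AWS
    if provider_specific["endpoint"].nunique() > 1:
        raise ValueError("Manifest mixes AWS and GCS URLs")
    if provider_specific["endpoint"].values[0] == "aws":
        return AWS_ENDPOINT
    return GCP_ENDPOINT  # unknown endpoints are treated as GCS and validated there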
@@ -1417,7 +1441,7 @@ def _s5cmd_run(
 
         # fedorov: did consider-using-with, and decided against it to keep the code more readable
         stderr_log_file = tempfile.NamedTemporaryFile(delete=False)  # pylint: disable=consider-using-with
-
+        logging.debug("Running download command: " + str(cmd))
         with subprocess.Popen(
             cmd,
             stdout=stdout,
@@ -1522,7 +1546,8 @@ def download_from_manifest(
         )
 
         total_size_rounded = round(total_size, 2)
-        logger.info("Total size: " + self._format_size(total_size_rounded))
+        if not self._check_disk_size_and_warn(downloadDir, total_size):
+            return
 
         self._s5cmd_run(
             endpoint_to_use=endpoint_to_use,
@@ -1760,29 +1785,11 @@ def download_from_selection(
             total_size = round(result_df["series_size_MB"].sum(), 2)
         else:
             total_size_bytes = round(result_df["instance_size"].sum(), 2)
-            logger.info(
-                "Total size of files to download: "
-                + self._format_size(total_size_bytes, size_in_bytes=True)
-            )
             total_size = total_size_bytes / (10**6)
 
-        disk_free_space_MB = psutil.disk_usage(downloadDir).free / (1000 * 1000)
-        if disk_free_space_MB < total_size:
-            logger.error("Not enough free space on disk to download the files.")
-            logger.error(
-                "Total size of files to download: " + self._format_size(total_size)
-            )
-            logger.error(
-                "Total free space on disk: " + self._format_size(disk_free_space_MB)
-            )
+        if not self._check_disk_size_and_warn(downloadDir, total_size):
             return
 
-        logger.info(
-            "Total free space on disk: "
-            + str(psutil.disk_usage(downloadDir).free / (1000 * 1000 * 1000))
-            + " GB"
-        )
-
         if dry_run:
             logger.info(
                 "Dry run. Not downloading files. Rerun with dry_run=False to download the files."