
Commit 326bcc7

BUG: fix download of GCS manifest based content (#156)
Due to the switch from the `public-datasets-idc` bucket to the `idc-open-data` bucket for GCS, the logic for detecting the provider from `s3_url` had to be updated. When a manifest contains a mix of `idc-open-data` URLs and URLs from other buckets of the same provider, the prior logic would fail: the `idc-open-data` URLs would match AWS, the other URLs would not, and the code would incorrectly conclude that the manifest mixes URLs across more than one provider. In the updated code, URLs that refer to `idc-open-data` are ignored, and the check for more than one endpoint is performed on the remaining URLs. Also, when mapping manifest URLs from AWS to GCP, `idc-open-data` is no longer remapped: `public-datasets-idc` was replaced with `idc-open-data` for GCS in IDC v20, and the former bucket will eventually be deprecated and removed, although for now it is still possible to download from it.
1 parent 1f316e1 commit 326bcc7
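
To illustrate the updated provider-detection rule, here is a minimal sketch (hypothetical `urls_df`, URLs, and endpoint labels chosen for illustration; the actual logic lives in `_validate_update_manifest_and_get_download_size`, shown in the diff below):

    import pandas as pd

    # Hypothetical manifest URLs: `idc-open-data` exists in both AWS and GCS,
    # so such URLs cannot identify the provider and are excluded from the check.
    urls_df = pd.DataFrame(
        {
            "s3_url": [
                "s3://idc-open-data/study1/series1/",   # present in both providers
                "s3://idc-open-idc1/study2/series2/",   # GCS-specific bucket
            ],
            "endpoint": ["aws", "unknown"],
        }
    )

    provider_specific_urls = urls_df[~urls_df["s3_url"].str.contains("/idc-open-data/")]

    if len(provider_specific_urls["endpoint"].unique()) > 1:
        raise ValueError("Manifest mixes URLs from more than one provider")
    if provider_specific_urls.empty:
        endpoint = "aws"  # all URLs are from idc-open-data: either provider works, default to AWS
    else:
        endpoint = provider_specific_urls["endpoint"].values[0]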

File tree

1 file changed: +59 −52 lines changed

idc_index/index.py

Lines changed: 59 additions & 52 deletions
@@ -158,7 +158,9 @@ def _replace_aws_with_gcp_buckets(dataframe, column_name):
         replacements = {
             r"s3://idc-open-data-two/": r"s3://idc-open-idc1/",
             r"s3://idc-open-data-cr/": r"s3://idc-open-cr/",
-            r"s3://idc-open-data/": r"s3://public-datasets-idc/",
+            # as of IDC v20, we use a new bucket that has the same name as AWS
+            # for `idc-open-data` - no need to replace
+            # r"s3://idc-open-data/": r"s3://public-datasets-idc/",
         }

         # Function to apply replacements
@@ -310,6 +312,15 @@ def _check_create_directory(download_dir):

         return str(download_dir.resolve())

+    def _check_disk_size_and_warn(self, download_dir, disk_size_needed):
+        disk_free_space_MB = psutil.disk_usage(download_dir).free / (1000 * 1000)
+        logger.info("Disk size needed: " + self._format_size(disk_size_needed))
+        logger.info("Disk size available: " + self._format_size(disk_free_space_MB))
+        if disk_free_space_MB < disk_size_needed:
+            logger.error("Not enough free space on disk to download the files.")
+            return False
+        return True
+
     def fetch_index(self, index_name) -> None:
         """
         Downloads requested index and adds this index joined with the main index as respective class attribute.
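
The new `_check_disk_size_and_warn` helper consolidates the free-space check that was previously duplicated across the download paths; the calling pattern used by both `download_from_manifest` and `download_from_selection` (visible in the hunks further down) is, inside a method of the class and with `total_size` expressed in MB:

    # usage sketch: downloadDir and total_size are provided by the calling method
    if not self._check_disk_size_and_warn(downloadDir, total_size):
        return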
@@ -977,44 +988,57 @@ def _validate_update_manifest_and_get_download_size(
         else:
             logger.info("All of the identifiers from manifest have been resolved!")

+        # `idc-open-data` bucket is present in both AWS and GCP, this is why we skip checking endpoint
+        # for the URLs that contain `idc-open-data`
+        provider_specific_urls = merged_df[
+            ~merged_df["s3_url"].str.contains("/idc-open-data/")
+        ]
+
         if validate_manifest:
             # Check if there is more than one endpoint
-            if len(merged_df["endpoint"].unique()) > 1:
+            if len(provider_specific_urls["endpoint"].unique()) > 1:
+                logger.error("A mix of endpoint s3_urls encountered!")
+                for endpoint in merged_df["endpoint"].unique():
+                    sample_s3_url = merged_df[
+                        merged_df["endpoint"] == endpoint
+                    ].s3_url.values[0]
+                    logger.error(f" Endpoint {endpoint} s3_url {sample_s3_url}")
                 raise ValueError(
                     "Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. "
                 )
-
-            if (
-                len(merged_df["endpoint"].unique()) == 1
-                and merged_df["endpoint"].values[0] == "aws"
-            ):
+            elif provider_specific_urls.empty:
+                # if all URLs are from idc-open-data, default to AWS
                 endpoint_to_use = aws_endpoint_url
-
-            if (
-                len(merged_df["endpoint"].unique()) == 1
-                and merged_df["endpoint"].values[0] == "unknown"
-            ):
-                cmd = [
-                    self.s5cmdPath,
-                    "--no-sign-request",
-                    "--endpoint-url",
-                    gcp_endpoint_url,
-                    "ls",
-                    merged_df.s3_url.values[0],
-                ]
-                process = subprocess.run(
-                    cmd, capture_output=True, text=True, check=False
-                )
-                if process.stderr and process.stdout.startswith("ERROR"):
-                    logger.debug(
-                        "Folder not available in GCP. Manifest appears to be invalid."
+            else:  # provider_specific_urls["endpoint"].unique()) == 1
+                if provider_specific_urls["endpoint"].values[0] == "aws":
+                    logging.debug("Detected AWS as the endpoint to use")
+                    endpoint_to_use = aws_endpoint_url
+                else:  # unknown / gcp
+                    logging.debug("Will use GCS endpoint")
+                    cmd = [
+                        self.s5cmdPath,
+                        "--no-sign-request",
+                        "--endpoint-url",
+                        gcp_endpoint_url,
+                        "ls",
+                        merged_df.s3_url.values[0],
+                    ]
+                    process = subprocess.run(
+                        cmd, capture_output=True, text=True, check=False
                     )
-                    if validate_manifest:
-                        raise ValueError
-                else:
-                    endpoint_to_use = gcp_endpoint_url
+                    if process.stderr and process.stdout.startswith("ERROR"):
+                        logger.debug(
+                            "Folder not available in GCP. Manifest appears to be invalid."
+                        )
+                        if validate_manifest:
+                            raise ValueError
+                    else:
+                        endpoint_to_use = gcp_endpoint_url

-        elif merged_df["endpoint"].values[0] == "aws":
+        elif (
+            provider_specific_urls.empty
+            or provider_specific_urls["endpoint"].values[0] == "aws"
+        ):
             endpoint_to_use = aws_endpoint_url
         else:
             # TODO: here we assume that the endpoint is GCP; we could check at least the first URL to be sure,
@@ -1417,7 +1441,7 @@ def _s5cmd_run(

         # fedorov: did consider-using-with, and decided against it to keep the code more readable
         stderr_log_file = tempfile.NamedTemporaryFile(delete=False)  # pylint: disable=consider-using-with
-
+        logging.debug("Running download command: " + str(cmd))
         with subprocess.Popen(
             cmd,
             stdout=stdout,
@@ -1522,7 +1546,8 @@ def download_from_manifest(
         )

         total_size_rounded = round(total_size, 2)
-        logger.info("Total size: " + self._format_size(total_size_rounded))
+        if not self._check_disk_size_and_warn(downloadDir, total_size):
+            return

         self._s5cmd_run(
             endpoint_to_use=endpoint_to_use,
@@ -1760,29 +1785,11 @@ def download_from_selection(
             total_size = round(result_df["series_size_MB"].sum(), 2)
         else:
             total_size_bytes = round(result_df["instance_size"].sum(), 2)
-            logger.info(
-                "Total size of files to download: "
-                + self._format_size(total_size_bytes, size_in_bytes=True)
-            )
             total_size = total_size_bytes / (10**6)

-        disk_free_space_MB = psutil.disk_usage(downloadDir).free / (1000 * 1000)
-        if disk_free_space_MB < total_size:
-            logger.error("Not enough free space on disk to download the files.")
-            logger.error(
-                "Total size of files to download: " + self._format_size(total_size)
-            )
-            logger.error(
-                "Total free space on disk: " + self._format_size(disk_free_space_MB)
-            )
+        if not self._check_disk_size_and_warn(downloadDir, total_size):
             return

-        logger.info(
-            "Total free space on disk: "
-            + str(psutil.disk_usage(downloadDir).free / (1000 * 1000 * 1000))
-            + " GB"
-        )
-
         if dry_run:
             logger.info(
                 "Dry run. Not downloading files. Rerun with dry_run=False to download the files."
