
Commit 375103f

cam72cam authored and bsipocz committed
Add code to support incongruous MAST HST data
1 parent e06d5bb commit 375103f

File tree

1 file changed: +34 -13 lines changed


astroquery/mast/core.py

Lines changed: 34 additions & 13 deletions
@@ -12,6 +12,7 @@
 import warnings
 import json
 import time
+import string
 import os
 import re
 import keyring
@@ -822,6 +823,7 @@ def __init__(self, *args, **kwargs):
         super(ObservationsClass, self).__init__(*args, **kwargs)
 
         self._boto3 = None
+        self._botocore = None
 
     def list_missions(self):
         """
@@ -1303,7 +1305,9 @@ def enable_s3_hst_dataset(self):
         Requires the boto3 library to function.
         """
         import boto3
+        import botocore
         self._boto3 = boto3
+        self._botocore = botocore
 
         log.info("Using the S3 HST public dataset")
         log.warning("Your AWS account will be charged for access to the S3 bucket")
@@ -1316,6 +1320,7 @@ def disable_s3_hst_dataset(self):
         Disables downloading HST public files from S3 instead of MAST
         """
         self._boto3 = None
+        self._botocore = None
 
     def _download_from_s3(self, dataProduct, localPath, cache=True):
         # The following is a mishmash of BaseQuery._download_file and s3 access through boto
@@ -1328,18 +1333,19 @@ def _download_from_s3(self, dataProduct, localPath, cache=True):
         bkt = s3.Bucket(bkt_name)
 
         dataUri = dataProduct['dataURI']
+        filename = dataUri.split("/")[-1]
         obs_id = dataProduct['obs_id']
 
         obs_id = obs_id.lower()
-
+
         # This next part is a bit funky. Let me explain why:
         # We have 2 different possible URI schemes for HST:
         # mast:HST/product/obs_id_filename.type (old style)
         # mast:HST/product/obs_id/obs_id_filename.type (new style)
         # The first scheme was developed thinking that the obs_id in the filename
         # would *always* match the actual obs_id folder the file was placed in.
         # Unfortunately this assumption was false.
-        # We have been trying to switch to the new uri scheme as it specifies the
+        # We have been trying to switch to the new uri scheme as it specifies the
         # obs_id used in the folder path correctly.
         # The cherry on top is that the obs_id in the new style URI is not always correct either!
         # When we are looking up files we have some code which iterates through all of
@@ -1348,23 +1354,38 @@ def _download_from_s3(self, dataProduct, localPath, cache=True):
         # So in conclusion we can't trust the last char obs_id from the file or from the database
         # So with that in mind, hold your nose when reading the following:
 
-        # magic associations logic per Brian - 0-9/a-e we convert to 0
-        magicValues = "123456789abcde"
-        if obs_id[-1] in magicValues:
-            # We only replace the first occurrence in the folder
-            # The filename remains with the original obs_id as part of the name
-            new_obs_id = obs_id[:-1] + "0"
-            dataUri = dataUri.replace(obs_id, new_obs_id, 1)
-            obs_id = new_obs_id
-            log.warning("This data product's path may not have been properly identified %s" % dataUri)
+        info_lookup = None
+        sane_path = os.path.join("hst", "public", obs_id[:4], obs_id, filename)
+        try:
+            info_lookup = s3_client.head_object(Bucket=bkt_name, Key=sane_path, RequestPayer='requester')
+            bucketPath = sane_path
+        except self._botocore.exceptions.ClientError as e:
+            if e.response['Error']['Code'] != "404":
+                raise
+
+        if info_lookup is None:
+            # Unfortunately our file placement logic is anything but sane
+            # We put files in folders that don't make sense
+            for ch in (string.digits + string.ascii_lowercase):
+                # The last char of the obs_folder (observation id) can be any lowercase or numeric char
+                insane_obs = obs_id[:-1] + ch
+                insane_path = os.path.join("hst", "public", insane_obs[:4], insane_obs, filename)
+
+                try:
+                    info_lookup = s3_client.head_object(Bucket=bkt_name, Key=insane_path, RequestPayer='requester')
+                    bucketPath = insane_path
+                    break
+                except self._botocore.exceptions.ClientError as e:
+                    if e.response['Error']['Code'] != "404":
+                        raise
 
-        bucketPath = "hst/public/" + obs_id[:4] + dataUri.replace("mast:HST/product", "")
+        if info_lookup is None:
+            raise Exception("Unable to locate file!")
 
         # Unfortunately, we can't use the reported file size in the reported product. STScI's backing
         # archive database (CAOM) is frequently out of date and in many cases omits the required information.
         # length = dataProduct["size"]
         # Instead we ask the webserver (in this case S3) what the expected content length is and use that.
-        info_lookup = s3_client.head_object(Bucket=bkt_name, Key=bucketPath, RequestPayer='requester')
         length = info_lookup["ContentLength"]
 
         if cache and os.path.exists(localPath):
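
For readers outside the astroquery codebase, the lookup strategy the new block implements boils down to: probe the S3 key implied by the reported obs_id with a HEAD request, and if that comes back 404, retry with every possible last character of the observation folder until one responds. Below is a minimal standalone sketch of the same idea in plain boto3/botocore; the helper name, the bucket name, and the identifiers in the usage comment are illustrative assumptions, not values taken from this commit.

import string

from botocore.exceptions import ClientError


def locate_hst_file(s3_client, bkt_name, obs_id, filename):
    """Return the S3 key under which `filename` actually lives, or raise."""
    obs_id = obs_id.lower()

    # Try the folder implied by the reported obs_id first, then brute-force
    # the last character, since neither the URI nor the database is reliable.
    candidates = [obs_id] + [obs_id[:-1] + ch
                             for ch in string.digits + string.ascii_lowercase]

    for candidate in candidates:
        # The patch builds this key with os.path.join; "/" is equivalent on POSIX.
        key = "/".join(["hst", "public", candidate[:4], candidate, filename])
        try:
            # A HEAD request confirms the object exists; its response also
            # carries ContentLength, which the patch uses as the download size.
            s3_client.head_object(Bucket=bkt_name, Key=key, RequestPayer='requester')
            return key
        except ClientError as e:
            if e.response['Error']['Code'] != "404":
                raise
    raise Exception("Unable to locate file!")


# Hypothetical usage (requires AWS credentials; the bucket is requester-pays).
# The bucket name and identifiers below are placeholders, not from the commit:
# import boto3
# s3_client = boto3.client('s3')
# key = locate_hst_file(s3_client, "stpubdata", "ib6wb2m8q", "ib6wb2m8q_flt.fits")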

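End to end, this code path is only reached after a user opts in through the public Observations interface. A minimal sketch of that flow is below, assuming the module-level Observations instance exported by astroquery.mast at the time of this commit; the query criteria are placeholders chosen for illustration.

from astroquery.mast import Observations

# Opt in to fetching HST public files from the S3 mirror instead of MAST.
# As the log.warning in the diff notes, your AWS account is charged for access.
Observations.enable_s3_hst_dataset()

# Every HST product downloaded from here on goes through _download_from_s3,
# which has to guess the correct obs_id folder as described above.
obs = Observations.query_criteria(obs_collection='HST', proposal_id='12345')
products = Observations.get_product_list(obs)
Observations.download_products(products)

# Switch back to downloading from MAST's own servers.
Observations.disable_s3_hst_dataset()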