Skip to content

Commit 454f66d

Browse files
snbianco authored and bsipocz committed
speed up get_cloud_uris()
1 parent 6fe4c2b commit 454f66d

File tree

4 files changed

+81
-39
lines changed

4 files changed

+81
-39
lines changed

CHANGES.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,9 @@ mast
171171

172172
- Modify ``mast.Observations.get_cloud_uris`` to also accept query criteria and data product filters. [#3064]
173173

174+
- Increased the speed of ``mast.Observations.get_cloud_uris`` by obtaining multiple
175+
URIs from MAST at once. [#3064]
176+
174177

175178
0.4.7 (2024-03-08)
176179
==================

astroquery/mast/cloud.py

Lines changed: 33 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from astropy.utils.console import ProgressBarOrSpinner
1515
from astropy.utils.exceptions import AstropyDeprecationWarning
1616

17-
from ..exceptions import NoResultsWarning, InvalidQueryError
17+
from ..exceptions import NoResultsWarning
1818

1919
from . import utils
2020

@@ -109,32 +109,14 @@ def get_cloud_uri(self, data_product, include_bucket=True, full_url=False):
109109
found in the cloud, None is returned.
110110
"""
111111

112-
s3_client = self.boto3.client('s3', config=self.config)
113-
114-
path = utils.mast_relative_path(data_product["dataURI"])
115-
if path is None:
116-
raise InvalidQueryError("Malformed data uri {}".format(data_product['dataURI']))
112+
uri_list = self.get_cloud_uri_list(data_product, include_bucket=include_bucket, full_url=full_url)
117113

118-
if 'galex' in path:
119-
path = path.lstrip("/mast/")
120-
elif '/ps1/' in path:
121-
path = path.replace("/ps1/", "panstarrs/ps1/public/")
114+
# Making sure we got at least 1 URI from the query above.
115+
if uri_list[0] is None:
116+
warnings.warn("Unable to locate file {}.".format(data_product), NoResultsWarning)
122117
else:
123-
path = path.lstrip("/")
124-
125-
try:
126-
s3_client.head_object(Bucket=self.pubdata_bucket, Key=path)
127-
if include_bucket:
128-
path = "s3://{}/{}".format(self.pubdata_bucket, path)
129-
elif full_url:
130-
path = "http://s3.amazonaws.com/{}/{}".format(self.pubdata_bucket, path)
131-
return path
132-
except self.botocore.exceptions.ClientError as e:
133-
if e.response['Error']['Code'] != "404":
134-
raise
135-
136-
warnings.warn("Unable to locate file {}.".format(data_product['productFilename']), NoResultsWarning)
137-
return None
118+
# Output from ``get_cloud_uri_list`` is always a list even when it's only 1 URI
119+
return uri_list[0]
138120

139121
def get_cloud_uri_list(self, data_products, include_bucket=True, full_url=False):
140122
"""
@@ -158,8 +140,33 @@ def get_cloud_uri_list(self, data_products, include_bucket=True, full_url=False)
158140
List of URIs generated from the data products, list may contain entries that are None
159141
if data_products includes products not found in the cloud.
160142
"""
143+
s3_client = self.boto3.client('s3', config=self.config)
161144

162-
return [self.get_cloud_uri(product, include_bucket, full_url) for product in data_products]
145+
paths = utils.mast_relative_path(data_products["dataURI"])
146+
if isinstance(paths, str): # Handle the case where only one product was requested
147+
paths = [paths]
148+
149+
uri_list = []
150+
for path in paths:
151+
if path is None:
152+
uri_list.append(None)
153+
else:
154+
try:
155+
# Use `head_object` to verify that the product is available on S3 (not all products are)
156+
s3_client.head_object(Bucket=self.pubdata_bucket, Key=path)
157+
if include_bucket:
158+
s3_path = "s3://{}/{}".format(self.pubdata_bucket, path)
159+
uri_list.append(s3_path)
160+
elif full_url:
161+
path = "http://s3.amazonaws.com/{}/{}".format(self._pubdata_bucket, path)
162+
uri_list.append(path)
163+
except self.botocore.exceptions.ClientError as e:
164+
if e.response['Error']['Code'] != "404":
165+
raise
166+
warnings.warn("Unable to locate file {}.".format(path), NoResultsWarning)
167+
uri_list.append(None)
168+
169+
return uri_list
163170

164171
def download_file(self, data_product, local_path, cache=True, verbose=True):
165172
"""

astroquery/mast/observations.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -784,7 +784,7 @@ def get_cloud_uris(self, data_products=None, *, include_bucket=True, full_url=Fa
784784
----------
785785
data_products : `~astropy.table.Table`
786786
Table containing products to be converted into cloud data uris. If provided, this will supersede
787-
page_size, page, or any arguments passed in as **criteria.
787+
page_size, page, or any keyword arguments passed in as criteria.
788788
include_bucket : bool
789789
Default True. When False, returns the path of the file relative to the
790790
top level cloud storage location.

astroquery/mast/utils.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -158,22 +158,54 @@ def parse_input_location(coordinates=None, objectname=None):
158158

159159
def mast_relative_path(mast_uri):
160160
"""
161-
Given a MAST dataURI, return the associated relative path.
161+
Given one or more MAST dataURI(s), return the associated relative path(s).
162162
163163
Parameters
164164
----------
165-
mast_uri : str
166-
The MAST uri.
165+
mast_uri : str, list of str
166+
The MAST uri(s).
167167
168168
Returns
169169
-------
170-
response : str
171-
The associated relative path.
170+
response : str, list of str
171+
The associated relative path(s).
172172
"""
173-
174-
response = _simple_request("https://mast.stsci.edu/api/v0.1/path_lookup/",
175-
{"uri": mast_uri})
176-
result = response.json()
177-
uri_result = result.get(mast_uri)
178-
179-
return uri_result["path"]
173+
if isinstance(mast_uri, str):
174+
uri_list = [("uri", mast_uri)]
175+
else: # mast_uri parameter is a list
176+
uri_list = [("uri", uri) for uri in mast_uri]
177+
178+
# Split the list into chunks of 50 URIs; this is necessary
179+
# to avoid "414 Client Error: Request-URI Too Large".
180+
uri_list_chunks = list(_split_list_into_chunks(uri_list, chunk_size=50))
181+
182+
result = []
183+
for chunk in uri_list_chunks:
184+
response = _simple_request("https://mast.stsci.edu/api/v0.1/path_lookup/",
185+
{"uri": chunk})
186+
json_response = response.json()
187+
188+
for uri in chunk:
189+
# Chunk is a list of tuples where the tuple is
190+
# ("uri", "/path/to/product")
191+
# so we index for path (index=1)
192+
path = json_response.get(uri[1])["path"]
193+
if 'galex' in path:
194+
path = path.lstrip("/mast/")
195+
elif '/ps1/' in path:
196+
path = path.replace("/ps1/", "panstarrs/ps1/public/")
197+
else:
198+
path = path.lstrip("/")
199+
result.append(path)
200+
201+
# If the input was a single URI string, we return a single string
202+
if isinstance(mast_uri, str):
203+
return result[0]
204+
# Else, return a list of paths
205+
return result
206+
207+
208+
def _split_list_into_chunks(input_list, chunk_size):
209+
"""Helper function for `mast_relative_path`."""
210+
for idx in range(0, len(input_list), chunk_size):
211+
yield input_list[idx:idx + chunk_size]

0 commit comments

Comments
 (0)