Skip to content

Commit 3a57b4c

Browse files
authored
Merge pull request #2497 from jdavies-st/mast-cull-duplicate-downloads
Cull duplicate dataURIs for MAST in download_products
2 parents 4c87d7f + 0c96a7e commit 3a57b4c

File tree

4 files changed

+81
-7
lines changed

4 files changed

+81
-7
lines changed

CHANGES.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,12 @@ linelists.cdms
8989
- Fix issues with the line name parser and the line data parser; the original
9090
implementation was incomplete and upstream was not fully documented. [#2385, #2411]
9191

92+
mast
93+
^^^^
94+
95+
- Cull duplicate downloads for the same dataURI in ``Observations.download_products()``
96+
and duplicate URIs in ``Observations.get_cloud_uris()``. [#2497]
97+
9298
oac
9399
^^^
94100

astroquery/exceptions.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@
77

88
__all__ = ['TimeoutError', 'InvalidQueryError', 'RemoteServiceError',
99
'TableParseError', 'LoginError', 'ResolverError',
10-
'NoResultsWarning', 'LargeQueryWarning', 'InputWarning',
11-
'AuthenticationWarning', 'MaxResultsWarning', 'CorruptDataWarning']
10+
'NoResultsWarning', 'DuplicateResultsWarning', 'LargeQueryWarning',
11+
'InputWarning', 'AuthenticationWarning', 'MaxResultsWarning',
12+
'CorruptDataWarning']
1213

1314

1415
class TimeoutError(Exception):
@@ -67,6 +68,13 @@ class NoResultsWarning(AstropyWarning):
6768
pass
6869

6970

71+
class DuplicateResultsWarning(AstropyWarning):
72+
"""
73+
Astroquery warning class to be issued when a query returns duplicate results.
74+
"""
75+
pass
76+
77+
7078
class LargeQueryWarning(AstropyWarning):
7179
"""
7280
Astroquery warning class to be issued when a query is larger than

astroquery/mast/observations.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import astropy.units as u
2020
import astropy.coordinates as coord
2121

22-
from astropy.table import Table, Row, vstack, MaskedColumn
22+
from astropy.table import Table, Row, unique, vstack, MaskedColumn
2323
from astroquery import log
2424

2525
from astropy.utils import deprecated
@@ -31,7 +31,7 @@
3131
from ..utils import commons, async_to_sync
3232
from ..utils.class_or_instance import class_or_instance
3333
from ..exceptions import (TimeoutError, InvalidQueryError, RemoteServiceError,
34-
ResolverError, MaxResultsWarning,
34+
ResolverError, MaxResultsWarning, DuplicateResultsWarning,
3535
NoResultsWarning, InputWarning, AuthenticationWarning)
3636

3737
from . import conf, utils
@@ -716,6 +716,9 @@ def download_products(self, products, *, download_dir=None,
716716

717717
products = vstack(product_lists)
718718

719+
# Remove duplicate products
720+
products = self._remove_duplicate_products(products)
721+
719722
# apply filters
720723
products = self.filter_products(products, mrp_only=mrp_only, **filters)
721724

@@ -767,6 +770,9 @@ def get_cloud_uris(self, data_products, *, include_bucket=True, full_url=False):
767770
raise RemoteServiceError('Please enable anonymous cloud access by calling `enable_cloud_dataset` method. '
768771
'See MAST Labs documentation for an example: https://mast-labs.stsci.io/#example-data-access-with-astroquery-observations')
769772

773+
# Remove duplicate products
774+
data_products = self._remove_duplicate_products(data_products)
775+
770776
return self._cloud_connection.get_cloud_uri_list(data_products, include_bucket, full_url)
771777

772778
def get_cloud_uri(self, data_product, *, include_bucket=True, full_url=False):
@@ -802,6 +808,30 @@ def get_cloud_uri(self, data_product, *, include_bucket=True, full_url=False):
802808
# Query for product URIs
803809
return self._cloud_connection.get_cloud_uri(data_product, include_bucket, full_url)
804810

811+
def _remove_duplicate_products(self, data_products):
812+
"""
813+
Removes duplicate data products that have the same dataURI.
814+
815+
Parameters
816+
----------
817+
data_products : `~astropy.table.Table`
818+
Table containing products to be checked for duplicates.
819+
820+
Returns
821+
-------
822+
unique_products : `~astropy.table.Table`
823+
Table containing products with unique dataURIs.
824+
825+
"""
826+
number = len(data_products)
827+
unique_products = unique(data_products, keys="dataURI")
828+
number_unique = len(unique_products)
829+
if number_unique < number:
830+
warnings.warn(f"{number - number_unique} of {number} products were duplicates. "
831+
f"Only downloading {number_unique} unique product(s).", DuplicateResultsWarning)
832+
833+
return unique_products
834+
805835

806836
@async_to_sync
807837
class MastClass(MastQueryWithLogin):

astroquery/mast/tests/test_mast_remote.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
from astroquery import mast
1515

1616
from ..utils import ResolverError
17-
from ...exceptions import InvalidQueryError, MaxResultsWarning, NoResultsWarning, RemoteServiceError
17+
from ...exceptions import (InvalidQueryError, MaxResultsWarning, NoResultsWarning,
18+
DuplicateResultsWarning, RemoteServiceError)
1819

1920

2021
OBSID = '1647157'
@@ -274,7 +275,7 @@ def test_observations_download_products(self, tmpdir):
274275
assert os.path.isfile(row['Local Path'])
275276

276277
# just get the curl script
277-
result = mast.Observations.download_products(test_obs[0]["obsid"],
278+
result = mast.Observations.download_products(test_obs_id[0]["obsid"],
278279
download_dir=str(tmpdir),
279280
curl_flag=True,
280281
productType=["SCIENCE"],
@@ -283,12 +284,41 @@ def test_observations_download_products(self, tmpdir):
283284
assert os.path.isfile(result['Local Path'][0])
284285

285286
# check for row input
286-
result1 = mast.Observations.get_product_list(test_obs[0]["obsid"])
287+
result1 = mast.Observations.get_product_list(test_obs_id[0]["obsid"])
287288
result2 = mast.Observations.download_products(result1[0])
288289
assert isinstance(result2, Table)
289290
assert os.path.isfile(result2['Local Path'][0])
290291
assert len(result2) == 1
291292

293+
def test_observations_download_products_no_duplicates(tmpdir):
294+
295+
# Pull products for a JWST NIRSpec MSA observation with 6 known
296+
# duplicates of the MSA configuration file, propID=2736
297+
products = mast.Observations.get_product_list("87602009")
298+
299+
# Filter out everything but the MSA config file
300+
mask = np.char.find(products["dataURI"], "_msa.fits") != -1
301+
products = products[mask]
302+
303+
assert len(products) == 6
304+
305+
# Download the product
306+
with pytest.warns(DuplicateResultsWarning):
307+
manifest = mast.Observations.download_products(products,
308+
download_dir=str(tmpdir))
309+
310+
# Check that it downloads the MSA config file only once
311+
assert len(manifest) == 1
312+
313+
# enable access to public AWS S3 bucket
314+
mast.Observations.enable_cloud_dataset()
315+
316+
# Check duplicate cloud URIs as well
317+
with pytest.warns(DuplicateResultsWarning):
318+
uris = mast.Observations.get_cloud_uris(products)
319+
320+
assert len(uris) == 1
321+
292322
def test_observations_download_file(self, tmpdir):
293323

294324
# enabling cloud connection

0 commit comments

Comments
 (0)