Skip to content

Commit 97f6f30

Browse files
authored
Merge pull request #2263 from keflavich/verify_only_alma
ALMA: add option to just validate data
2 parents d17805e + 438e3af commit 97f6f30

File tree

5 files changed

+113
-10
lines changed

5 files changed

+113
-10
lines changed

CHANGES.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ esa.hubble
1010

1111
Service fixes and enhancements
1212
------------------------------
13+
alma
14+
^^^^
15+
16+
- Added ``verify_only`` option to check whether already-downloaded data have the correct file size [#2263]
17+
1318
esa.hubble
1419
^^^^^^^^^^
1520

astroquery/alma/core.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
_gen_science_sql, _gen_spec_res_sql, ALMA_DATE_FORMAT
3232
from . import conf, auth_urls
3333
from astroquery.utils.commons import ASTROPY_LT_4_1
34+
from astroquery.exceptions import CorruptDataWarning
3435

3536
__all__ = {'AlmaClass', 'ALMA_BANDS'}
3637

@@ -685,7 +686,8 @@ def _HEADER_data_size(self, files):
685686
return data_sizes, totalsize.to(u.GB)
686687

687688
def download_files(self, files, savedir=None, cache=True,
688-
continuation=True, skip_unauthorized=True,):
689+
continuation=True, skip_unauthorized=True,
690+
verify_only=False):
689691
"""
690692
Given a list of file URLs, download them
691693
@@ -706,6 +708,10 @@ def download_files(self, files, savedir=None, cache=True,
706708
If you receive "unauthorized" responses for some of the download
707709
requests, skip over them. If this is False, an exception will be
708710
raised.
711+
verify_only : bool
712+
If True, do not download anything; only compare the size of each
713+
local file with the size reported by the server. This
714+
option may be useful if a previous download run failed partway.
709715
"""
710716

711717
if self.USERNAME:
@@ -743,15 +749,34 @@ def download_files(self, files, savedir=None, cache=True,
743749
filename = os.path.join(savedir,
744750
filename)
745751

752+
if verify_only:
753+
existing_file_length = os.stat(filename).st_size
754+
if 'content-length' in check_filename.headers:
755+
length = int(check_filename.headers['content-length'])
756+
if length == 0:
757+
warnings.warn('URL {0} has length=0'.format(url))
758+
elif existing_file_length == length:
759+
log.info(f"Found cached file {filename} with expected size {existing_file_length}.")
760+
elif existing_file_length < length:
761+
log.info(f"Found cached file {filename} with size {existing_file_length} < expected "
762+
f"size {length}. The download should be continued.")
763+
elif existing_file_length > length:
764+
warnings.warn(f"Found cached file {filename} with size {existing_file_length} > expected "
765+
f"size {length}. The download is likely corrupted.",
766+
CorruptDataWarning)
767+
else:
768+
warnings.warn(f"Could not verify {url} because it has no 'content-length'")
769+
746770
try:
747-
self._download_file(file_link,
748-
filename,
749-
timeout=self.TIMEOUT,
750-
auth=auth,
751-
cache=cache,
752-
method='GET',
753-
head_safe=False,
754-
continuation=continuation)
771+
if not verify_only:
772+
self._download_file(file_link,
773+
filename,
774+
timeout=self.TIMEOUT,
775+
auth=auth,
776+
cache=cache,
777+
method='GET',
778+
head_safe=False,
779+
continuation=continuation)
755780

756781
downloaded_files.append(filename)
757782
except requests.HTTPError as ex:

astroquery/alma/tests/test_alma_remote.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import shutil
44
import numpy as np
55
import pytest
6+
import warnings
67
from datetime import datetime
78
import os
89
from urllib.parse import urlparse
@@ -12,6 +13,7 @@
1213
from astropy import coordinates
1314
from astropy import units as u
1415

16+
from astroquery.exceptions import CorruptDataWarning
1517
from astroquery.utils.commons import ASTROPY_LT_4_1
1618
from .. import Alma
1719

@@ -655,3 +657,55 @@ def test_big_download_regression(alma):
655657
def test_download_html_file(alma):
656658
result = alma.download_files(['https://almascience.nao.ac.jp/dataPortal/member.uid___A001_X1284_X1353.qa2_report.html'])
657659
assert result
660+
661+
662+
@pytest.mark.remote_data
663+
def test_verify_html_file(alma, caplog):
664+
# first, make sure the file is not cached (in case this test gets called repeatedly)
665+
# (we are hacking the file later in this test to trigger different failure modes so
666+
# we need it fresh)
667+
try:
668+
result = alma.download_files(['https://almascience.nao.ac.jp/dataPortal/member.uid___A001_X1284_X1353.qa2_report.html'], verify_only=True)
669+
local_filepath = result[0]
670+
os.remove(local_filepath)
671+
except FileNotFoundError:
672+
pass
673+
674+
caplog.clear()
675+
676+
# download the file
677+
result = alma.download_files(['https://almascience.nao.ac.jp/dataPortal/member.uid___A001_X1284_X1353.qa2_report.html'])
678+
assert 'member.uid___A001_X1284_X1353.qa2_report.html' in result[0]
679+
680+
result = alma.download_files(['https://almascience.nao.ac.jp/dataPortal/member.uid___A001_X1284_X1353.qa2_report.html'], verify_only=True)
681+
assert 'member.uid___A001_X1284_X1353.qa2_report.html' in result[0]
682+
local_filepath = result[0]
683+
existing_file_length = 66336
684+
assert f"Found cached file {local_filepath} with expected size {existing_file_length}." in caplog.text
685+
686+
# manipulate the file
687+
with open(local_filepath, 'ab') as fh:
688+
fh.write(b"Extra Text")
689+
690+
caplog.clear()
691+
length = 66336
692+
existing_file_length = length + 10
693+
with pytest.warns(expected_warning=CorruptDataWarning,
694+
match=f"Found cached file {local_filepath} with size {existing_file_length} > expected size {length}. The download is likely corrupted."):
695+
result = alma.download_files(['https://almascience.nao.ac.jp/dataPortal/member.uid___A001_X1284_X1353.qa2_report.html'], verify_only=True)
696+
assert 'member.uid___A001_X1284_X1353.qa2_report.html' in result[0]
697+
698+
# manipulate the file: make it small
699+
with open(local_filepath, 'wb') as fh:
700+
fh.write(b"Empty Text")
701+
702+
caplog.clear()
703+
result = alma.download_files(['https://almascience.nao.ac.jp/dataPortal/member.uid___A001_X1284_X1353.qa2_report.html'], verify_only=True)
704+
assert 'member.uid___A001_X1284_X1353.qa2_report.html' in result[0]
705+
length = 66336
706+
existing_file_length = 10
707+
assert f"Found cached file {local_filepath} with size {existing_file_length} < expected size {length}. The download should be continued." in caplog.text
708+
709+
# cleanup: we don't want `test_download_html_file` to fail if this test is re-run
710+
if os.path.exists(local_filepath):
711+
os.remove(local_filepath)

astroquery/exceptions.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
__all__ = ['TimeoutError', 'InvalidQueryError', 'RemoteServiceError',
99
'TableParseError', 'LoginError', 'ResolverError',
1010
'NoResultsWarning', 'LargeQueryWarning', 'InputWarning',
11-
'AuthenticationWarning', 'MaxResultsWarning']
11+
'AuthenticationWarning', 'MaxResultsWarning', 'CorruptDataWarning']
1212

1313

1414
class TimeoutError(Exception):
@@ -98,6 +98,14 @@ class MaxResultsWarning(AstropyWarning):
9898
pass
9999

100100

101+
class CorruptDataWarning(AstropyWarning):
102+
"""
103+
Astroquery warning class to be issued when there is a sign that the
104+
(partially) downloaded data are corrupt.
105+
"""
106+
pass
107+
108+
101109
class EmptyResponseError(ValueError):
102110
"""
103111
Astroquery error class to be raised when the query returns an empty result

docs/alma/alma.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,17 @@ You can also do the downloading all in one step:
325325
326326
>>> myAlma.retrieve_data_from_uid(uids[0])
327327
328+
If you have huge files, sometimes the transfer fails, so you will need to
329+
restart the download. By default, the module will resume downloading where the
330+
failure occurred. You can check whether the downloads all succeeded before
331+
triggering a new download by using the ``verify_only`` keyword, which will not
332+
download anything but will report, via log messages and warnings, the state of your downloads:
333+
334+
.. code-block:: python
335+
336+
>>> myAlma.download_files(link_list, cache=True, verify_only=True)
337+
338+
328339
Downloading FITS data
329340
=====================
330341

0 commit comments

Comments
 (0)