# used to lookup the TAP service on an ARC
TAP_SERVICE_PATH = 'tap'

+# standard ID URI to look for when expanding TAR files
+DATALINK_STANDARD_ID = 'ivo://ivoa.net/std/DataLink#links-1.0'
+
# used to lookup the DataLink service on an ARC
DATALINK_SERVICE_PATH = 'datalink/sync'

@@ -503,7 +506,7 @@ def _get_dataarchive_url(self):
        """
        if not hasattr(self, 'dataarchive_url'):
            if self.archive_url in ('http://almascience.org', 'https://almascience.org'):
-                response = self._request('GET', self.archive_url,
+                response = self._request('GET', self.archive_url, timeout=self.TIMEOUT,
                                         cache=False)
                response.raise_for_status()
                # Jan 2017: we have to force https because the archive doesn't
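Not part of the diff, but for context: the `_request` calls touched in this patch now pass the class-level `TIMEOUT`, so a stalled connection fails instead of hanging. A minimal sketch of how a caller could loosen that limit (300 is an arbitrary example value):

```python
from astroquery.alma import Alma

# TIMEOUT is a plain class attribute consulted by the patched _request calls;
# assigning to it changes the limit (in seconds) for subsequent requests.
Alma.TIMEOUT = 300
```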
@@ -557,15 +560,18 @@ def get_data_info(self, uids, *, expand_tarfiles=False,
            raise TypeError("Datasets must be given as a list of strings.")
        # TODO remove this loop and send uids at once when pyvo fixed
        result = None
-        service_def_dict = {}
+        datalink_service_def_dict = {}
        for uid in uids:
            res = self.datalink.run_sync(uid)
            if res.status[0] != 'OK':
                raise Exception('ERROR {}: {}'.format(res.status[0],
                                                      res.status[1]))

-            # Dictionary of service_def entries
-            service_def_dict.update({row.service_def: row.access_url for row in res.iter_procs()})
+            # Collect the ad-hoc DataLink services for later retrieval if expand_tarfiles is set
+            if expand_tarfiles:
+                for adhoc_service in res.iter_adhocservices():
+                    if self.is_datalink_adhoc_service(adhoc_service):
+                        datalink_service_def_dict[adhoc_service.ID] = adhoc_service

            temp = res.to_table()
            if commons.ASTROPY_LT_4_1:
@@ -589,26 +595,19 @@ def get_data_info(self, uids, *, expand_tarfiles=False,
        if not with_rawdata:
            result = result[np.core.defchararray.find(
                result['semantics'], '#progenitor') == -1]
-        # primary data delivery type is files packaged in tarballs. However
-        # some type of data has an alternative way to retrieve each individual
-        # file as an alternative (semantics='#datalink' and
-        # 'content_type=application/x-votable+xml;content=datalink'). They also
-        # require an extra call to the datalink service to get the list of
-        # files.
-        DATALINK_FILE_TYPE = 'application/x-votable+xml;content=datalink'
        # if expand_tarfiles:
        # identify the tarballs that can be expandable and replace them
        # with the list of components
        expanded_result = None
        to_delete = []
        if expand_tarfiles:
            for index, row in enumerate(result):
-                # Recursive DataLink, so look for service_def
-                if row['service_def'] and row['content_type'] == DATALINK_FILE_TYPE:
+                service_def_id = row['service_def']
+                # service_def record, so check if it points to a DataLink document
+                if service_def_id and service_def_id in datalink_service_def_dict:
                    # subsequent call to datalink
-
-                    # Lookup the access_url from the service_def RESOURCE entries.
-                    recursive_access_url = service_def_dict[row['service_def']]
+                    adhoc_service = datalink_service_def_dict[service_def_id]
+                    recursive_access_url = self.get_adhoc_service_access_url(adhoc_service)
                    file_id = recursive_access_url.split('ID=')[1]
                    expanded_tar = self.get_data_info(file_id)
                    expanded_tar = expanded_tar[
@@ -630,6 +629,18 @@ def get_data_info(self, uids, *, expand_tarfiles=False,

        return result

+    def is_datalink_adhoc_service(self, adhoc_service):
+        standard_id = self.get_adhoc_service_parameter(adhoc_service, 'standardID')
+        return standard_id == DATALINK_STANDARD_ID
+
+    def get_adhoc_service_access_url(self, adhoc_service):
+        return self.get_adhoc_service_parameter(adhoc_service, 'accessURL')
+
+    def get_adhoc_service_parameter(self, adhoc_service, parameter_id):
+        for p in adhoc_service.params:
+            if p.ID == parameter_id:
+                return p.value
+
    def is_proprietary(self, uid):
        """
        Given an ALMA UID, query the servers to determine whether it is
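A rough usage sketch of the three helpers added above (not part of the patch; the UID is a placeholder and network access is required). It lists the recursive DataLink services attached to a DataLink response, which is exactly what `get_data_info` consults when `expand_tarfiles=True`:

```python
from astroquery.alma import Alma

alma = Alma()
# Placeholder member OUS UID; any valid ALMA UID would do.
res = alma.datalink.run_sync('uid://A001/X87c/X572')

for adhoc_service in res.iter_adhocservices():
    # Keep only ad-hoc services that declare the DataLink standardID, i.e.
    # entries that point at a further DataLink document to expand.
    if alma.is_datalink_adhoc_service(adhoc_service):
        print(adhoc_service.ID, alma.get_adhoc_service_access_url(adhoc_service))
```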
@@ -708,7 +719,7 @@ def download_files(self, files, *, savedir=None, cache=True,
        for file_link in unique(files):
            log.debug("Downloading {0} to {1}".format(file_link, savedir))
            try:
-                check_filename = self._request('HEAD', file_link, auth=auth)
+                check_filename = self._request('HEAD', file_link, auth=auth, timeout=self.TIMEOUT)
                check_filename.raise_for_status()
            except requests.HTTPError as ex:
                if ex.response.status_code == 401:
@@ -988,7 +999,7 @@ def _cycle0_tarfile_content(self):
        if not hasattr(self, '_cycle0_tarfile_content_table'):
            url = urljoin(self._get_dataarchive_url(),
                          'alma-data/archive/cycle-0-tarfile-content')
-            response = self._request('GET', url, cache=True)
+            response = self._request('GET', url, cache=True, timeout=self.TIMEOUT)

            # html.parser is needed because some <tr>'s have form:
            # <tr width="blah"> which the default parser does not pick up
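End to end, the change means tarballs are expanded by resolving the `accessURL` of the DataLink ad-hoc service named in their `service_def`, instead of matching on `content_type`. A hedged sketch of the public entry point (placeholder UID, network access required):

```python
from astroquery.alma import Alma

alma = Alma()
uid = 'uid://A001/X87c/X572'  # placeholder UID

# With expand_tarfiles=True, rows whose service_def resolves to one of the
# collected DataLink ad-hoc services are replaced by the individual files
# listed in that recursive DataLink document.
files = alma.get_data_info(uid, expand_tarfiles=True)
print(files['access_url', 'content_length', 'content_type'])
```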