Skip to content

Commit 54e6b58

Browse files
authored
Merge pull request #1683 from keflavich/alma_list_more_files
Fix for alma issue: show all files
2 parents 67db677 + 6b8fca6 commit 54e6b58

File tree

3 files changed

+98
-350
lines changed

3 files changed

+98
-350
lines changed

CHANGES.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ alma
3737
refactor. The user-facing API should remain mostly the same, but some
3838
service interruption may have occurred. Note that the ``stage_data`` column
3939
``uid`` has been renamed ``mous_uid``, which is a technical correction, and
40-
several columns have been added [#1644,#1665]
40+
several columns have been added [#1644,#1665,#1683]
41+
- The contents of tarfiles can be shown with the ``expand_tarfiles`` keyword
42+
to ``stage_data`` [#1683]
4143

4244

4345
Infrastructure, Utility and Other Changes and Additions

astroquery/alma/core.py

Lines changed: 46 additions & 230 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def _get_dataarchive_url(self):
240240
"on github.")
241241
return self.dataarchive_url
242242

243-
def stage_data(self, uids):
243+
def stage_data(self, uids, expand_tarfiles=False, return_json=False):
244244
"""
245245
Obtain table of ALMA files
246246
@@ -249,6 +249,13 @@ def stage_data(self, uids):
249249
uids : list or str
250250
A list of valid UIDs or a single UID.
251251
UIDs should have the form: 'uid://A002/X391d0b/X7b'
252+
expand_tarfiles : bool
253+
Expand the tarfiles to obtain lists of all contained files. If
254+
this is specified, the parent tarfile will *not* be included
255+
return_json : bool
256+
Return a list of the JSON data sets returned from the query. This
257+
is primarily intended as a debug routine, but may be useful if there
258+
are unusual scheduling block layouts.
252259
253260
Returns
254261
-------
@@ -280,33 +287,50 @@ def stage_data(self, uids):
280287
# this indicates a wrong server is being used;
281288
# the "pre-feb2020" stager will be phased out
282289
# when the new services are deployed
283-
return self.stage_data_prefeb2020(uids)
290+
raise RemoteServiceError("Failed query! This shouldn't happen - please "
291+
"report the issue as it may indicate a change in "
292+
"the ALMA servers.")
284293
else:
285294
raise
286-
if jdata['type'] != 'PROJECT':
287-
log.error("Skipped uid {uu} because it is not a project and"
288-
"lacks the appropriate metadata; it is a "
289-
"{jdata}".format(uu=uu, jdata=jdata['type']))
290-
continue
291-
table = uid_json_to_table(jdata)
292-
table['sizeInBytes'].unit = u.B
293-
table.rename_column('sizeInBytes', 'size')
294-
table.add_column(Column(data=['{dataarchive_url}/dataPortal/{name}'
295-
.format(dataarchive_url=dataarchive_url,
296-
name=name)
297-
for name in table['name']],
298-
name='URL'))
299-
300-
isp = self.is_proprietary(uid)
301-
table.add_column(Column(data=[isp for row in table],
302-
name='isProprietary'))
303-
304-
tables.append(table)
305-
log.debug("Completed metadata retrieval for {0}".format(uu))
295+
296+
if return_json:
297+
tables.append(jdata)
298+
else:
299+
if jdata['type'] != 'PROJECT':
300+
log.error("Skipped uid {uu} because it is not a project and "
301+
"lacks the appropriate metadata; it is a "
302+
"{jdata}".format(uu=uu, jdata=jdata['type']))
303+
continue
304+
if expand_tarfiles:
305+
table = uid_json_to_table(jdata, productlist=['ASDM',
306+
'PIPELINE_PRODUCT'])
307+
else:
308+
table = uid_json_to_table(jdata,
309+
productlist=['ASDM',
310+
'PIPELINE_PRODUCT',
311+
'PIPELINE_PRODUCT_TARFILE',
312+
'PIPELINE_AUXILIARY_TARFILE'])
313+
table['sizeInBytes'].unit = u.B
314+
table.rename_column('sizeInBytes', 'size')
315+
table.add_column(Column(data=['{dataarchive_url}/dataPortal/{name}'
316+
.format(dataarchive_url=dataarchive_url,
317+
name=name)
318+
for name in table['name']],
319+
name='URL'))
320+
321+
isp = self.is_proprietary(uid)
322+
table.add_column(Column(data=[isp for row in table],
323+
name='isProprietary'))
324+
325+
tables.append(table)
326+
log.debug("Completed metadata retrieval for {0}".format(uu))
306327

307328
if len(tables) == 0:
308329
raise ValueError("No valid UIDs supplied.")
309330

331+
if return_json:
332+
return tables
333+
310334
table = table_vstack(tables)
311335

312336
return table
@@ -330,167 +354,6 @@ def is_proprietary(self, uid):
330354

331355
return isp
332356

333-
def stage_data_prefeb2020(self, uids):
334-
"""
335-
Stage ALMA data - old server style
336-
337-
NOTE: this method will be removed when a new ALMA service is deployed
338-
in March 2020
339-
340-
Parameters
341-
----------
342-
uids : list or str
343-
A list of valid UIDs or a single UID.
344-
UIDs should have the form: 'uid://A002/X391d0b/X7b'
345-
346-
Returns
347-
-------
348-
data_file_table : Table
349-
A table containing 3 columns: the UID, the file URL (for future
350-
downloading), and the file size
351-
"""
352-
353-
"""
354-
With log.set_level(10)
355-
INFO: Staging files... [astroquery.alma.core]
356-
DEBUG: First request URL: https://almascience.eso.org/rh/submission [astroquery.alma.core]
357-
DEBUG: First request payload: {'dataset': [u'ALMA+uid___A002_X3b3400_X90f']} [astroquery.alma.core]
358-
DEBUG: First response URL: https://almascience.eso.org/rh/checkAuthenticationStatus/3f98de33-197e-4692-9afa-496842032ea9/submission [astroquery.alma.core]
359-
DEBUG: Request ID: 3f98de33-197e-4692-9afa-496842032ea9 [astroquery.alma.core]
360-
DEBUG: Submission URL: https://almascience.eso.org/rh/submission/3f98de33-197e-4692-9afa-496842032ea9 [astroquery.alma.core]
361-
.DEBUG: Data list URL: https://almascience.eso.org/rh/requests/anonymous/786823226 [astroquery.alma.core]
362-
"""
363-
364-
import time
365-
from requests import HTTPError
366-
from ..utils import url_helpers
367-
import sys
368-
from six.moves.urllib_parse import urlparse
369-
370-
if isinstance(uids, six.string_types + (np.bytes_,)):
371-
uids = [uids]
372-
if not isinstance(uids, (list, tuple, np.ndarray)):
373-
raise TypeError("Datasets must be given as a list of strings.")
374-
375-
log.info("Staging files...")
376-
377-
self._get_dataarchive_url()
378-
379-
url = urljoin(self._get_dataarchive_url(), 'rh/submission')
380-
log.debug("First request URL: {0}".format(url))
381-
# 'ALMA+uid___A002_X391d0b_X7b'
382-
payload = {'dataset': ['ALMA+' + clean_uid(uid) for uid in uids]}
383-
log.debug("First request payload: {0}".format(payload))
384-
385-
self._staging_log = {'first_post_url': url}
386-
387-
# Request staging for the UIDs
388-
# This component cannot be cached, since the returned data can change
389-
# if new data are uploaded
390-
response = self._request('POST', url, data=payload,
391-
timeout=self.TIMEOUT, cache=False)
392-
self._staging_log['initial_response'] = response
393-
log.debug("First response URL: {0}".format(response.url))
394-
if 'login' in response.url:
395-
raise ValueError("You must login before downloading this data set.")
396-
397-
if response.status_code == 405:
398-
if hasattr(self, '_last_successful_staging_log'):
399-
log.warning("Error 405 received. If you have previously staged "
400-
"the same UIDs, the result returned is probably "
401-
"correct, otherwise you may need to create a fresh "
402-
"astroquery.Alma instance.")
403-
return self._last_successful_staging_log['result']
404-
else:
405-
raise HTTPError("Received an error 405: this may indicate you "
406-
"have already staged the data. Try downloading "
407-
"the file URLs directly with download_files.")
408-
response.raise_for_status()
409-
410-
if 'j_spring_cas_security_check' in response.url:
411-
time.sleep(1)
412-
# CANNOT cache this stage: it not a real data page! results in
413-
# infinite loops
414-
response = self._request('POST', url, data=payload,
415-
timeout=self.TIMEOUT, cache=False)
416-
self._staging_log['initial_response'] = response
417-
if 'j_spring_cas_security_check' in response.url:
418-
log.warning("Staging request was not successful. Try again?")
419-
response.raise_for_status()
420-
421-
if 'j_spring_cas_security_check' in response.url:
422-
raise RemoteServiceError("Could not access data. This error "
423-
"can arise if the data are private and "
424-
"you do not have access rights or are "
425-
"not logged in.")
426-
427-
# make sure the URL is formatted as expected, otherwise the request ID
428-
# will be wrong
429-
# (the request ID can also be found from the javascript in the request
430-
# response)
431-
if response.url.split("/")[-1] == 'submission':
432-
request_id = response.url.split("/")[-2]
433-
self._staging_log['request_id'] = request_id
434-
log.debug("Request ID: {0}".format(request_id))
435-
436-
# Submit a request for the specific request ID identified above
437-
submission_url = urljoin(self._get_dataarchive_url(),
438-
url_helpers.join('rh/submission', request_id))
439-
log.debug("Submission URL: {0}".format(submission_url))
440-
self._staging_log['submission_url'] = submission_url
441-
staging_submission = self._request('GET', submission_url, cache=True)
442-
self._staging_log['staging_submission'] = staging_submission
443-
staging_submission.raise_for_status()
444-
445-
data_page_url = staging_submission.url
446-
elif response.url.split("/")[-3] == 'requests':
447-
data_page_url = response.url
448-
449-
self._staging_log['data_page_url'] = data_page_url
450-
dpid = data_page_url.split("/")[-1]
451-
self._staging_log['staging_page_id'] = dpid
452-
453-
# CANNOT cache this step: please_wait will happen infinitely
454-
data_page = self._request('GET', data_page_url, cache=False)
455-
self._staging_log['data_page'] = data_page
456-
data_page.raise_for_status()
457-
458-
has_completed = False
459-
while not has_completed:
460-
time.sleep(1)
461-
summary = self._request('GET', url_helpers.join(data_page_url,
462-
'summary'),
463-
cache=False)
464-
summary.raise_for_status()
465-
print(".", end='')
466-
sys.stdout.flush()
467-
has_completed = summary.json()['complete']
468-
469-
self._staging_log['summary'] = summary
470-
summary.raise_for_status()
471-
self._staging_log['json_data'] = json_data = summary.json()
472-
473-
username = self.USERNAME if self.USERNAME else 'anonymous'
474-
475-
# templates:
476-
# https://almascience.eso.org/dataPortal/requests/keflavich/946895898/ALMA/
477-
# 2013.1.00308.S_uid___A001_X196_X93_001_of_001.tar/2013.1.00308.S_uid___A001_X196_X93_001_of_001.tar
478-
# uid___A002_X9ee74a_X26f0/2013.1.00308.S_uid___A002_X9ee74a_X26f0.asdm.sdm.tar
479-
480-
url_decomposed = urlparse(data_page_url)
481-
base_url = ('{uri.scheme}://{uri.netloc}/'
482-
'dataPortal/requests/{username}/'
483-
'{staging_page_id}/ALMA'.format(uri=url_decomposed,
484-
staging_page_id=dpid,
485-
username=username,
486-
))
487-
tbl = self._json_summary_to_table(json_data, base_url=base_url)
488-
self._staging_log['result'] = tbl
489-
self._staging_log['file_urls'] = tbl['URL']
490-
self._last_successful_staging_log = self._staging_log
491-
492-
return tbl
493-
494357
def _HEADER_data_size(self, files):
495358
"""
496359
Given a list of file URLs, return the data size. This is useful for
@@ -1088,53 +951,6 @@ def _validate_payload(self, payload):
1088951
" by the ALMA query service:"
1089952
" {0}".format(invalid_params))
1090953

1091-
def _json_summary_to_table(self, data, base_url):
1092-
"""
1093-
Special tool to convert some JSON metadata to a table Obsolete as of
1094-
March 2020 - should be removed along with stage_data_prefeb2020
1095-
"""
1096-
from ..utils import url_helpers
1097-
from six import iteritems
1098-
columns = {'mous_uid': [], 'URL': [], 'size': []}
1099-
for entry in data['node_data']:
1100-
# de_type can be useful (e.g., MOUS), but it is not necessarily
1101-
# specified
1102-
# file_name and file_key *must* be specified.
1103-
is_file = (entry['file_name'] != 'null' and
1104-
entry['file_key'] != 'null')
1105-
if is_file:
1106-
# "de_name": "ALMA+uid://A001/X122/X35e",
1107-
columns['mous_uid'].append(entry['de_name'][5:])
1108-
if entry['file_size'] == 'null':
1109-
columns['size'].append(np.nan * u.Gbyte)
1110-
else:
1111-
columns['size'].append(
1112-
(int(entry['file_size']) * u.B).to(u.Gbyte))
1113-
# example template for constructing url:
1114-
# https://almascience.eso.org/dataPortal/requests/keflavich/940238268/ALMA/
1115-
# uid___A002_X9d6f4c_X154/2013.1.00546.S_uid___A002_X9d6f4c_X154.asdm.sdm.tar
1116-
# above is WRONG... except for ASDMs, when it's right
1117-
# should be:
1118-
# 2013.1.00546.S_uid___A002_X9d6f4c_X154.asdm.sdm.tar/2013.1.00546.S_uid___A002_X9d6f4c_X154.asdm.sdm.tar
1119-
#
1120-
# apparently ASDMs are different from others:
1121-
# templates:
1122-
# https://almascience.eso.org/dataPortal/requests/keflavich/946895898/ALMA/
1123-
# 2013.1.00308.S_uid___A001_X196_X93_001_of_001.tar/2013.1.00308.S_uid___A001_X196_X93_001_of_001.tar
1124-
# uid___A002_X9ee74a_X26f0/2013.1.00308.S_uid___A002_X9ee74a_X26f0.asdm.sdm.tar
1125-
url = url_helpers.join(base_url,
1126-
entry['file_key'],
1127-
entry['file_name'])
1128-
if 'null' in url:
1129-
raise ValueError("The URL {0} was created containing "
1130-
"'null', which is invalid.".format(url))
1131-
columns['URL'].append(url)
1132-
1133-
columns['size'] = u.Quantity(columns['size'], u.Gbyte)
1134-
1135-
tbl = Table([Column(name=k, data=v) for k, v in iteritems(columns)])
1136-
return tbl
1137-
1138954
def get_project_metadata(self, projectid, cache=True):
1139955
"""
1140956
Get the metadata - specifically, the project abstract - for a given project ID.

0 commit comments

Comments
 (0)