Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ linelists.cdms

- Add a keyword to control writing of new species cache files.
This is needed to prevent tests from overwriting those files. [#3297]
- Add more complete support for CDMS quantum number and other value parsing. [#3302]

mast
^^^^
Expand Down
223 changes: 138 additions & 85 deletions astroquery/linelists/cdms/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# import configurable items declared in __init__.py
from astroquery.linelists.cdms import conf
from astroquery.exceptions import InvalidQueryError, EmptyResponseError
from astroquery import log

import re
import string
Expand All @@ -31,7 +32,7 @@ class CDMSClass(BaseQuery):
SERVER = conf.server
CLASSIC_URL = conf.classic_server
TIMEOUT = conf.timeout
MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS', '028582 H2NC', '058501 H2C2S', '064527 HC3HCN']
MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS', '028528 H2NC', '058501 H2C2S', '064527 HC3HCN']

def query_lines_async(self, min_frequency, max_frequency, *,
min_strength=-500, molecule='All',
Expand All @@ -54,7 +55,8 @@ def query_lines_async(self, min_frequency, max_frequency, *,
min_strength : int, optional
Minimum strength in catalog units, the default is -500

molecule : list, string of regex if parse_name_locally=True, optional
molecule : list or string if parse_name_locally=False,
string of regex if parse_name_locally=True, optional
Identifiers of the molecules to search for. If this parameter
is not provided the search will match any species. Default is 'All'.
As a first pass, the molecule will be searched for with a direct
Expand Down Expand Up @@ -134,18 +136,21 @@ def query_lines_async(self, min_frequency, max_frequency, *,
# changes interpretation of query
self._last_query_temperature = temperature_for_intensity

if molecule is not None:
if parse_name_locally:
self.lookup_ids = build_lookup()
luts = self.lookup_ids.find(molecule, flags)
if len(luts) == 0:
raise InvalidQueryError('No matching species found. Please '
'refine your search or read the Docs '
'for pointers on how to search.')
payload['Molecules'] = tuple(f"{val:06d} {key}"
for key, val in luts.items())[0]
else:
payload['Molecules'] = molecule
if molecule == 'All':
payload['Moleculesgrp'] = 'all species'
else:
if molecule is not None:
if parse_name_locally:
self.lookup_ids = build_lookup()
luts = self.lookup_ids.find(molecule, flags)
if len(luts) == 0:
raise InvalidQueryError('No matching species found. Please '
'refine your search or read the Docs '
'for pointers on how to search.')
payload['Molecules'] = tuple(f"{val:06d} {key}"
for key, val in luts.items())[0]
else:
payload['Molecules'] = molecule

if get_query_payload:
return payload
Expand Down Expand Up @@ -180,7 +185,7 @@ def query_lines_async(self, min_frequency, max_frequency, *,
# accounts for three formats, e.g.: '058501' or 'H2C2S' or '058501 H2C2S'
badlist = (self.MALFORMATTED_MOLECULE_LIST + # noqa
[y for x in self.MALFORMATTED_MOLECULE_LIST for y in x.split()])
if payload['Molecules'] in badlist:
if 'Moleculesgrp' not in payload.keys() and payload['Molecules'] in badlist:
raise ValueError(f"Molecule {payload['Molecules']} is known not to comply with standard CDMS format. "
f"Try get_molecule({payload['Molecules']}) instead.")

Expand Down Expand Up @@ -233,15 +238,32 @@ def _parse_result(self, response, *, verbose=False):
soup = BeautifulSoup(response.text, 'html.parser')
text = soup.find('pre').text

need_to_filter_bad_molecules = False
for bad_molecule in self.MALFORMATTED_MOLECULE_LIST:
if text.find(bad_molecule.split()[1]) > -1:
need_to_filter_bad_molecules = True
break
if need_to_filter_bad_molecules:
text_new = ''
text = text.split('\n')
for line in text:
need_to_include_line = True
for bad_molecule in self.MALFORMATTED_MOLECULE_LIST:
if line.find(bad_molecule.split()[1]) > -1:
need_to_include_line = False
break
if need_to_include_line:
text_new = text_new + '\n' + line
text = text_new

starts = {'FREQ': 0,
'ERR': 14,
'LGINT': 25,
'DR': 36,
'ELO': 38,
'GUP': 47,
'MOLWT': 51,
'TAG': 54,
'QNFMT': 58,
'TAG': 50,
'QNFMT': 57,
'Ju': 61,
'Ku': 63,
'vu': 65,
Expand All @@ -256,39 +278,47 @@ def _parse_result(self, response, *, verbose=False):
'F3l': 83,
'name': 89}

result = ascii.read(text, header_start=None, data_start=0,
comment=r'THIS|^\s{12,14}\d{4,6}.*',
names=list(starts.keys()),
col_starts=list(starts.values()),
format='fixed_width', fast_reader=False)

result['FREQ'].unit = u.MHz
result['ERR'].unit = u.MHz

result['Lab'] = result['MOLWT'] < 0
result['MOLWT'] = np.abs(result['MOLWT'])
result['MOLWT'].unit = u.Da

fix_keys = ['GUP']
for suf in 'ul':
for qn in ('J', 'v', 'K', 'F1', 'F2', 'F3'):
qnind = qn+suf
fix_keys.append(qnind)
for key in fix_keys:
if not np.issubdtype(result[key].dtype, np.integer):
intcol = np.array(list(map(parse_letternumber, result[key])),
dtype=int)
result[key] = intcol

# if there is a crash at this step, something went wrong with the query
# and the _last_query_temperature was not set. This shouldn't ever
# happen, but, well, I anticipate it will.
if self._last_query_temperature == 0:
result.rename_column('LGINT', 'LGAIJ')
result['LGAIJ'].unit = u.s**-1
else:
result['LGINT'].unit = u.nm**2 * u.MHz
result['ELO'].unit = u.cm**(-1)
try:
result = ascii.read(text, header_start=None, data_start=0,
comment=r'THIS|^\s{12,14}\d{4,6}.*',
names=list(starts.keys()),
col_starts=list(starts.values()),
format='fixed_width', fast_reader=False)

result['FREQ'].unit = u.MHz
result['ERR'].unit = u.MHz

result['MOLWT'] = [int(x/1e3) for x in result['TAG']]
result['Lab'] = result['MOLWT'] < 0
result['MOLWT'] = np.abs(result['MOLWT'])
result['MOLWT'].unit = u.Da

fix_keys = ['GUP']
for suf in 'ul':
for qn in ('J', 'v', 'K', 'F1', 'F2', 'F3'):
qnind = qn+suf
fix_keys.append(qnind)
for key in fix_keys:
if not np.issubdtype(result[key].dtype, np.integer):
intcol = np.array(list(map(parse_letternumber, result[key])),
dtype=int)
result[key] = intcol

# if there is a crash at this step, something went wrong with the query
# and the _last_query_temperature was not set. This shouldn't ever
# happen, but, well, I anticipate it will.
if self._last_query_temperature == 0:
result.rename_column('LGINT', 'LGAIJ')
result['LGAIJ'].unit = u.s**-1
else:
result['LGINT'].unit = u.nm**2 * u.MHz
result['ELO'].unit = u.cm**(-1)
except ValueError as ex:
# Give users a more helpful exception when parsing fails
new_message = ("Failed to parse CDMS response. This may be caused by a malformed search return. "
"You can check this by running `CDMS.get_molecule('<id>')` instead; if it works, the "
"problem is caused by the CDMS search interface and cannot be worked around.")
raise ValueError(new_message) from ex

return result

Expand Down Expand Up @@ -387,35 +417,50 @@ def tryfloat(x):

return result

def get_molecule(self, molecule_id, *, cache=True):
def get_molecule(self, molecule_id, *, cache=True, return_response=False):
"""
Retrieve the whole molecule table for a given molecule id

Parameters
----------
molecule_id : str
The 6-digit molecule identifier as a string
cache : bool
Defaults to True. If set overrides global caching behavior.
See :ref:`caching documentation <astroquery_cache>`.
return_response : bool, optional
If True, return the raw `requests.Response` object instead of parsing
the response. If this is set, the response will be returned whether
or not it was successful. Default is False.
"""
if not isinstance(molecule_id, str) or len(molecule_id) != 6:
raise ValueError("molecule_id should be a length-6 string of numbers")
url = f'{self.CLASSIC_URL}/entries/c{molecule_id}.cat'
response = self._request(method='GET', url=url,
timeout=self.TIMEOUT, cache=cache)
result = self._parse_cat(response)

if return_response:
return response

response.raise_for_status()

if 'Zero lines were found' in response.text:
raise EmptyResponseError(f"Response was empty; message was '{response.text}'.")

result = self._parse_cat(response.text)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we call `raise_for_status()` before this? Or could we even move that `raise_for_status()` call upstream, into `_request()`? That case would be a follow-up rather than part of this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I'm just adding the `raise_for_status()` here for now. Doing it in `_request` is an idea, but then it changes the behavior of `_request` from being near-equivalent to `requests.get`/`requests.post` to being very different. I don't like that: it would have a lot of secondary effects that I expect would make debugging harder.


species_table = self.get_species_table()
result.meta = dict(species_table.loc[int(molecule_id)])

return result

def _parse_cat(self, response, *, verbose=False):
def _parse_cat(self, text, *, verbose=False):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick, but maybe call it text_response :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function works on any .cat file from CDMS, so I'd like to consider making it public and leaving this as-is. Maybe that can be a different PR, though.

"""
Parse a catalog response into an `~astropy.table.Table`

See details in _parse_response; this is a very similar function,
but the catalog responses have a slightly different format.
"""

if 'Zero lines were found' in response.text:
raise EmptyResponseError(f"Response was empty; message was '{response.text}'.")

text = response.text

# notes about the format
# [F13.4, 2F8.4, I2, F10.4, I3, I7, I4, 12I2]: FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT, QN noqa
# 13 21 29 31 41 44 51 55 57 59 61 63 65 67 69 71 73 75 77 79 noqa
Expand All @@ -426,21 +471,21 @@ def _parse_cat(self, response, *, verbose=False):
'ELO': 32,
'GUP': 42,
'TAG': 44,
'QNFMT': 52,
'Q1': 56,
'Q2': 58,
'Q3': 60,
'Q4': 62,
'Q5': 64,
'Q6': 66,
'Q7': 68,
'Q8': 70,
'Q9': 72,
'Q10': 74,
'Q11': 76,
'Q12': 78,
'Q13': 80,
'Q14': 82,
'QNFMT': 51,
'Q1': 55,
'Q2': 57,
'Q3': 59,
'Q4': 61,
'Q5': 63,
'Q6': 65,
'Q7': 67,
'Q8': 69,
'Q9': 71,
'Q10': 73,
'Q11': 75,
'Q12': 77,
'Q13': 79,
'Q14': 81,
}

result = ascii.read(text, header_start=None, data_start=0,
Expand All @@ -450,7 +495,7 @@ def _parse_cat(self, response, *, verbose=False):
format='fixed_width', fast_reader=False)

# int truncates - which is what we want
result['MOLWT'] = [int(x/1e4) for x in result['TAG']]
result['MOLWT'] = [int(x/1e3) for x in result['TAG']]

result['FREQ'].unit = u.MHz
result['ERR'].unit = u.MHz
Expand All @@ -460,15 +505,18 @@ def _parse_cat(self, response, *, verbose=False):
result['MOLWT'].unit = u.Da

fix_keys = ['GUP']
for suf in '':
for qn in (f'Q{ii}' for ii in range(1, 15)):
qnind = qn+suf
fix_keys.append(qnind)
for qn in (f'Q{ii}' for ii in range(1, 15)):
fix_keys.append(qn)
log.debug(f"fix_keys: {fix_keys} should include Q1, Q2, ..., Q14 and GUP")
for key in fix_keys:
if not np.issubdtype(result[key].dtype, np.integer):
intcol = np.array(list(map(parse_letternumber, result[key])),
dtype=int)
if any(intcol == -999999):
intcol = np.ma.masked_where(intcol == -999999, intcol)
result[key] = intcol
if not np.issubdtype(result[key].dtype, np.integer):
raise ValueError(f"Failed to parse {key} as integer")

result['LGINT'].unit = u.nm**2 * u.MHz
result['ELO'].unit = u.cm**(-1)
Expand All @@ -481,18 +529,23 @@ def _parse_cat(self, response, *, verbose=False):

def parse_letternumber(st):
    """
    Parse CDMS's two-letter QNs into integers.

    Masked values are converted to -999999.

    From the CDMS docs:
    "Exactly two characters are available for each quantum number. Therefore, half
    integer quanta are rounded up ! In addition, capital letters are used to
    indicate quantum numbers larger than 99. E. g. A0 is 100, Z9 is 359. Lower case characters
    are used similarly to signal negative quantum numbers smaller than –9. e. g., a0 is –10, b0 is –20, etc."

    Parameters
    ----------
    st : str or masked value
        The quantum-number field to decode, e.g. ``"12"``, ``"A0"`` or ``"a0"``.

    Returns
    -------
    int
        The decoded quantum number, or -999999 if ``st`` is masked.

    Raises
    ------
    ValueError
        If the substituted text is not a valid integer literal.
    """
    # Masked table cells carry no quantum number; flag them with a sentinel
    # that callers can turn back into a mask.
    if np.ma.is_masked(st):
        return -999999

    lower = string.ascii_lowercase
    upper = string.ascii_uppercase

    # Each character is substituted independently and the pieces are joined
    # before the final int() conversion.
    pieces = []
    for char in st:
        if char in lower:
            # a -> "-1", b -> "-2", ... so that "a0" reads as -10, "b0" as -20
            pieces.append('-' + str(lower.index(char) + 1))
        elif char in upper:
            # A -> "10", B -> "11", ... so that "A0" reads as 100, "Z9" as 359
            pieces.append(str(upper.index(char) + 10))
        else:
            pieces.append(char)
    return int(''.join(pieces))

Expand Down
6 changes: 5 additions & 1 deletion astroquery/linelists/cdms/tests/test_cdms.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def test_query(patch_post):
assert tbl['LGINT'][0] == -7.1425
assert tbl['GUP'][0] == 3
assert tbl['GUP'][7] == 17
assert tbl['MOLWT'][0] == 28


def test_parseletternumber():
Expand All @@ -99,9 +100,12 @@ def test_parseletternumber():
assert parse_letternumber("Z9") == 359

# inferred?
assert parse_letternumber("z9") == -359
assert parse_letternumber("a0") == -10
assert parse_letternumber("b0") == -20
assert parse_letternumber("ZZ") == 3535

assert parse_letternumber(np.ma.masked) == -999999


def test_hc7s(patch_post):
"""
Expand Down
Loading
Loading