Skip to content

Commit 729fd4d

Browse files
authored
Merge pull request #2385 from keflavich/issue2375
Fix molecule parsing issue for CDMS
2 parents ad6cb80 + 777d169 commit 729fd4d

File tree

8 files changed

+265
-61
lines changed

8 files changed

+265
-61
lines changed

CHANGES.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ jplsbdb
3232
- Fix a bug for jplsbdb query when the returned physical quantity contains
3333
a unit with exponential. [#2377]
3434

35+
linelists.cdms
36+
^^^^^^^^^^^^^^
37+
38+
- Fix issues with the line name parser and the line data parser; the original
39+
implementation was incomplete. [#2385]
3540

3641
Infrastructure, Utility and Other Changes and Additions
3742
-------------------------------------------------------

astroquery/linelists/cdms/core.py

Lines changed: 99 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@
1010
from astroquery.utils import async_to_sync
1111
# import configurable items declared in __init__.py
1212
from astroquery.linelists.cdms import conf
13-
from astroquery.jplspec import lookup_table
1413
from astroquery.exceptions import InvalidQueryError, EmptyResponseError
1514

15+
import re
16+
import string
1617

1718
__all__ = ['CDMS', 'CDMSClass']
1819

@@ -52,6 +53,13 @@ def query_lines_async(self, min_frequency, max_frequency, *,
5253
molecule : list, string of regex if parse_name_locally=True, optional
5354
Identifiers of the molecules to search for. If this parameter
5455
is not provided the search will match any species. Default is 'All'.
56+
As a first pass, the molecule will be searched for with a direct
57+
string match. If no string match is found, a regular expression
58+
match is attempted. Note that if the molecule name regex contains
59+
parentheses, they must be escaped. For example, 'H2C(CN)2.*' must be
60+
specified as 'H2C\\(CN\\)2.*' (but because of the first-attempt
61+
full-string matching, 'H2C(CN)2' will match that molecule
62+
successfully).
5563
5664
temperature_for_intensity : float
5765
The temperature to use when computing the intensity Smu^2. Set
@@ -126,12 +134,12 @@ def query_lines_async(self, min_frequency, max_frequency, *,
126134
if parse_name_locally:
127135
self.lookup_ids = build_lookup()
128136
luts = self.lookup_ids.find(molecule, flags)
129-
payload['Molecules'] = tuple(f"{val:06d} {key}"
130-
for key, val in luts.items())[0]
131-
if len(molecule) == 0:
137+
if len(luts) == 0:
132138
raise InvalidQueryError('No matching species found. Please '
133139
'refine your search or read the Docs '
134140
'for pointers on how to search.')
141+
payload['Molecules'] = tuple(f"{val:06d} {key}"
142+
for key, val in luts.items())[0]
135143
else:
136144
payload['Molecules'] = molecule
137145

@@ -187,12 +195,14 @@ def _parse_result(self, response, verbose=False):
187195
188196
ELO: Lower state energy in cm^{-1} relative to the ground state.
189197
GUP: Upper state degeneracy.
190-
TAG: Species tag or molecular identifier.
191-
A negative value flags that the line frequency has
192-
been measured in the laboratory. The absolute value of TAG is then the
193-
species tag and ERR is the reported experimental error. The three most
194-
significant digits of the species tag are coded as the mass number of
195-
the species.
198+
MOLWT: The first half of the molecular weight tag, which is the mass in atomic
199+
mass units (Daltons).
200+
TAG: Species tag or molecular identifier. This only includes the
201+
last 3 digits of the CDMS tag
202+
203+
A negative value of MOLWT flags that the line frequency has been
204+
measured in the laboratory. We record this boolean in the 'Lab'
205+
column. ERR is the reported experimental error.
196206
197207
QNFMT: Identifies the format of the quantum numbers
198208
Ju/Ku/vu and Jl/Kl/vl are the upper/lower QNs
@@ -215,15 +225,21 @@ def _parse_result(self, response, verbose=False):
215225
'DR': 36,
216226
'ELO': 38,
217227
'GUP': 48,
218-
'TAG': 51,
219-
'QNFMT': 57,
228+
'MOLWT': 51,
229+
'TAG': 54,
230+
'QNFMT': 58,
220231
'Ju': 61,
221232
'Ku': 63,
222233
'vu': 65,
223-
'Jl': 67,
224-
'Kl': 69,
225-
'vl': 71,
226-
'F': 73,
234+
'F1u': 67,
235+
'F2u': 69,
236+
'F3u': 71,
237+
'Jl': 73,
238+
'Kl': 75,
239+
'vl': 77,
240+
'F1l': 79,
241+
'F2l': 81,
242+
'F3l': 83,
227243
'name': 89}
228244

229245
result = ascii.read(text, header_start=None, data_start=0,
@@ -235,6 +251,18 @@ def _parse_result(self, response, verbose=False):
235251
result['FREQ'].unit = u.MHz
236252
result['ERR'].unit = u.MHz
237253

254+
result['Lab'] = result['MOLWT'] < 0
255+
result['MOLWT'] = np.abs(result['MOLWT'])
256+
result['MOLWT'].unit = u.Da
257+
258+
for suf in 'ul':
259+
for qn in ('J', 'v', 'K', 'F1', 'F2', 'F3'):
260+
qnind = qn+suf
261+
if result[qnind].dtype != int:
262+
intcol = np.array(list(map(parse_letternumber, result[qnind])),
263+
dtype=int)
264+
result[qnind] = intcol
265+
238266
# if there is a crash at this step, something went wrong with the query
239267
# and the _last_query_temperature was not set. This shouldn't ever
240268
# happen, but, well, I anticipate it will.
@@ -303,12 +331,66 @@ def tryfloat(x):
303331
CDMS = CDMSClass()
304332

305333

334+
def parse_letternumber(st):
    """
    Convert one CDMS quantum-number field into an integer.

    From the CDMS docs:
    "Exactly two characters are available for each quantum number. Therefore, half
    integer quanta are rounded up ! In addition, capital letters are used to
    indicate quantum numbers larger than 99. E. g. A0 is 100, Z9 is 359. Small
    types are used to signal corresponding negative quantum numbers."

    Each character of ``st`` is translated independently and the pieces are
    concatenated before the final integer conversion:

    * an ASCII uppercase letter becomes its alphabet index plus 10
      (``A`` -> ``10`` ... ``Z`` -> ``35``);
    * an ASCII lowercase letter becomes the same number preceded by a
      minus sign (``a`` -> ``-10`` ... ``z`` -> ``-35``);
    * any other character (digits, sign, blanks) is kept unchanged.

    Parameters
    ----------
    st : str
        The raw quantum-number text, e.g. ``'A0'`` or ``'-1'``.

    Returns
    -------
    int
        The decoded quantum number, e.g. ``'A0'`` -> 100, ``'z9'`` -> -359.

    Raises
    ------
    ValueError
        If the translated text is not a valid integer literal (for example
        when a lowercase letter appears after the first character).
    """
    lowercase = string.ascii_lowercase
    uppercase = string.ascii_uppercase

    pieces = []
    for char in st:
        if char in lowercase:
            # lowercase letters flag negative values
            pieces.append('-' + str(lowercase.index(char) + 10))
        elif char in uppercase:
            # uppercase letters extend the leading digit beyond 9
            pieces.append(str(uppercase.index(char) + 10))
        else:
            pieces.append(char)

    return int(''.join(pieces))
350+
351+
352+
class Lookuptable(dict):
    """
    A ``dict`` whose keys can be searched by substring or regular expression.
    """

    def find(self, st, flags):
        """
        Search the dictionary keys for entries matching ``st``.

        A key matches when ``st`` occurs in it as a plain substring or,
        failing that, when ``st`` interpreted as a regular expression is
        found anywhere in the key (via `re.search`).

        Parameters
        ----------
        st : str
            String to look for; also compiled as a regular expression.
            Can be entered non-specific for broader results
            ('H2O' yields 'H2O' but will also yield 'HCCCH2OD')
            or as the specific desired regular expression for
            catered results, for example: ('H2O$' yields only 'H2O')

        flags : int
            Regular expression flags passed to `re.search`.

        Returns
        -------
        dict
            The ``{key: value}`` entries of this table whose keys matched,
            in the table's iteration order.  Empty if nothing matched.

        Raises
        ------
        re.error
            If the regular-expression attempt is reached for some key and
            ``st`` is not a valid regular expression.
        """

        out = {}

        for kk, vv in self.items():
            # Use the string form of the key for BOTH checks so that
            # non-``str`` keys cannot raise a TypeError in the ``in`` test.
            name = str(kk)
            # note that the string-match attempt here differs from the jplspec
            # implementation
            if st in name or re.search(st, name, flags=flags):
                out[kk] = vv

        return out
386+
387+
306388
def build_lookup():
    """
    Build a searchable `Lookuptable` from the CDMS species table.

    The species table is fetched with ``CDMS.get_species_table()``; element
    ``1`` of the result supplies the dictionary keys and element ``0`` the
    values (the original comments describe these as the NAME and TAG
    columns respectively).

    Returns
    -------
    Lookuptable
        Mapping built by pairing the keys with the values in order.
    """
    species = CDMS.get_species_table()
    names = list(species[1][:])  # NAME column, per the original comment
    tags = list(species[0][:])  # TAG column, per the original comment

    return Lookuptable(dict(zip(names, tags)))

astroquery/linelists/cdms/setup_package.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@
66

77
def get_package_data():
88

9-
paths_test = [os.path.join('data', 'CO.data')]
9+
paths_test = [os.path.join('data', 'CO.data'),
10+
os.path.join('data', 'HC7S.data'),
11+
os.path.join('data', 'post_response.html'),
12+
]
1013
paths_data = [os.path.join('data', 'catdir.cat')]
1114

1215
return {'astroquery.linelists.cdms.tests': paths_test,
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<!DOCTYPE html
2+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
3+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
4+
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US" xml:lang="en-US">
5+
<head>
6+
<title>Untitled Document</title>
7+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
8+
</head>
9+
<body>
10+
<pre>
11+
100694.0650 0.4909 -3.9202 2 210.7681255 117501 224C6-1C7C7 C5 1C6C6 HC7S
12+
100694.0675 0.4909 -3.9237 2 210.7682253 117501 224C6-1C7C6 C5 1C6C5 HC7S
13+
100696.6906 0.4909 -3.9202 2 210.7790255 117501 224C6 1C7C7 C5-1C6C6 HC7S
14+
100696.6933 0.4909 -3.9237 2 210.7789253 117501 224C6 1C7C6 C5-1C6C5 HC7S
15+
100755.6075 0.4921 -4.0157 2 255.1740253 117501 224C7-1C7C6 C6 1C6C5 HC7S
16+
</pre></body></html>
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<html><head>
2+
</head>
3+
<frameset rows="60,*" bordercolor="#ffffff">
4+
<frame src="/classic/predictions/cdmstabhead.html" border=0 frameborder=0 framespacing=0 marginheight=30 marginwidth=16 scrolling=no>
5+
<frame src="/classic/predictions/cdmscache/cdmstab{replace}.html" border=0 frameborder=0 framespacing=0 marginheight=0 marginwidth=12 scrolling=yes>
6+
</frameset>
7+
<body>Sorry, your browser does not support frames!<br>
8+
<a href="/classic/predictions/cdmscache/cdmstab{replace}.html">tabular</a></body>
9+
</html>
Lines changed: 78 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
11
import numpy as np
2+
import pytest
23

34
import os
5+
import requests
46

57
from astropy import units as u
68
from astropy.table import Table
7-
from astroquery.linelists.cdms import CDMS
9+
from astroquery.linelists.cdms.core import CDMS, parse_letternumber
10+
from astroquery.utils.mocks import MockResponse
11+
12+
colname_set = set(['FREQ', 'ERR', 'LGINT', 'DR', 'ELO', 'GUP', 'TAG', 'QNFMT',
13+
'Ju', 'Jl', "vu", "F1u", "F2u", "F3u", "vl", "Ku", "Kl",
14+
"F1l", "F2l", "F3l", "name", "MOLWT", "Lab"])
815

916

1017
def data_path(filename):
@@ -13,26 +20,36 @@ def data_path(filename):
1320
return os.path.join(data_dir, filename)
1421

1522

16-
class MockResponseSpec:
23+
def mockreturn(*args, method='GET', data=None, url='', **kwargs):
    """
    Stand-in for ``CDMS._request`` that serves responses from local files.

    Parameters
    ----------
    *args, **kwargs
        Accepted and ignored so the signature tolerates whatever the caller
        passes.
    method : str
        ``'GET'`` or ``'POST'``.
    data : mapping or iterable of pairs, optional
        Request payload; only read for ``'POST'``, where it must contain a
        ``'Molecules'`` entry.
    url : str
        Requested URL; only read for ``'GET'``, where the text between
        ``'cdmstab'`` and the next ``'.'`` names the data file to load.

    Returns
    -------
    MockResponse or None
        For ``'GET'``: the bytes of ``<molecule>.data``.
        For ``'POST'``: ``post_response.html`` with ``{replace}`` filled in
        by the requested molecule, encoded to bytes.
        Any other ``method`` falls through and returns ``None``.
    """
    if method == 'GET':
        molecule = url.split('cdmstab')[1].split('.')[0]
        with open(data_path(molecule+".data"), 'rb') as fh:
            content = fh.read()
        return MockResponse(content=content)
    elif method == 'POST':
        # ``None`` sentinel instead of a shared mutable ``{}`` default
        payload = dict(data if data is not None else {})
        molecule = payload['Molecules']
        with open(data_path("post_response.html"), 'r') as fh:
            content = fh.read().format(replace=molecule).encode()
        return MockResponse(content=content)
34+
1735

18-
def __init__(self, filename):
19-
self.filename = data_path(filename)
36+
@pytest.fixture
def patch_post(request):
    """
    Fixture that replaces ``CDMS._request`` with `mockreturn`.

    Returns the ``monkeypatch`` fixture object used to apply the patch.
    """
    monkeypatch = request.getfixturevalue("monkeypatch")
    monkeypatch.setattr(CDMS, '_request', mockreturn)

    return monkeypatch
2542

2643

2744
def test_input_async():
2845

2946
response = CDMS.query_lines_async(min_frequency=100 * u.GHz,
3047
max_frequency=1000 * u.GHz,
3148
min_strength=-500,
32-
molecule="028001 CO",
49+
molecule="028503 CO, v=0",
3350
get_query_payload=True)
3451
response = dict(response)
35-
assert response['Molecules'] == "028001 CO"
52+
assert response['Molecules'] == "028503 CO, v=0"
3653
np.testing.assert_almost_equal(response['MinNu'], 100.)
3754
np.testing.assert_almost_equal(response['MaxNu'], 1000.)
3855

@@ -51,15 +68,61 @@ def test_input_multi():
5168
np.testing.assert_almost_equal(response['MaxNu'], 1000.)
5269

5370

54-
def test_query():
71+
def test_query(patch_post):
    """
    Query CO through the patched request layer and check the parsed table.
    """
    query = dict(min_frequency=100 * u.GHz,
                 max_frequency=1000 * u.GHz,
                 min_strength=-500,
                 molecule="CO")
    tbl = CDMS.query_lines(**query)

    assert isinstance(tbl, Table)
    assert len(tbl) == 8
    assert set(tbl.keys()) == colname_set

    # expected values of the first row
    first_row = {'FREQ': 115271.2018,
                 'ERR': .0005,
                 'LGINT': -7.1425,
                 }
    for column, expected in first_row.items():
        assert tbl[column][0] == expected
84+
85+
86+
def test_parseletternumber():
    """
    Check the decoding of letter-coded quantum numbers.

    Very Important:
    Exactly two characters are available for each quantum number. Therefore, half
    integer quanta are rounded up ! In addition, capital letters are used to
    indicate quantum numbers larger than 99. E. g. A0 is 100, Z9 is 359. Small
    types are used to signal corresponding negative quantum numbers.
    """

    # examples from the docs
    documented = (("A0", 100), ("Z9", 359))
    # inferred?
    inferred = (("z9", -359), ("ZZ", 3535))

    for text, expected in documented + inferred:
        assert parse_letternumber(text) == expected
102+
103+
104+
def test_hc7s(patch_post):
    """
    Test for a very complicated molecule

    Reference query, as given in the original docstring:

    CDMS.query_lines_async(100*u.GHz, 100.755608*u.GHz, molecule='HC7S', parse_name_locally=True)
    """

    tbl = CDMS.query_lines(100*u.GHz, 100.755608*u.GHz, molecule='HC7S',)

    assert isinstance(tbl, Table)
    assert len(tbl) == 5
    assert set(tbl.keys()) == colname_set

    # expected values of the first row, column by column
    first_row = (('FREQ', 100694.065),
                 ('ERR', 0.4909),
                 ('LGINT', -3.9202),
                 ('MOLWT', 117),
                 ('Ju', 126),
                 ('Jl', 125),
                 ('vu', 127),
                 ('vl', 126),
                 ('Ku', -1),
                 ('Kl', 1),
                 ('F1u', 127),
                 ('F1l', 126),
                 )
    for column, expected in first_row:
        assert tbl[column][0] == expected

0 commit comments

Comments
 (0)