Skip to content

Commit 742289f

Browse files
committed
Add tool to retrieve whole molecule cat, add check for known
malformatted catalogs, improve LUT, refactor and significantly robustify CDMS data table handling. Also, update cached metadata files; add more regression tests; whitespace; whitespace; setup for data file; almost there ...; unicode; bad col name; recode unicode raised minus (character 96) as simple dash; try replacing the text before ascii-reading it; oops; dashes; try a different approach...; try a different approach... part 2; super minor; remove unnecessary test that I just added; allow lookuptable to skip regex searching for exact matches; no single-char variables; looks like my fixes didn't work, and of course I made a bunch of stupid errors. I might have to give up for the night; hooray! got a different error this time. Now just guesswork though....; Entry column; changelog; CI: ignoring a Deprecation we don't directly use but trigger somewhere in the stack; finish test; update docstr, inline comment
1 parent ca584f6 commit 742289f

File tree

11 files changed

+2904
-1227
lines changed

11 files changed

+2904
-1227
lines changed

CHANGES.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@ New Tools and Services
88
Service fixes and enhancements
99
------------------------------
1010

11+
linelists.cdms
12+
^^^^^^^^^^^^^^
13+
14+
- Add whole catalog retrieval, improve error messaging for unparseable lines,
15+
improve metadata catalog, and improve lookuptable behavior [#3173,#2901]
16+
17+
jplspec
18+
^^^^^^^
19+
20+
- Minor improvement to lookuptable behavior [#3173,#2901]
21+
1122

1223
Infrastructure, Utility and Other Changes and Additions
1324
-------------------------------------------------------

astroquery/jplspec/lookup_table.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44

55
class Lookuptable(dict):
66

7-
def find(self, s, flags):
7+
def find(self, st, flags):
88
"""
99
Search dictionary keys for a regex match to string st
1010
1111
Parameters
1212
----------
13-
s : str
13+
st : str
1414
String to compile as a regular expression
1515
Can be entered non-specific for broader results
1616
('H2O' yields 'H2O' but will also yield 'HCCCH2OD', unless 'H2O' is itself a key, in which case only that exact match is returned)
@@ -22,17 +22,20 @@ def find(self, s, flags):
2222
2323
Returns
2424
-------
25-
The list of values corresponding to the matches
25+
The dictionary containing only values whose keys match the regex
2626
2727
"""
2828

29-
R = re.compile(s, flags)
29+
if st in self:
30+
return {st: self[st]}
31+
32+
R = re.compile(st, flags)
3033

3134
out = {}
3235

33-
for k, v in self.items():
34-
match = R.search(str(k))
36+
for key, val in self.items():
37+
match = R.search(str(key))
3538
if match:
36-
out[k] = v
39+
out[key] = val
3740

3841
return out

astroquery/linelists/cdms/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,25 @@ class Conf(_config.ConfigNamespace):
1414
Configuration parameters for `astroquery.linelists.cdms`.
1515
"""
1616
server = _config.ConfigItem(
17+
'https://cdms.astro.uni-koeln.de/',
18+
'CDMS Search and Conversion Form URL.')
19+
20+
search = _config.ConfigItem(
1721
'https://cdms.astro.uni-koeln.de/cgi-bin/cdmssearch',
1822
'CDMS Search and Conversion Form URL.')
1923

2024
catfile_url = _config.ConfigItem(
2125
'https://cdms.astro.uni-koeln.de/classic/entries/partition_function.html',
2226
'CDMS partition function table listing all available molecules.')
2327

28+
catfile_url2 = _config.ConfigItem(
29+
'https://cdms.astro.uni-koeln.de/classic/predictions/catalog/catdir.html',
30+
'CDMS catalog table listing all available molecules (with different names from partition function).')
31+
32+
classic_server = _config.ConfigItem(
33+
'https://cdms.astro.uni-koeln.de/classic',
34+
'CDMS Classic Molecule List server.')
35+
2436
timeout = _config.ConfigItem(
2537
60,
2638
'Time limit for connecting to the CDMS server.')

astroquery/linelists/cdms/core.py

Lines changed: 208 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from bs4 import BeautifulSoup
77
import astropy.units as u
8+
from astropy import table
89
from astropy.io import ascii
910
from astroquery.query import BaseQuery
1011
from astroquery.utils import async_to_sync
@@ -26,8 +27,11 @@ def data_path(filename):
2627
@async_to_sync
2728
class CDMSClass(BaseQuery):
2829
# use the Configuration Items imported from __init__.py
29-
URL = conf.server
30+
URL = conf.search
31+
SERVER = conf.server
32+
CLASSIC_URL = conf.classic_server
3033
TIMEOUT = conf.timeout
34+
MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS', '028582 H2NC', '058501 H2C2S', '064527 HC3HCN']
3135

3236
def query_lines_async(self, min_frequency, max_frequency, *,
3337
min_strength=-500, molecule='All',
@@ -143,8 +147,6 @@ def query_lines_async(self, min_frequency, max_frequency, *,
143147
else:
144148
payload['Molecules'] = molecule
145149

146-
payload = list(payload.items())
147-
148150
if get_query_payload:
149151
return payload
150152
# BaseQuery classes come with a _request method that includes a
@@ -170,6 +172,13 @@ def query_lines_async(self, min_frequency, max_frequency, *,
170172
response2 = self._request(method='GET', url=fullurl,
171173
timeout=self.TIMEOUT, cache=cache)
172174

175+
# accounts for three formats, e.g.: '058501' or 'H2C2S' or '058501 H2C2S'
176+
badlist = (self.MALFORMATTED_MOLECULE_LIST + # noqa
177+
[y for x in self.MALFORMATTED_MOLECULE_LIST for y in x.split()])
178+
if payload['Molecules'] in badlist:
179+
raise ValueError(f"Molecule {payload['Molecules']} is known not to comply with standard CDMS format. "
180+
f"Try get_molecule({payload['Molecules']}) instead.")
181+
173182
return response2
174183

175184
def _parse_result(self, response, *, verbose=False):
@@ -278,8 +287,9 @@ def _parse_result(self, response, *, verbose=False):
278287

279288
return result
280289

281-
def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
282-
catfile_url=conf.catfile_url):
290+
def get_species_table(self, *, catfile='partfunc.cat', use_cached=True,
291+
catfile_url=conf.catfile_url,
292+
catfile2='catdir.cat', catfile_url2=conf.catfile_url2):
283293
"""
284294
A directory of the catalog is found in a file called 'catdir.cat.'
285295
@@ -302,9 +312,35 @@ def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
302312
"""
303313

304314
if use_cached:
305-
result = ascii.read(data_path(catfile), format='fixed_width', delimiter='|')
315+
try:
316+
result = ascii.read(data_path(catfile), format='fixed_width', delimiter='|')
317+
result2 = ascii.read(data_path(catfile2), format='fixed_width', delimiter='|')
318+
except UnicodeDecodeError:
319+
with open(data_path(catfile), 'rb') as fh:
320+
content = fh.read()
321+
text = content.decode('ascii', errors='replace')
322+
result = ascii.read(text, format='basic', delimiter='|')
323+
with open(data_path(catfile2), 'rb') as fh:
324+
content = fh.read()
325+
text = content.decode('ascii', errors='replace')
326+
result2 = ascii.read(text, format='basic', delimiter='|')
306327
else:
307328
result = retrieve_catfile(catfile_url)
329+
result2 = retrieve_catfile2(catfile_url2)
330+
result.write(data_path(catfile), format='ascii.fixed_width', delimiter='|', overwrite=True)
331+
result2.write(data_path(catfile2), format='ascii.fixed_width', delimiter='|', overwrite=True)
332+
333+
merged = table.join(result, result2, keys=['tag'])
334+
if not all(merged['#lines'] == merged['# lines']):
335+
raise ValueError("Inconsistent table of molecules from CDMS.")
336+
del merged['# lines']
337+
338+
# reorder columns
339+
result = merged[['tag', 'molecule', 'Name', '#lines', 'lg(Q(1000))',
340+
'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))', 'lg(Q(150))', 'lg(Q(75))',
341+
'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
342+
'lg(Q(2.725))',
343+
'Ver.', 'Documentation', 'Date of entry', 'Entry']]
308344

309345
meta = {'lg(Q(1000))': 1000.0,
310346
'lg(Q(500))': 500.0,
@@ -331,6 +367,96 @@ def tryfloat(x):
331367
result.meta = {'Temperature (K)': [1000., 500., 300., 225., 150., 75.,
332368
37.5, 18.75, 9.375, 5., 2.725]}
333369

370+
result.add_index('tag')
371+
372+
return result
373+
374+
def get_molecule(self, molecule_id, *, cache=True):
    """
    Retrieve the whole molecule table for a given molecule id

    Parameters
    ----------
    molecule_id : str
        Six-character string of digits identifying the molecule,
        including any leading zero (e.g. ``'028582'``).
    cache : bool
        Passed through to ``self._request``.

    Returns
    -------
    result
        The table produced by ``self._parse_cat`` for the catalog file
        ``c<molecule_id>.cat``.  Its ``meta`` is set to the row of the
        species table whose ``tag`` equals ``int(molecule_id)``.

    Raises
    ------
    ValueError
        If ``molecule_id`` is not a string of exactly six digits.
    """
    # Validate fully *before* any network traffic: the id is later passed
    # to int(), so non-digit characters must be rejected here rather than
    # after a request for a URL that cannot exist.
    if (not isinstance(molecule_id, str)
            or len(molecule_id) != 6
            or not all(char in '0123456789' for char in molecule_id)):
        raise ValueError("molecule_id should be a length-6 string of numbers")

    url = f'{self.CLASSIC_URL}/entries/c{molecule_id}.cat'
    response = self._request(method='GET', url=url,
                             timeout=self.TIMEOUT, cache=cache)
    result = self._parse_cat(response)

    # attach the species-table row for this tag as table metadata
    species_table = self.get_species_table()
    result.meta = dict(species_table.loc[int(molecule_id)])

    return result
389+
390+
def _parse_cat(self, response, *, verbose=False):
    """
    Parse a catalog response into an `~astropy.table.Table`

    See details in _parse_result; this is a very similar function,
    but the catalog responses have a slightly different format.

    Parameters
    ----------
    response : object with a ``text`` attribute
        The response obtained for a ``.cat`` catalog file.
    verbose : bool
        Currently unused.

    Returns
    -------
    result
        Table read from the fixed-width text with columns FREQ, ERR,
        LGINT, DR, ELO, GUP, TAG, QNFMT and Q1..Q14, plus the derived
        columns MOLWT (absolute value of TAG/1e4, truncated) and Lab
        (True where the truncated TAG/1e4 is negative).

    Raises
    ------
    EmptyResponseError
        If the response text contains 'Zero lines were found'.
    """
    text = response.text

    if 'Zero lines were found' in text:
        raise EmptyResponseError(f"Response was empty; message was '{text}'.")

    # notes about the format
    # [F13.4, 2F8.4, I2, F10.4, I3, I7, I4, 12I2]: FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT, QN noqa
    # 13 21 29 31 41 44 51 55 57 59 61 63 65 67 69 71 73 75 77 79 noqa
    starts = {'FREQ': 0,
              'ERR': 14,
              'LGINT': 22,
              'DR': 30,
              'ELO': 32,
              'GUP': 42,
              'TAG': 45,
              'QNFMT': 52,
              'Q1': 56,
              'Q2': 58,
              'Q3': 60,
              'Q4': 62,
              'Q5': 64,
              'Q6': 66,
              'Q7': 68,
              'Q8': 70,
              'Q9': 72,
              'Q10': 74,
              'Q11': 76,
              'Q12': 78,
              'Q13': 80,
              'Q14': 82,
              }

    # lines matching the ``comment`` pattern are skipped by the reader
    result = ascii.read(text, header_start=None, data_start=0,
                        comment=r'THIS|^\s{12,14}\d{4,6}.*',
                        names=list(starts.keys()),
                        col_starts=list(starts.values()),
                        format='fixed_width', fast_reader=False)

    # int truncates - which is what we want
    result['MOLWT'] = [int(x/1e4) for x in result['TAG']]

    result['FREQ'].unit = u.MHz
    result['ERR'].unit = u.MHz

    # the sign must be recorded before it is discarded by abs()
    result['Lab'] = result['MOLWT'] < 0
    result['MOLWT'] = np.abs(result['MOLWT'])
    result['MOLWT'].unit = u.Da

    # Columns that must end up integer-valued.  The previous
    # ``for suf in '':`` loop iterated over an empty string and therefore
    # never ran, leaving the Q1..Q14 columns out of this list.
    # NOTE(review): parse_letternumber is defined elsewhere; confirm it
    # copes with blank/masked entries in sparsely populated Q columns.
    fix_keys = ['GUP'] + [f'Q{ii}' for ii in range(1, 15)]
    for key in fix_keys:
        if not np.issubdtype(result[key].dtype, np.integer):
            intcol = np.array(list(map(parse_letternumber, result[key])),
                              dtype=int)
            result[key] = intcol

    result['LGINT'].unit = u.nm**2 * u.MHz
    result['ELO'].unit = u.cm**(-1)

    return result
335461

336462

@@ -375,10 +501,13 @@ def find(self, st, flags):
375501
376502
Returns
377503
-------
378-
The list of values corresponding to the matches
504+
The dictionary containing only values whose keys match the regex
379505
380506
"""
381507

508+
if st in self:
509+
return {st: self[st]}
510+
382511
out = {}
383512

384513
for kk, vv in self.items():
@@ -394,24 +523,89 @@ def find(self, st, flags):
394523
def build_lookup():
    """
    Build a `Lookuptable` mapping species names to their CDMS tag.

    Both the 'molecule' and the 'Name' columns of the species table are
    used as keys; where the same key occurs in both, the entry from the
    'Name' column wins.
    """
    species = CDMS.get_species_table()

    # first pass: keys taken from the 'molecule' column
    name_to_tag = dict(zip(list(species['molecule'][:]),
                           list(species['tag'][:])))

    # second pass: keys taken from the 'Name' column, layered on top
    name_to_tag.update(zip(list(species['Name'][:]),
                           list(species['tag'][:])))

    return Lookuptable(name_to_tag)
403541

404542

405-
def retrieve_catfile(url=f'{conf.classic_server}/entries/partition_function.html'):
    """
    Download the partition-function listing and return it as a table.

    The page is parsed row by row instead of with the fixed-width reader,
    because some rows violate the fixed-width layout.
    """
    response = requests.get(url)
    response.raise_for_status()

    partfunc_names = ['lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
                      'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))',
                      'lg(Q(9.375))', 'lg(Q(5.000))', 'lg(Q(2.725))']

    def _to_float(value):
        # entries such as '---' cannot be converted and become nan
        try:
            return float(value)
        except ValueError:
            return np.nan

    tbl_rows = []
    # the first 15 and the last 5 lines of the page are not table rows
    for row in response.text.split("\n")[15:-5]:
        tag = int(row.split()[0])
        # characters 7..40 hold the molecule name followed by the line count
        name_and_count = row[7:41].split()
        molecule = " ".join(name_and_count[:-1])
        nlines = int(name_and_count[-1])

        entry = {'tag': tag,
                 'molecule': molecule,
                 '#lines': nlines,
                 }
        # everything from character 41 on is partition-function values
        entry.update(zip(partfunc_names, map(_to_float, row[41:].split())))
        tbl_rows.append(entry)

    return table.Table(tbl_rows)
583+
584+
585+
def retrieve_catfile2(url=f'{conf.classic_server}/predictions/catalog/catdir.html'):
    """
    Download the catalog directory page and return it as a table.

    The 'Catalog' column is dropped, 'Tag' is renamed to 'tag', and the
    'Entry in cm-1' column (spelled with either kind of dash) is renamed
    to 'Entry'.
    """
    response = requests.get(url)
    response.raise_for_status()

    try:
        tbl = ascii.read(response.text, format='html')
    except UnicodeDecodeError:
        # based on https://github.com/astropy/astropy/issues/3826#issuecomment-256113937
        # which suggests to start with the bytecode content and decode with 'replace errors'
        tbl = ascii.read(response.content.decode('ascii', errors='replace'),
                         format='html')

    # delete a junk column (wastes space)
    del tbl['Catalog']

    # for joining - want same capitalization
    tbl.rename_column("Tag", "tag")

    # the first spelling uses a unicode dash, the second a normal dash
    for entry_name in ('Entry in cm–1', 'Entry in cm-1'):
        if entry_name in tbl.colnames:
            tbl.rename_column(entry_name, 'Entry')

    return tbl

0 commit comments

Comments
 (0)