55
66from bs4 import BeautifulSoup
77import astropy .units as u
8+ from astropy import table
89from astropy .io import ascii
910from astroquery .query import BaseQuery
1011from astroquery .utils import async_to_sync
@@ -26,8 +27,11 @@ def data_path(filename):
2627@async_to_sync
2728class CDMSClass (BaseQuery ):
2829 # use the Configuration Items imported from __init__.py
29- URL = conf .server
30+ URL = conf .search
31+ SERVER = conf .server
32+ CLASSIC_URL = conf .classic_server
3033 TIMEOUT = conf .timeout
34+ MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS' , '028582 H2NC' , '058501 H2C2S' , '064527 HC3HCN' ]
3135
3236 def query_lines_async (self , min_frequency , max_frequency , * ,
3337 min_strength = - 500 , molecule = 'All' ,
@@ -143,8 +147,6 @@ def query_lines_async(self, min_frequency, max_frequency, *,
143147 else :
144148 payload ['Molecules' ] = molecule
145149
146- payload = list (payload .items ())
147-
148150 if get_query_payload :
149151 return payload
150152 # BaseQuery classes come with a _request method that includes a
@@ -170,6 +172,13 @@ def query_lines_async(self, min_frequency, max_frequency, *,
170172 response2 = self ._request (method = 'GET' , url = fullurl ,
171173 timeout = self .TIMEOUT , cache = cache )
172174
175+ # accounts for three formats, e.g.: '058501' or 'H2C2S' or '058501 H2C2S'
176+ badlist = (self .MALFORMATTED_MOLECULE_LIST + # noqa
177+ [y for x in self .MALFORMATTED_MOLECULE_LIST for y in x .split ()])
178+ if payload ['Molecules' ] in badlist :
179+ raise ValueError (f"Molecule { payload ['Molecules' ]} is known not to comply with standard CDMS format. "
180+ f"Try get_molecule({ payload ['Molecules' ]} ) instead." )
181+
173182 return response2
174183
175184 def _parse_result (self , response , * , verbose = False ):
@@ -278,8 +287,9 @@ def _parse_result(self, response, *, verbose=False):
278287
279288 return result
280289
281- def get_species_table (self , * , catfile = 'catdir.cat' , use_cached = True ,
282- catfile_url = conf .catfile_url ):
290+ def get_species_table (self , * , catfile = 'partfunc.cat' , use_cached = True ,
291+ catfile_url = conf .catfile_url ,
292+ catfile2 = 'catdir.cat' , catfile_url2 = conf .catfile_url2 ):
283293 """
284294 A directory of the catalog is found in a file called 'catdir.cat.'
285295
@@ -302,9 +312,35 @@ def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
302312 """
303313
304314 if use_cached :
305- result = ascii .read (data_path (catfile ), format = 'fixed_width' , delimiter = '|' )
315+ try :
316+ result = ascii .read (data_path (catfile ), format = 'fixed_width' , delimiter = '|' )
317+ result2 = ascii .read (data_path (catfile2 ), format = 'fixed_width' , delimiter = '|' )
318+ except UnicodeDecodeError :
319+ with open (data_path (catfile ), 'rb' ) as fh :
320+ content = fh .read ()
321+ text = content .decode ('ascii' , errors = 'replace' )
322+ result = ascii .read (text , format = 'basic' , delimiter = '|' )
323+ with open (data_path (catfile2 ), 'rb' ) as fh :
324+ content = fh .read ()
325+ text = content .decode ('ascii' , errors = 'replace' )
326+ result2 = ascii .read (text , format = 'basic' , delimiter = '|' )
306327 else :
307328 result = retrieve_catfile (catfile_url )
329+ result2 = retrieve_catfile2 (catfile_url2 )
330+ result .write (data_path (catfile ), format = 'ascii.fixed_width' , delimiter = '|' , overwrite = True )
331+ result2 .write (data_path (catfile2 ), format = 'ascii.fixed_width' , delimiter = '|' , overwrite = True )
332+
333+ merged = table .join (result , result2 , keys = ['tag' ])
334+ if not all (merged ['#lines' ] == merged ['# lines' ]):
335+ raise ValueError ("Inconsistent table of molecules from CDMS." )
336+ del merged ['# lines' ]
337+
338+ # reorder columns
339+ result = merged [['tag' , 'molecule' , 'Name' , '#lines' , 'lg(Q(1000))' ,
340+ 'lg(Q(500))' , 'lg(Q(300))' , 'lg(Q(225))' , 'lg(Q(150))' , 'lg(Q(75))' ,
341+ 'lg(Q(37.5))' , 'lg(Q(18.75))' , 'lg(Q(9.375))' , 'lg(Q(5.000))' ,
342+ 'lg(Q(2.725))' ,
343+ 'Ver.' , 'Documentation' , 'Date of entry' , 'Entry' ]]
308344
309345 meta = {'lg(Q(1000))' : 1000.0 ,
310346 'lg(Q(500))' : 500.0 ,
@@ -331,6 +367,96 @@ def tryfloat(x):
331367 result .meta = {'Temperature (K)' : [1000. , 500. , 300. , 225. , 150. , 75. ,
332368 37.5 , 18.75 , 9.375 , 5. , 2.725 ]}
333369
370+ result .add_index ('tag' )
371+
372+ return result
373+
374+ def get_molecule (self , molecule_id , * , cache = True ):
375+ """
376+ Retrieve the whole molecule table for a given molecule id
377+ """
378+ if not isinstance (molecule_id , str ) or len (molecule_id ) != 6 :
379+ raise ValueError ("molecule_id should be a length-6 string of numbers" )
380+ url = f'{ self .CLASSIC_URL } /entries/c{ molecule_id } .cat'
381+ response = self ._request (method = 'GET' , url = url ,
382+ timeout = self .TIMEOUT , cache = cache )
383+ result = self ._parse_cat (response )
384+
385+ species_table = self .get_species_table ()
386+ result .meta = dict (species_table .loc [int (molecule_id )])
387+
388+ return result
389+
390+ def _parse_cat (self , response , * , verbose = False ):
391+ """
392+ Parse a catalog response into an `~astropy.table.Table`
393+
394+ See details in _parse_response; this is a very similar function,
395+ but the catalog responses have a slightly different format.
396+ """
397+
398+ if 'Zero lines were found' in response .text :
399+ raise EmptyResponseError (f"Response was empty; message was '{ response .text } '." )
400+
401+ text = response .text
402+
403+ # notes about the format
404+ # [F13.4, 2F8.4, I2, F10.4, I3, I7, I4, 12I2]: FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT, QN noqa
405+ # 13 21 29 31 41 44 51 55 57 59 61 63 65 67 69 71 73 75 77 79 noqa
406+ starts = {'FREQ' : 0 ,
407+ 'ERR' : 14 ,
408+ 'LGINT' : 22 ,
409+ 'DR' : 30 ,
410+ 'ELO' : 32 ,
411+ 'GUP' : 42 ,
412+ 'TAG' : 45 ,
413+ 'QNFMT' : 52 ,
414+ 'Q1' : 56 ,
415+ 'Q2' : 58 ,
416+ 'Q3' : 60 ,
417+ 'Q4' : 62 ,
418+ 'Q5' : 64 ,
419+ 'Q6' : 66 ,
420+ 'Q7' : 68 ,
421+ 'Q8' : 70 ,
422+ 'Q9' : 72 ,
423+ 'Q10' : 74 ,
424+ 'Q11' : 76 ,
425+ 'Q12' : 78 ,
426+ 'Q13' : 80 ,
427+ 'Q14' : 82 ,
428+ }
429+
430+ result = ascii .read (text , header_start = None , data_start = 0 ,
431+ comment = r'THIS|^\s{12,14}\d{4,6}.*' ,
432+ names = list (starts .keys ()),
433+ col_starts = list (starts .values ()),
434+ format = 'fixed_width' , fast_reader = False )
435+
436+ # int truncates - which is what we want
437+ result ['MOLWT' ] = [int (x / 1e4 ) for x in result ['TAG' ]]
438+
439+ result ['FREQ' ].unit = u .MHz
440+ result ['ERR' ].unit = u .MHz
441+
442+ result ['Lab' ] = result ['MOLWT' ] < 0
443+ result ['MOLWT' ] = np .abs (result ['MOLWT' ])
444+ result ['MOLWT' ].unit = u .Da
445+
446+ fix_keys = ['GUP' ]
447+ for suf in '' :
448+ for qn in (f'Q{ ii } ' for ii in range (1 , 15 )):
449+ qnind = qn + suf
450+ fix_keys .append (qnind )
451+ for key in fix_keys :
452+ if not np .issubdtype (result [key ].dtype , np .integer ):
453+ intcol = np .array (list (map (parse_letternumber , result [key ])),
454+ dtype = int )
455+ result [key ] = intcol
456+
457+ result ['LGINT' ].unit = u .nm ** 2 * u .MHz
458+ result ['ELO' ].unit = u .cm ** (- 1 )
459+
334460 return result
335461
336462
@@ -375,10 +501,13 @@ def find(self, st, flags):
375501
376502 Returns
377503 -------
378- The list of values corresponding to the matches
504+ The dictionary containing only values whose keys match the regex
379505
380506 """
381507
508+ if st in self :
509+ return {st : self [st ]}
510+
382511 out = {}
383512
384513 for kk , vv in self .items ():
@@ -394,24 +523,89 @@ def find(self, st, flags):
def build_lookup():
    """
    Build a `Lookuptable` that maps species names to their tags.

    The species table returned by ``CDMS.get_species_table()`` is read twice:
    first the 'molecule' column and then the 'Name' column are used as keys,
    each paired with the 'tag' column.  A key occurring in both columns ends
    up with the tag paired with it in the 'Name' column.

    Returns
    -------
    lookuptable : Lookuptable
        The combined name -> tag mapping wrapped in `Lookuptable`.
    """
    species = CDMS.get_species_table()

    tags = list(species['tag'][:])  # TAG column as a plain list

    name_to_tag = {}
    # 'molecule' first, then 'Name'; later entries overwrite earlier ones
    for column in ('molecule', 'Name'):
        names = list(species[column][:])
        name_to_tag.update(zip(names, tags))

    return Lookuptable(name_to_tag)
403541
404542
def retrieve_catfile(url=f'{conf.classic_server}/entries/partition_function.html'):
    """
    Download the partition-function listing and parse it into a table.

    Parameters
    ----------
    url : str
        Address of the partition-function page.  The default is built from
        ``conf.classic_server`` once, when this function is defined.

    Returns
    -------
    tbl : `~astropy.table.Table`
        One row per parsed line, with columns ``tag``, ``molecule``,
        ``#lines`` and the ``lg(Q(T))`` partition-function columns.  Entries
        that cannot be converted to float (e.g. '---') are stored as NaN.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    partfunc_names = ['lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
                      'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))',
                      'lg(Q(9.375))', 'lg(Q(5.000))', 'lg(Q(2.725))']

    # used to convert '---' to nan
    def tryfloat(x):
        try:
            return float(x)
        except ValueError:
            return np.nan

    # a timeout keeps a stalled server from blocking the caller forever
    response = requests.get(url, timeout=conf.timeout)
    response.raise_for_status()
    lines = response.text.split("\n")

    # the 'fixed width' table reader fails because there are rows that violate fixed width,
    # so each row is split by hand: the tag is the first token, characters
    # 7-40 hold the molecule name followed by the line count, and everything
    # from character 41 on is the list of partition-function values
    tbl_rows = []
    for row in lines[15:-5]:  # skip the first 15 and the last 5 lines of the page
        if not row.strip():
            # a blank line carries no data and would otherwise raise IndexError
            continue
        tag = int(row.split()[0])
        name_and_count = row[7:41].split()
        molecule = " ".join(name_and_count[:-1])
        nlines = int(name_and_count[-1])
        partfunc = map(tryfloat, row[41:].split())

        entry = {'tag': tag,
                 'molecule': molecule,
                 '#lines': nlines,
                 }
        entry.update(zip(partfunc_names, partfunc))
        tbl_rows.append(entry)

    tbl = table.Table(tbl_rows)
    return tbl
583+
584+
def retrieve_catfile2(url=f'{conf.classic_server}/predictions/catalog/catdir.html'):
    """
    Download the HTML catalog directory and read it into a table.

    Parameters
    ----------
    url : str
        Address of the catalog directory page.  The default is built from
        ``conf.classic_server`` once, when this function is defined.

    Returns
    -------
    tbl
        The table read from the page with the 'Catalog' column removed,
        'Tag' renamed to 'tag', and the 'Entry in cm-1' column (spelled with
        either dash) renamed to 'Entry'.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # a timeout keeps a stalled server from blocking the caller forever
    response = requests.get(url, timeout=conf.timeout)
    response.raise_for_status()
    try:
        tbl = ascii.read(response.text, format='html')
    except UnicodeDecodeError:
        # based on https://github.com/astropy/astropy/issues/3826#issuecomment-256113937
        # which suggests to start with the bytecode content and decode with 'replace errors'
        text = response.content.decode('ascii', errors='replace')
        tbl = ascii.read(text, format='html')

    # delete a junk column (wastes space)
    del tbl['Catalog']

    # for joining - want same capitalization
    tbl.rename_column("Tag", "tag")

    # one of these is a unicode dash, the other is a normal dash.... in theory
    for entry_name in ('Entry in cm–1', 'Entry in cm-1'):
        if entry_name in tbl.colnames:
            tbl.rename_column(entry_name, 'Entry')

    return tbl
0 commit comments