Skip to content

Commit 2d77f0f

Browse files
committed
Merge pull request #526 from keflavich/vizier_efficiency_boost
Vizier queries seem slow
2 parents ebf8de5 + f0aae4b commit 2d77f0f

File tree

1 file changed

+160
-62
lines changed

1 file changed

+160
-62
lines changed

astroquery/vizier/core.py

Lines changed: 160 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,17 @@
55
import warnings
66
import json
77
import copy
8+
import re
89

910
from astropy.extern import six
11+
from astropy.extern.six import BytesIO
1012
import astropy.units as u
1113
import astropy.coordinates as coord
1214
import astropy.table as tbl
1315
import astropy.utils.data as aud
1416
from astropy.utils import OrderedDict
1517
import astropy.io.votable as votable
18+
from astropy.io import ascii
1619

1720
from ..query import BaseQuery
1821
from ..utils import commons
@@ -110,6 +113,54 @@ def _server_to_url(self, return_type='votable'):
110113
FITS binary table: asu-binfits
111114
plain text: asu-txt
112115
"""
116+
117+
"""
118+
Quasi-private performance tests:
119+
It seems that these are dominated by table parsing time.
120+
%timeit m83tsv = Vizier.query_object_async('M83', return_type='asu-tsv', cache=False)
121+
1 loops, best of 3: 7.11 s per loop
122+
%timeit m83tsv = Vizier.query_object_async('M83', return_type='votable', cache=False)
123+
1 loops, best of 3: 6.79 s per loop
124+
%timeit m83tsv = Vizier.query_object_async('M83', return_type='asu-fits', cache=False)
125+
1 loops, best of 3: 6.21 s per loop
126+
%timeit m83tsv = Vizier.query_object_async('M83', return_type='asu-binfits', cache=False)
127+
1 loops, best of 3: 667 ms per loop
128+
Looks like this one led to a segfault on their system?
129+
130+
%timeit m83tsv = Vizier.query_object_async('M83', return_type='asu-txt', cache=False)
131+
1 loops, best of 3: 6.83 s per loop
132+
%timeit m83tsv = Vizier.query_object_async('M83', return_type='asu-tsv', cache=False)
133+
1 loops, best of 3: 6.8 s per loop
134+
135+
m83tsv = Vizier.query_object_async('M83', return_type='asu-tsv', cache=False)
136+
m83votable = Vizier.query_object_async('M83', return_type='votable', cache=False)
137+
m83fits = Vizier.query_object_async('M83', return_type='asu-fits', cache=False)
138+
m83txt = Vizier.query_object_async('M83', return_type='asu-txt', cache=False)
139+
#m83binfits = Vizier.query_object_async('M83', return_type='asu-binfits', cache=False)
140+
141+
# many of these are invalid tables
142+
%timeit fitstbls = fits.open(BytesIO(m83fits.content), ignore_missing_end=True)
143+
1 loops, best of 3: 541 ms per loop
144+
145+
%timeit tbls = parse_vizier_tsvfile(m83tsv.content)
146+
1 loops, best of 3: 1.35 s per loop
147+
148+
%timeit votbls = parse_vizier_votable(m83votable.content)
149+
1 loops, best of 3: 3.62 s per loop
150+
151+
"""
152+
# Only votable is supported now, but in case we try to support
153+
# something in the future we should disallow invalid ones.
154+
assert return_type in ('votable', 'asu-tsv', 'asu-fits',
155+
'asu-binfits', 'asu-txt')
156+
if return_type in ('asu-txt',):
157+
# I had a look at the format of these "tables" and... they just
158+
# aren't. They're quasi-fixed-width without schema. I think they
159+
# follow the general philosophy of "consistency is overrated"
160+
# The CDS reader chokes on it.
161+
raise TypeError("asu-txt is not and cannot be supported: the "
162+
"returned tables are not and cannot be made "
163+
"parseable.")
113164
return "http://" + self.VIZIER_SERVER + "/viz-bin/" + return_type
114165

115166
@property
@@ -126,7 +177,7 @@ def keywords(self):
126177
self._keywords = None
127178

128179
def find_catalogs(self, keywords, include_obsolete=False, verbose=False,
129-
max_catalogs=None):
180+
max_catalogs=None, return_type='votable'):
130181
"""
131182
Search Vizier for catalogs based on a set of keywords, e.g. author name
132183
@@ -168,12 +219,13 @@ def find_catalogs(self, keywords, include_obsolete=False, verbose=False,
168219
if max_catalogs is not None:
169220
data_payload['-meta.max'] = max_catalogs
170221
response = self._request(method='POST',
171-
url=self._server_to_url(),
222+
url=self._server_to_url(return_type=return_type),
172223
data=data_payload,
173224
timeout=self.TIMEOUT)
174225
if 'STOP, Max. number of RESOURCE reached' in response.text:
175-
raise ValueError("Maximum number of catalogs exceeded. Try setting max_catalogs "
176-
"to a large number and try again")
226+
raise ValueError("Maximum number of catalogs exceeded. Try "
227+
"setting max_catalogs to a large number and"
228+
" try again")
177229
result = self._parse_result(response, verbose=verbose, get_catalog_names=True)
178230

179231
# Filter out the obsolete catalogs, unless requested
@@ -185,7 +237,7 @@ def find_catalogs(self, keywords, include_obsolete=False, verbose=False,
185237

186238
return result
187239

188-
def get_catalogs_async(self, catalog, verbose=False):
240+
def get_catalogs_async(self, catalog, verbose=False, return_type='votable'):
189241
"""
190242
Query the Vizier service for a specific catalog
191243
@@ -202,13 +254,14 @@ def get_catalogs_async(self, catalog, verbose=False):
202254

203255
data_payload = self._args_to_payload(catalog=catalog)
204256
response = self._request(method='POST',
205-
url=self._server_to_url(),
257+
url=self._server_to_url(return_type=return_type),
206258
data=data_payload,
207259
timeout=self.TIMEOUT)
208260
return response
209261

210262
def query_object_async(self, object_name, catalog=None, radius=None,
211-
coordinate_frame=None):
263+
coordinate_frame=None, get_query_payload=False,
264+
return_type='votable', cache=True):
212265
"""
213266
Serves the same purpose as `query_object` but only
214267
returns the HTTP response rather than the parsed result.
@@ -248,15 +301,19 @@ def query_object_async(self, object_name, catalog=None, radius=None,
248301
data_payload = self._args_to_payload(
249302
center=center,
250303
catalog=catalog)
304+
if get_query_payload:
305+
return data_payload
251306
response = self._request(method='POST',
252-
url=self._server_to_url(),
307+
url=self._server_to_url(return_type=return_type),
253308
data=data_payload,
254-
timeout=self.TIMEOUT)
309+
timeout=self.TIMEOUT,
310+
cache=cache)
255311
return response
256312

257313
def query_region_async(self, coordinates, radius=None, inner_radius=None,
258314
width=None, height=None, catalog=None,
259-
get_query_payload=False):
315+
get_query_payload=False, cache=True,
316+
return_type='votable'):
260317
"""
261318
Serves the same purpose as `query_region` but only
262319
returns the HTTP response rather than the parsed result.
@@ -374,12 +431,15 @@ def query_region_async(self, coordinates, radius=None, inner_radius=None,
374431
return data_payload
375432

376433
response = self._request(method='POST',
377-
url=self._server_to_url(),
434+
url=self._server_to_url(return_type=return_type),
378435
data=data_payload,
379-
timeout=self.TIMEOUT)
436+
timeout=self.TIMEOUT,
437+
cache=cache)
380438
return response
381439

382-
def query_constraints_async(self, catalog=None, **kwargs):
440+
def query_constraints_async(self, catalog=None, return_type='votable',
441+
cache=True,
442+
**kwargs):
383443
"""
384444
Send a query to Vizier in which you specify constraints with keyword/value
385445
pairs.
@@ -437,9 +497,10 @@ def query_constraints_async(self, catalog=None, **kwargs):
437497
column_filters=kwargs,
438498
center={'-c.rd': 180})
439499
response = self._request(method='POST',
440-
url=self._server_to_url(),
500+
url=self._server_to_url(return_type=return_type),
441501
data=data_payload,
442-
timeout=self.TIMEOUT)
502+
timeout=self.TIMEOUT,
503+
cache=cache)
443504
return response
444505

445506
def _args_to_payload(self, *args, **kwargs):
@@ -530,7 +591,8 @@ def _args_to_payload(self, *args, **kwargs):
530591
script += "\n" + str(self.keywords)
531592
return script
532593

533-
def _parse_result(self, response, get_catalog_names=False, verbose=False, invalid='warn'):
594+
def _parse_result(self, response, get_catalog_names=False, verbose=False,
595+
invalid='warn'):
534596
"""
535597
Parses the HTTP response to create a `~astropy.table.Table`.
536598
@@ -541,9 +603,11 @@ def _parse_result(self, response, get_catalog_names=False, verbose=False, invali
541603
response : `requests.Response`
542604
The response of the HTTP POST request
543605
get_catalog_names : bool
606+
(only for VOTABLE queries)
544607
If specified, return only the table names (useful for table
545-
discovery)
608+
discovery).
546609
invalid : 'warn', 'mask' or 'raise'
610+
(only for VOTABLE queries)
547611
The behavior if a VOTABLE cannot be parsed. Default is 'warn',
548612
which will try to parse the table, then if an exception is raised,
549613
it will be printed but the masked table will be returned
@@ -553,51 +617,22 @@ def _parse_result(self, response, get_catalog_names=False, verbose=False, invali
553617
table_list : `astroquery.utils.TableList` or str
554618
If there are errors in the parsing, then returns the raw results as a string.
555619
"""
556-
if not verbose:
557-
commons.suppress_vo_warnings()
558-
try:
559-
tf = six.BytesIO(response.content)
560-
561-
if invalid == 'mask':
562-
vo_tree = votable.parse(tf, pedantic=False, invalid='mask')
563-
elif invalid == 'warn':
564-
try:
565-
vo_tree = votable.parse(tf, pedantic=False, invalid='raise')
566-
except Exception as ex:
567-
warnings.warn("VOTABLE parsing raised exception: {0}".format(ex))
568-
vo_tree = votable.parse(tf, pedantic=False, invalid='mask')
569-
elif invalid == 'raise':
570-
vo_tree = votable.parse(tf, pedantic=False, invalid='raise')
571-
else:
572-
raise ValueError("Invalid keyword 'invalid'. Must be raise, mask, or warn")
573-
574-
if get_catalog_names:
575-
return dict([(R.name, R) for R in vo_tree.resources])
576-
else:
577-
table_dict = OrderedDict()
578-
for t in vo_tree.iter_tables():
579-
if len(t.array) > 0:
580-
if t.ref is not None:
581-
name = vo_tree.get_table_by_id(t.ref).name
582-
else:
583-
name = t.name
584-
if name not in table_dict.keys():
585-
table_dict[name] = []
586-
table_dict[name] += [t.to_table()]
587-
for name in table_dict.keys():
588-
if len(table_dict[name]) > 1:
589-
table_dict[name] = tbl.vstack(table_dict[name])
590-
else:
591-
table_dict[name] = table_dict[name][0]
592-
return commons.TableList(table_dict)
593-
594-
except Exception as ex:
595-
self.response = response
596-
self.table_parse_error = ex
597-
raise TableParseError("Failed to parse VIZIER result! The raw response can be found "
598-
"in self.response, and the error in self.table_parse_error."
599-
" The attempted parsed result is in self.parsed_result.\n"
600-
"Exception: " + str(self.table_parse_error))
620+
if response.content[:5] == b'<?xml':
621+
try:
622+
return parse_vizier_votable(response.content, verbose=verbose,
623+
invalid=invalid,
624+
get_catalog_names=get_catalog_names)
625+
except Exception as ex:
626+
self.response = response
627+
self.table_parse_error = ex
628+
raise TableParseError("Failed to parse VIZIER result! The raw response can be found "
629+
"in self.response, and the error in self.table_parse_error."
630+
" The attempted parsed result is in self.parsed_result.\n"
631+
"Exception: " + str(self.table_parse_error))
632+
elif response.content[:5] == b'#\n# ':
633+
return parse_vizier_tsvfile(data, verbose=verbose)
634+
elif response.content[:6] == b'SIMPLE':
635+
return fits.open(BytesIO(response.content), ignore_missing_end=True)
601636

602637
@property
603638
def valid_keywords(self):
@@ -611,6 +646,69 @@ def valid_keywords(self):
611646

612647
return self._valid_keyword_dict
613648

649+
def parse_vizier_tsvfile(data, verbose=False):
    """
    Parse a Vizier-generated list of tsv data tables into a list of astropy
    Tables.

    The input is cut at every blank line that is immediately followed by a
    ``#`` comment line, and each chunk lying between two consecutive cut
    points is read as one tab-separated table.

    Parameters
    ----------
    data : bytes or str
        The vizier-formatted list of tables.  Text input is encoded to bytes
        first, so that the separator search and the ``BytesIO`` wrapping
        both operate on a single, consistent type.
    verbose : bool
        Currently unused.

    Returns
    -------
    tables : list
        One table per chunk, in the order the chunks appear in ``data``.
        Empty if ``data`` contains fewer than two separators.
    """
    # Work on bytes throughout: a text pattern cannot be matched against
    # bytes (and vice versa) where the two are distinct types, and the
    # chunks are handed to BytesIO below.
    if not isinstance(data, bytes):
        data = data.encode('utf-8')

    # http://stackoverflow.com/questions/4664850/find-all-occurrences-of-a-substring-in-python
    split_indices = [m.start() for m in re.finditer(b'\n\n#', data)]

    # we want to slice out chunks of the file each time
    # NOTE(review): only chunks bounded by two separators are read, so
    # whatever precedes the first separator and follows the last one is
    # ignored.  Presumably those are the file preamble and trailer --
    # confirm that the final table is never lost this way.
    split_limits = zip(split_indices[:-1], split_indices[1:])

    tables = [ascii.read(BytesIO(data[a:b]), format='fast_tab',
                         delimiter='\t', header_start=0, comment="#")
              for a, b in split_limits]
    return tables
668+
669+
def parse_vizier_votable(data, verbose=False, invalid='warn',
                         get_catalog_names=False):
    """
    Given a votable as string, parse it into tables

    Parameters
    ----------
    data : bytes
        The raw VOTABLE document; it is wrapped in a ``BytesIO`` and handed
        to ``votable.parse``.
    verbose : bool
        If False, ``commons.suppress_vo_warnings()`` is called before
        parsing.
    invalid : 'warn', 'mask' or 'raise'
        How to parse the VOTABLE.  'mask' and 'raise' are passed straight
        through to ``votable.parse``.  'warn' first parses with
        ``invalid='raise'``; if that raises, the exception is reported as a
        warning and the document is parsed again with ``invalid='mask'``.
    get_catalog_names : bool
        If True, return only the resources keyed by name instead of the
        tables.

    Returns
    -------
    dict or `astroquery.utils.TableList`
        With ``get_catalog_names`` set, a dict mapping each resource name to
        its resource.  Otherwise a table list keyed by table name, in order
        of first appearance: empty tables are skipped, and tables that share
        a name are stacked into one with ``astropy.table.vstack``.

    Raises
    ------
    ValueError
        If ``invalid`` is not one of 'warn', 'mask' or 'raise'.
    """
    if not verbose:
        commons.suppress_vo_warnings()

    tf = BytesIO(data)

    if invalid == 'warn':
        try:
            vo_tree = votable.parse(tf, pedantic=False, invalid='raise')
        except Exception as ex:
            warnings.warn("VOTABLE parsing raised exception: {0}".format(ex))
            # The failed attempt has consumed part of the stream; rewind
            # explicitly instead of relying on the parser to do it.
            tf.seek(0)
            vo_tree = votable.parse(tf, pedantic=False, invalid='mask')
    elif invalid in ('mask', 'raise'):
        vo_tree = votable.parse(tf, pedantic=False, invalid=invalid)
    else:
        raise ValueError("Invalid keyword 'invalid'. Must be raise, mask, or warn")

    if get_catalog_names:
        return dict((resource.name, resource)
                    for resource in vo_tree.resources)

    # Collect the non-empty tables under their name, keeping first-seen order.
    pieces_by_name = OrderedDict()
    for t in vo_tree.iter_tables():
        if len(t.array) == 0:
            continue
        # A table that references another one takes the referenced
        # table's name.
        if t.ref is not None:
            name = vo_tree.get_table_by_id(t.ref).name
        else:
            name = t.name
        pieces_by_name.setdefault(name, []).append(t.to_table())

    # Collapse every group into a single table.
    table_dict = OrderedDict()
    for name, pieces in pieces_by_name.items():
        if len(pieces) > 1:
            table_dict[name] = tbl.vstack(pieces)
        else:
            table_dict[name] = pieces[0]
    return commons.TableList(table_dict)
711+
614712

615713
def _parse_angle(angle):
616714
"""

0 commit comments

Comments
 (0)