Commit 9e64810

Jeremy Phelps committed
Conditionally modify the context object instead of conditionally
returning early.
1 parent 13012b9 commit 9e64810
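
The refactor applies to PyDruid.ssl_context() (see the hunk near the end of the diff): a guard clause that returned the default context early is replaced by a conditional that mutates the context object only when certificate errors are to be ignored, leaving a single exit point. A minimal runnable sketch of the resulting shape, mirroring the diff (the class name and constructor here are illustrative stand-ins, not pydruid's API):

    import ssl

    class ClientSketch:
        # Hypothetical stand-in for the PyDruid client; only the flag
        # consulted by ssl_context() is modeled here.
        def __init__(self, ignore_certificate_errors=False):
            self.ignore_certificate_errors = ignore_certificate_errors

        def ssl_context(self):
            # Start from a default, fully verifying SSL context.
            ctx = ssl.create_default_context()
            # Relax verification only when the client was configured to
            # ignore certificate errors; otherwise ctx is returned unmodified.
            if self.ignore_certificate_errors:
                ctx.check_hostname = False
                ctx.verify_mode = ssl.CERT_NONE
            return ctx

Either form returns a verifying context unless ignore_certificate_errors is set; the rewrite changes only the control flow, not the behavior.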

1 file changed: +39 -168 lines changed

pydruid/client.py

Lines changed: 39 additions & 168 deletions
@@ -53,7 +53,7 @@ def _prepare_url_headers_and_body(self, query):
             username_password = \
                 b64encode(bytes('{}:{}'.format(self.username, self.password)))
             headers['Authorization'] = 'Basic {}'.format(username_password)
-
+
         return headers, querystr, url

     def _post(self, query):
@@ -71,10 +71,8 @@ def _post(self, query):

     def topn(self, **kwargs):
         """
-        A TopN query returns a set of the values in a given dimension,
-        sorted by a specified metric. Conceptually, a topN can be
-        thought of as an approximate GroupByQuery over a single
-        dimension with an Ordering spec. TopNs are
+        A TopN query returns a set of the values in a given dimension, sorted by a specified metric. Conceptually, a
+        topN can be thought of as an approximate GroupByQuery over a single dimension with an Ordering spec. TopNs are
         faster and more resource efficient than GroupBy for this use case.

         Required key/value pairs:
@@ -83,8 +81,7 @@ def topn(self, **kwargs):
         :param str granularity: Aggregate data by hour, day, minute, etc.,
         :param intervals: ISO-8601 intervals of data to query
         :type intervals: str or list
-        :param dict aggregations: A map from aggregator name to one of
-            the pydruid.utils.aggregators e.g., doublesum
+        :param dict aggregations: A map from aggregator name to one of the pydruid.utils.aggregators e.g., doublesum
         :param str dimension: Dimension to run the query against
         :param str metric: Metric over which to sort the specified dimension by
         :param int threshold: How many of the top items to return
@@ -94,10 +91,8 @@ def topn(self, **kwargs):

         Optional key/value pairs:

-        :param pydruid.utils.filters.Filter filter: Indicates which rows
-            of data to include in the query
-        :param post_aggregations: A dict with string key = 'post_aggregator_name',
-            and value pydruid.utils.PostAggregator
+        :param pydruid.utils.filters.Filter filter: Indicates which rows of data to include in the query
+        :param post_aggregations: A dict with string key = 'post_aggregator_name', and value pydruid.utils.PostAggregator
         :param dict context: A dict of query context options

         Example:
@@ -117,35 +112,30 @@ def topn(self, **kwargs):
                 context={"timeout": 1000}
             )
         >>> print top
-        >>> [{'timestamp': '2013-06-14T00:00:00.000Z',
-              'result': [{'count': 22.0, 'user': "cool_user"}}]}]
+        >>> [{'timestamp': '2013-06-14T00:00:00.000Z', 'result': [{'count': 22.0, 'user': "cool_user"}}]}]
         """
         query = self.query_builder.topn(kwargs)
         return self._post(query)

     def timeseries(self, **kwargs):
         """
-        A timeseries query returns the values of the requested metrics (in aggregate)
-        for each timestamp.
+        A timeseries query returns the values of the requested metrics (in aggregate) for each timestamp.

         Required key/value pairs:

         :param str datasource: Data source to query
         :param str granularity: Time bucket to aggregate data by hour, day, minute, etc.,
         :param intervals: ISO-8601 intervals for which to run the query on
         :type intervals: str or list
-        :param dict aggregations: A map from aggregator name to one of the
-            ``pydruid.utils.aggregators`` e.g., ``doublesum``
+        :param dict aggregations: A map from aggregator name to one of the pydruid.utils.aggregators e.g., doublesum

         :return: The query result
         :rtype: Query

         Optional key/value pairs:

-        :param pydruid.utils.filters.Filter filter: Indicates which rows of
-            data to include in the query
-        :param post_aggregations: A dict with string key =
-            'post_aggregator_name', and value pydruid.utils.PostAggregator
+        :param pydruid.utils.filters.Filter filter: Indicates which rows of data to include in the query
+        :param post_aggregations: A dict with string key = 'post_aggregator_name', and value pydruid.utils.PostAggregator
         :param dict context: A dict of query context options

         Example:
@@ -157,49 +147,39 @@ def timeseries(self, **kwargs):
                 datasource=twitterstream,
                 granularity='hour',
                 intervals='2013-06-14/pt1h',
-                aggregations=\
-                    {"count": doublesum("count"), "rows": count("rows")},
-                post_aggregations=\
-                    {'percent': (Field('count') / Field('rows')) * Const(100))},
+                aggregations={"count": doublesum("count"), "rows": count("rows")},
+                post_aggregations={'percent': (Field('count') / Field('rows')) * Const(100))},
                 context={"timeout": 1000}
             )
         >>> print counts
-        >>> [{'timestamp': '2013-06-14T00:00:00.000Z',
-              'result': {'count': 9619.0, 'rows': 8007,
-              'percent': 120.13238416385663}}]
+        >>> [{'timestamp': '2013-06-14T00:00:00.000Z', 'result': {'count': 9619.0, 'rows': 8007, 'percent': 120.13238416385663}}]
         """
         query = self.query_builder.timeseries(kwargs)
         return self._post(query)

     def groupby(self, **kwargs):
         """
-        A group-by query groups a results set (the requested aggregate
-        metrics) by the specified dimension(s).
+        A group-by query groups a results set (the requested aggregate metrics) by the specified dimension(s).

         Required key/value pairs:

         :param str datasource: Data source to query
         :param str granularity: Time bucket to aggregate data by hour, day, minute, etc.,
         :param intervals: ISO-8601 intervals for which to run the query on
         :type intervals: str or list
-        :param dict aggregations: A map from aggregator name to one of the
-            ``pydruid.utils.aggregators`` e.g., ``doublesum``
+        :param dict aggregations: A map from aggregator name to one of the pydruid.utils.aggregators e.g., doublesum
         :param list dimensions: The dimensions to group by

         :return: The query result
         :rtype: Query

         Optional key/value pairs:

-        :param pydruid.utils.filters.Filter filter: Indicates which rows of
-            data to include in the query
-        :param pydruid.utils.having.Having having: Indicates which groups
-            in results set of query to keep
-        :param post_aggregations: A dict with string key = 'post_aggregator_name',
-            and value pydruid.utils.PostAggregator
+        :param pydruid.utils.filters.Filter filter: Indicates which rows of data to include in the query
+        :param pydruid.utils.having.Having having: Indicates which groups in results set of query to keep
+        :param post_aggregations: A dict with string key = 'post_aggregator_name', and value pydruid.utils.PostAggregator
         :param dict context: A dict of query context options
-        :param dict limit_spec: A dict of parameters defining how to limit
-            the rows returned, as specified in the Druid api documentation
+        :param dict limit_spec: A dict of parameters defining how to limit the rows returned, as specified in the Druid api documentation

         Example:

@@ -222,25 +202,8 @@ def groupby(self, **kwargs):
             )
         >>> for k in range(2):
         ... print group[k]
-        >>> {
-                'timestamp': '2013-10-04T00:00:00.000Z',
-                'version': 'v1',
-                'event': {
-                    'count': 1.0,
-                    'user_name': 'user_1',
-                    'reply_to_name': 'user_2',
-                }
-            }
-        >>> {
-                'timestamp': '2013-10-04T00:00:00.000Z',
-                'version': 'v1',
-                'event': {
-                    'count': 1.0,
-                    'user_name': 'user_2',
-                    'reply_to_name':
-                        'user_3',
-                }
-            }
+        >>> {'timestamp': '2013-10-04T00:00:00.000Z', 'version': 'v1', 'event': {'count': 1.0, 'user_name': 'user_1', 'reply_to_name': 'user_2'}}
+        >>> {'timestamp': '2013-10-04T00:00:00.000Z', 'version': 'v1', 'event': {'count': 1.0, 'user_name': 'user_2', 'reply_to_name': 'user_3'}}
         """
         query = self.query_builder.groupby(kwargs)
         return self._post(query)
@@ -274,17 +237,11 @@ def segment_metadata(self, **kwargs):
         .. code-block:: python
             :linenos:

-        >>> meta = client.segment_metadata(
-                datasource='twitterstream', intervals = '2013-10-04/pt1h')
+        >>> meta = client.segment_metadata(datasource='twitterstream', intervals = '2013-10-04/pt1h')
         >>> print meta[0].keys()
         >>> ['intervals', 'id', 'columns', 'size']
         >>> print meta[0]['columns']['tweet_length']
-        >>> {
-                'errorMessage': None,
-                'cardinality': None,
-                'type': 'FLOAT',
-                'size': 30908008,
-            }
+        >>> {'errorMessage': None, 'cardinality': None, 'type': 'FLOAT', 'size': 30908008}

         """
         query = self.query_builder.segment_metadata(kwargs)
@@ -312,13 +269,7 @@ def time_boundary(self, **kwargs):

         >>> bound = client.time_boundary(datasource='twitterstream')
         >>> print bound
-        >>> [{
-                'timestamp': '2011-09-14T15:00:00.000Z',
-                'result': {
-                    'minTime': '2011-09-14T15:00:00.000Z',
-                    'maxTime': '2014-03-04T23:44:00.000Z',
-                }
-            }]
+        >>> [{'timestamp': '2011-09-14T15:00:00.000Z', 'result': {'minTime': '2011-09-14T15:00:00.000Z', 'maxTime': '2014-03-04T23:44:00.000Z'}}]
         """
         query = self.query_builder.time_boundary(kwargs)
         return self._post(query)
@@ -337,12 +288,9 @@ def select(self, **kwargs):

         Optional key/value pairs:

-        :param pydruid.utils.filters.Filter filter: Indicates which rows of
-            data to include in the query
-        :param list dimensions: The list of dimensions to select. If left
-            empty, all dimensions are returned
-        :param list metrics: The list of metrics to select. If left empty,
-            all metrics are returned
+        :param pydruid.utils.filters.Filter filter: Indicates which rows of data to include in the query
+        :param list dimensions: The list of dimensions to select. If left empty, all dimensions are returned
+        :param list metrics: The list of metrics to select. If left empty, all metrics are returned
         :param dict context: A dict of query context options

         :return: The query result
@@ -360,22 +308,8 @@ def select(self, **kwargs):
                 paging_spec={'pagingIdentifies': {}, 'threshold': 1},
                 context={"timeout": 1000}
             )
-        >>> print(raw_data)
-        >>> [{
-                'timestamp': '2013-06-14T00:00:00.000Z',
-                'result': {
-                    'pagingIdentifiers': {
-                        'twitterstream_...08:00:00.000Z_v1': 1,
-                    'events': [{
-                        'segmentId': 'twitterstr...000Z_v1',
-                        'offset': 0,
-                        'event': {
-                            'timestamp': '2013-06-14T00:00:00.000Z',
-                            'dim': 'value',
-                        }
-                    }]
-                }
-            }]
+        >>> print raw_data
+        >>> [{'timestamp': '2013-06-14T00:00:00.000Z', 'result': {'pagingIdentifiers': {'twitterstream_2013-06-14T00:00:00.000Z_2013-06-15T00:00:00.000Z_2013-06-15T08:00:00.000Z_v1': 1, 'events': [{'segmentId': 'twitterstream_2013-06-14T00:00:00.000Z_2013-06-15T00:00:00.000Z_2013-06-15T08:00:00.000Z_v1', 'offset': 0, 'event': {'timestamp': '2013-06-14T00:00:00.000Z', 'dim': 'value'}}]}}]
         """
         query = self.query_builder.select(kwargs)
         return self._post(query)
@@ -388,8 +322,7 @@ def export_tsv(self, dest_path):
         Use Query.export_tsv() method instead.
         """
         if self.query_builder.last_query is None:
-            raise AttributeError(
-                "There was no query executed by this client yet. Can't export!")
+            raise AttributeError("There was no query executed by this client yet. Can't export!")
         else:
             return self.query_builder.last_query.export_tsv(dest_path)

@@ -401,17 +334,15 @@ def export_pandas(self):
         Use Query.export_pandas() method instead
         """
         if self.query_builder.last_query is None:
-            raise AttributeError(
-                "There was no query executed by this client yet. Can't export!")
+            raise AttributeError("There was no query executed by this client yet. Can't export!")
         else:
             return self.query_builder.last_query.export_pandas()


 class PyDruid(BaseDruidClient):
     """
-    PyDruid contains the functions for creating and executing Druid queries.
-    Returns Query objects that can be used for exporting query results
-    into TSV files or pandas.DataFrame objects for subsequent analysis.
+    PyDruid contains the functions for creating and executing Druid queries. Returns Query objects that can be used
+    for exporting query results into TSV files or pandas.DataFrame objects for subsequent analysis.

     :param str url: URL of Broker node in the Druid cluster
     :param str endpoint: Endpoint that Broker listens for queries on
@@ -460,18 +391,8 @@ class PyDruid(BaseDruidClient):
             }

     >>> print top.result
-    >>> [{
-            'timestamp': '2013-10-04T00:00:00.000Z',
-            'result': [
-                {
-                    'count': 7.0,
-                    'user_name': 'user_1',
-                },
-                {
-                    'count': 6.0,
-                    'user_name': 'user_2',
-                },
-            ]}]
+    >>> [{'timestamp': '2013-10-04T00:00:00.000Z',
+          'result': [{'count': 7.0, 'user_name': 'user_1'}, {'count': 6.0, 'user_name': 'user_2'}]}]

     >>> df = top.export_pandas()
     >>> print df
@@ -484,10 +405,9 @@ def __init__(self, url, endpoint):

     def ssl_context(self):
         ctx = ssl.create_default_context()
-        if not self.ignore_certificate_errors:
-            return ctx
-        ctx.check_hostname = False
-        ctx.verify_mode = ssl.CERT_NONE
+        if self.ignore_certificate_errors:
+            ctx.check_hostname = False
+            ctx.verify_mode = ssl.CERT_NONE
         return ctx

     def _post(self, query):
@@ -512,52 +432,3 @@ def _post(self, query):
         else:
             query.parse(data)
         return query
-
-    def scan(self, **kwargs):
-        """
-        A scan query returns raw Druid rows
-
-        Required key/value pairs:
-
-        :param str datasource: Data source to query
-        :param str granularity: Time bucket to aggregate data by hour, day, minute, etc.
-        :param int limit: The maximum number of rows to return
-        :param intervals: ISO-8601 intervals for which to run the query on
-        :type intervals: str or list
-
-        Optional key/value pairs:
-
-        :param pydruid.utils.filters.Filter filter: Indicates which rows of
-            data to include in the query
-        :param list dimensions: The list of dimensions to select. If left
-            empty, all dimensions are returned
-        :param list metrics: The list of metrics to select. If left empty,
-            all metrics are returned
-        :param dict context: A dict of query context options
-
-        :return: The query result
-        :rtype: Query
-
-        Example:
-
-        .. code-block:: python
-            :linenos:
-
-        >>> raw_data = client.scan(
-                datasource=twitterstream,
-                granularity='all',
-                intervals='2013-06-14/pt1h',
-                limit=1,
-                context={"timeout": 1000}
-            )
-        >>> print raw_data
-        >>> [{
-                u'segmentId': u'zzzz',
-                u'columns': [u'__time', 'status', 'region'],
-                'events': [{
-                    u'status': u'ok', 'region': u'SF', u'__time': 1509494400000,
-                }]
-            }]
-        """
-        query = self.query_builder.scan(kwargs)
-        return self._post(query)
