@@ -53,7 +53,7 @@ def _prepare_url_headers_and_body(self, query):
         username_password = \
             b64encode(bytes('{}:{}'.format(self.username, self.password)))
         headers['Authorization'] = 'Basic {}'.format(username_password)
-
+
         return headers, querystr, url
 
     def _post(self, query):
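
A side note on the unchanged lines above: on Python 3, ``bytes('{}:{}'.format(...))`` raises a ``TypeError`` without an explicit encoding, and ``b64encode`` returns bytes, so the formatted header would read ``Basic b'...'``. A minimal standalone sketch of the intended construction, assuming nothing from pydruid (the helper name is hypothetical):

.. code-block:: python

    from base64 import b64encode

    def basic_auth_header(username, password):
        # Encode to bytes before base64-encoding; b64encode returns bytes,
        # so decode back to str for use as a header value.
        token = b64encode(
            '{}:{}'.format(username, password).encode('utf-8')).decode('ascii')
        return {'Authorization': 'Basic {}'.format(token)}

    print(basic_auth_header('druid', 'secret'))
    # {'Authorization': 'Basic ZHJ1aWQ6c2VjcmV0'}
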
@@ -71,10 +71,8 @@ def _post(self, query):
 
     def topn(self, **kwargs):
         """
-        A TopN query returns a set of the values in a given dimension,
-        sorted by a specified metric. Conceptually, a topN can be
-        thought of as an approximate GroupByQuery over a single
-        dimension with an Ordering spec. TopNs are
+        A TopN query returns a set of the values in a given dimension, sorted by a specified metric. Conceptually, a
+        topN can be thought of as an approximate GroupByQuery over a single dimension with an Ordering spec. TopNs are
         faster and more resource efficient than GroupBy for this use case.
 
         Required key/value pairs:
@@ -83,8 +81,7 @@ def topn(self, **kwargs):
         :param str granularity: Aggregate data by hour, day, minute, etc.,
         :param intervals: ISO-8601 intervals of data to query
         :type intervals: str or list
-        :param dict aggregations: A map from aggregator name to one of
-            the pydruid.utils.aggregators e.g., doublesum
+        :param dict aggregations: A map from aggregator name to one of the pydruid.utils.aggregators e.g., doublesum
         :param str dimension: Dimension to run the query against
         :param str metric: Metric over which to sort the specified dimension by
         :param int threshold: How many of the top items to return
@@ -94,10 +91,8 @@ def topn(self, **kwargs):
 
         Optional key/value pairs:
 
-        :param pydruid.utils.filters.Filter filter: Indicates which rows
-            of data to include in the query
-        :param post_aggregations: A dict with string key = 'post_aggregator_name',
-            and value pydruid.utils.PostAggregator
+        :param pydruid.utils.filters.Filter filter: Indicates which rows of data to include in the query
+        :param post_aggregations: A dict with string key = 'post_aggregator_name', and value pydruid.utils.PostAggregator
         :param dict context: A dict of query context options
 
         Example:
@@ -117,35 +112,30 @@ def topn(self, **kwargs):
                     context={"timeout": 1000}
                 )
             >>> print top
-            >>> [{'timestamp': '2013-06-14T00:00:00.000Z',
-                'result': [{'count': 22.0, 'user': "cool_user"}]}]
+            >>> [{'timestamp': '2013-06-14T00:00:00.000Z', 'result': [{'count': 22.0, 'user': "cool_user"}]}]
         """
         query = self.query_builder.topn(kwargs)
         return self._post(query)
 
     def timeseries(self, **kwargs):
         """
-        A timeseries query returns the values of the requested metrics (in aggregate)
-        for each timestamp.
+        A timeseries query returns the values of the requested metrics (in aggregate) for each timestamp.
 
         Required key/value pairs:
 
         :param str datasource: Data source to query
         :param str granularity: Time bucket to aggregate data by hour, day, minute, etc.,
         :param intervals: ISO-8601 intervals for which to run the query on
         :type intervals: str or list
-        :param dict aggregations: A map from aggregator name to one of the
-            ``pydruid.utils.aggregators`` e.g., ``doublesum``
+        :param dict aggregations: A map from aggregator name to one of the pydruid.utils.aggregators e.g., doublesum
 
         :return: The query result
         :rtype: Query
 
         Optional key/value pairs:
 
-        :param pydruid.utils.filters.Filter filter: Indicates which rows of
-            data to include in the query
-        :param post_aggregations: A dict with string key =
-            'post_aggregator_name', and value pydruid.utils.PostAggregator
+        :param pydruid.utils.filters.Filter filter: Indicates which rows of data to include in the query
+        :param post_aggregations: A dict with string key = 'post_aggregator_name', and value pydruid.utils.PostAggregator
         :param dict context: A dict of query context options
 
         Example:
@@ -157,49 +147,39 @@ def timeseries(self, **kwargs):
                     datasource=twitterstream,
                     granularity='hour',
                     intervals='2013-06-14/pt1h',
-                    aggregations=\
-                        {"count": doublesum("count"), "rows": count("rows")},
-                    post_aggregations=\
-                        {'percent': (Field('count') / Field('rows')) * Const(100)},
+                    aggregations={"count": doublesum("count"), "rows": count("rows")},
+                    post_aggregations={'percent': (Field('count') / Field('rows')) * Const(100)},
                     context={"timeout": 1000}
                 )
             >>> print counts
-            >>> [{'timestamp': '2013-06-14T00:00:00.000Z',
-                'result': {'count': 9619.0, 'rows': 8007,
-                'percent': 120.13238416385663}}]
+            >>> [{'timestamp': '2013-06-14T00:00:00.000Z', 'result': {'count': 9619.0, 'rows': 8007, 'percent': 120.13238416385663}}]
         """
         query = self.query_builder.timeseries(kwargs)
         return self._post(query)
 
     def groupby(self, **kwargs):
         """
-        A group-by query groups a results set (the requested aggregate
-        metrics) by the specified dimension(s).
+        A group-by query groups a results set (the requested aggregate metrics) by the specified dimension(s).
 
         Required key/value pairs:
 
         :param str datasource: Data source to query
         :param str granularity: Time bucket to aggregate data by hour, day, minute, etc.,
         :param intervals: ISO-8601 intervals for which to run the query on
         :type intervals: str or list
-        :param dict aggregations: A map from aggregator name to one of the
-            ``pydruid.utils.aggregators`` e.g., ``doublesum``
+        :param dict aggregations: A map from aggregator name to one of the pydruid.utils.aggregators e.g., doublesum
         :param list dimensions: The dimensions to group by
 
         :return: The query result
         :rtype: Query
 
         Optional key/value pairs:
 
-        :param pydruid.utils.filters.Filter filter: Indicates which rows of
-            data to include in the query
-        :param pydruid.utils.having.Having having: Indicates which groups
-            in results set of query to keep
-        :param post_aggregations: A dict with string key = 'post_aggregator_name',
-            and value pydruid.utils.PostAggregator
+        :param pydruid.utils.filters.Filter filter: Indicates which rows of data to include in the query
+        :param pydruid.utils.having.Having having: Indicates which groups in results set of query to keep
+        :param post_aggregations: A dict with string key = 'post_aggregator_name', and value pydruid.utils.PostAggregator
         :param dict context: A dict of query context options
-        :param dict limit_spec: A dict of parameters defining how to limit
-            the rows returned, as specified in the Druid api documentation
+        :param dict limit_spec: A dict of parameters defining how to limit the rows returned, as specified in the Druid API documentation
 
         Example:
@@ -222,25 +202,8 @@ def groupby(self, **kwargs):
                 )
             >>> for k in range(2):
             ...     print group[k]
-            >>> {
-                'timestamp': '2013-10-04T00:00:00.000Z',
-                'version': 'v1',
-                'event': {
-                    'count': 1.0,
-                    'user_name': 'user_1',
-                    'reply_to_name': 'user_2',
-                }
-            }
-            >>> {
-                'timestamp': '2013-10-04T00:00:00.000Z',
-                'version': 'v1',
-                'event': {
-                    'count': 1.0,
-                    'user_name': 'user_2',
-                    'reply_to_name': 'user_3',
-                }
-            }
+            >>> {'timestamp': '2013-10-04T00:00:00.000Z', 'version': 'v1', 'event': {'count': 1.0, 'user_name': 'user_1', 'reply_to_name': 'user_2'}}
+            >>> {'timestamp': '2013-10-04T00:00:00.000Z', 'version': 'v1', 'event': {'count': 1.0, 'user_name': 'user_2', 'reply_to_name': 'user_3'}}
         """
         query = self.query_builder.groupby(kwargs)
         return self._post(query)
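
The ``limit_spec`` parameter documented above passes through to Druid's groupBy ``limitSpec`` object. A hedged sketch of a full call, using the ``doublesum`` aggregator and ``Dimension`` filter helpers the docstrings reference; ``client`` is a PyDruid instance as in the examples, and the datasource and field names are illustrative:

.. code-block:: python

    from pydruid.utils.aggregators import doublesum
    from pydruid.utils.filters import Dimension

    group = client.groupby(
        datasource='twitterstream',
        granularity='hour',
        intervals='2013-10-04/pt12h',
        dimensions=['user_name'],
        filter=Dimension('user_lang') == 'en',
        aggregations={'count': doublesum('count')},
        # Druid's 'default' limitSpec: order by the listed columns and
        # keep at most five grouped rows.
        limit_spec={'type': 'default', 'limit': 5, 'columns': ['user_name']},
    )
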
@@ -274,17 +237,11 @@ def segment_metadata(self, **kwargs):
         .. code-block:: python
             :linenos:
 
-            >>> meta = client.segment_metadata(
-                    datasource='twitterstream', intervals='2013-10-04/pt1h')
+            >>> meta = client.segment_metadata(datasource='twitterstream', intervals='2013-10-04/pt1h')
             >>> print meta[0].keys()
             >>> ['intervals', 'id', 'columns', 'size']
             >>> print meta[0]['columns']['tweet_length']
-            >>> {
-                'errorMessage': None,
-                'cardinality': None,
-                'type': 'FLOAT',
-                'size': 30908008,
-            }
+            >>> {'errorMessage': None, 'cardinality': None, 'type': 'FLOAT', 'size': 30908008}
 
         """
         query = self.query_builder.segment_metadata(kwargs)
@@ -312,13 +269,7 @@ def time_boundary(self, **kwargs):
 
             >>> bound = client.time_boundary(datasource='twitterstream')
             >>> print bound
-            >>> [{
-                'timestamp': '2011-09-14T15:00:00.000Z',
-                'result': {
-                    'minTime': '2011-09-14T15:00:00.000Z',
-                    'maxTime': '2014-03-04T23:44:00.000Z',
-                }
-            }]
+            >>> [{'timestamp': '2011-09-14T15:00:00.000Z', 'result': {'minTime': '2011-09-14T15:00:00.000Z', 'maxTime': '2014-03-04T23:44:00.000Z'}}]
         """
         query = self.query_builder.time_boundary(kwargs)
         return self._post(query)
@@ -337,12 +288,9 @@ def select(self, **kwargs):
 
         Optional key/value pairs:
 
-        :param pydruid.utils.filters.Filter filter: Indicates which rows of
-            data to include in the query
-        :param list dimensions: The list of dimensions to select. If left
-            empty, all dimensions are returned
-        :param list metrics: The list of metrics to select. If left empty,
-            all metrics are returned
+        :param pydruid.utils.filters.Filter filter: Indicates which rows of data to include in the query
+        :param list dimensions: The list of dimensions to select. If left empty, all dimensions are returned
+        :param list metrics: The list of metrics to select. If left empty, all metrics are returned
         :param dict context: A dict of query context options
 
         :return: The query result
@@ -360,22 +308,8 @@ def select(self, **kwargs):
                     paging_spec={'pagingIdentifiers': {}, 'threshold': 1},
                     context={"timeout": 1000}
                 )
-            >>> print(raw_data)
-            >>> [{
-                'timestamp': '2013-06-14T00:00:00.000Z',
-                'result': {
-                    'pagingIdentifiers': {
-                        'twitterstream_...08:00:00.000Z_v1': 1},
-                    'events': [{
-                        'segmentId': 'twitterstr...000Z_v1',
-                        'offset': 0,
-                        'event': {
-                            'timestamp': '2013-06-14T00:00:00.000Z',
-                            'dim': 'value',
-                        }
-                    }]
-                }
-            }]
+            >>> print raw_data
+            >>> [{'timestamp': '2013-06-14T00:00:00.000Z', 'result': {'pagingIdentifiers': {'twitterstream_2013-06-14T00:00:00.000Z_2013-06-15T00:00:00.000Z_2013-06-15T08:00:00.000Z_v1': 1}, 'events': [{'segmentId': 'twitterstream_2013-06-14T00:00:00.000Z_2013-06-15T00:00:00.000Z_2013-06-15T08:00:00.000Z_v1', 'offset': 0, 'event': {'timestamp': '2013-06-14T00:00:00.000Z', 'dim': 'value'}}]}}]
         """
         query = self.query_builder.select(kwargs)
         return self._post(query)
@@ -388,8 +322,7 @@ def export_tsv(self, dest_path):
         Use Query.export_tsv() method instead.
         """
         if self.query_builder.last_query is None:
-            raise AttributeError(
-                "There was no query executed by this client yet. Can't export!")
+            raise AttributeError("There was no query executed by this client yet. Can't export!")
         else:
             return self.query_builder.last_query.export_tsv(dest_path)
 
@@ -401,17 +334,15 @@ def export_pandas(self):
         Use Query.export_pandas() method instead
         """
         if self.query_builder.last_query is None:
-            raise AttributeError(
-                "There was no query executed by this client yet. Can't export!")
+            raise AttributeError("There was no query executed by this client yet. Can't export!")
         else:
             return self.query_builder.last_query.export_pandas()
 
 
 class PyDruid(BaseDruidClient):
     """
-    PyDruid contains the functions for creating and executing Druid queries.
-    Returns Query objects that can be used for exporting query results
-    into TSV files or pandas.DataFrame objects for subsequent analysis.
+    PyDruid contains the functions for creating and executing Druid queries. Returns Query objects that can be used
+    for exporting query results into TSV files or pandas.DataFrame objects for subsequent analysis.
 
     :param str url: URL of Broker node in the Druid cluster
     :param str endpoint: Endpoint that Broker listens for queries on
@@ -460,18 +391,8 @@ class PyDruid(BaseDruidClient):
             }
 
         >>> print top.result
-        >>> [{
-            'timestamp': '2013-10-04T00:00:00.000Z',
-            'result': [
-                {
-                    'count': 7.0,
-                    'user_name': 'user_1',
-                },
-                {
-                    'count': 6.0,
-                    'user_name': 'user_2',
-                },
-            ]}]
+        >>> [{'timestamp': '2013-10-04T00:00:00.000Z',
+            'result': [{'count': 7.0, 'user_name': 'user_1'}, {'count': 6.0, 'user_name': 'user_2'}]}]
 
         >>> df = top.export_pandas()
         >>> print df
@@ -484,10 +405,9 @@ def __init__(self, url, endpoint):
 
     def ssl_context(self):
         ctx = ssl.create_default_context()
-        if not self.ignore_certificate_errors:
-            return ctx
-        ctx.check_hostname = False
-        ctx.verify_mode = ssl.CERT_NONE
+        if self.ignore_certificate_errors:
+            ctx.check_hostname = False
+            ctx.verify_mode = ssl.CERT_NONE
         return ctx
 
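
The rewritten ``ssl_context`` relaxes certificate checks only when ``ignore_certificate_errors`` is set, instead of unconditionally disabling verification after the early return. For reference, the same stdlib pattern outside pydruid (the flag name here is chosen for illustration):

.. code-block:: python

    import ssl
    from urllib.request import urlopen

    def make_context(ignore_certificate_errors=False):
        ctx = ssl.create_default_context()
        if ignore_certificate_errors:
            # Hostname checking must be disabled first; setting CERT_NONE
            # while check_hostname is still True raises a ValueError.
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
        return ctx

    # e.g. urlopen(request, context=make_context(True))
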
493413 def _post (self , query ):
@@ -512,52 +432,3 @@ def _post(self, query):
         else:
             query.parse(data)
         return query
-
-    def scan(self, **kwargs):
-        """
-        A scan query returns raw Druid rows
-
-        Required key/value pairs:
-
-        :param str datasource: Data source to query
-        :param str granularity: Time bucket to aggregate data by hour, day, minute, etc.
-        :param int limit: The maximum number of rows to return
-        :param intervals: ISO-8601 intervals for which to run the query on
-        :type intervals: str or list
-
-        Optional key/value pairs:
-
-        :param pydruid.utils.filters.Filter filter: Indicates which rows of
-            data to include in the query
-        :param list dimensions: The list of dimensions to select. If left
-            empty, all dimensions are returned
-        :param list metrics: The list of metrics to select. If left empty,
-            all metrics are returned
-        :param dict context: A dict of query context options
-
-        :return: The query result
-        :rtype: Query
-
-        Example:
-
-        .. code-block:: python
-            :linenos:
-
-            >>> raw_data = client.scan(
-                    datasource=twitterstream,
-                    granularity='all',
-                    intervals='2013-06-14/pt1h',
-                    limit=1,
-                    context={"timeout": 1000}
-                )
-            >>> print raw_data
-            >>> [{
-                u'segmentId': u'zzzz',
-                u'columns': [u'__time', 'status', 'region'],
-                'events': [{
-                    u'status': u'ok', 'region': u'SF', u'__time': 1509494400000,
-                }]
-            }]
-        """
-        query = self.query_builder.scan(kwargs)
-        return self._post(query)