Skip to content
This repository was archived by the owner on Jun 19, 2025. It is now read-only.

Commit 2c68b6e

Browse files
author
ARF
committed
introduce query transformer infrastructure & sample implementations
1 parent 453ec25 commit 2c68b6e

File tree

2 files changed

+351
-0
lines changed

2 files changed

+351
-0
lines changed

bquery/ctable.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,133 @@
88
import os
99
from bquery.ctable_ext import \
1010
SUM, COUNT, COUNT_NA, COUNT_DISTINCT, SORTED_COUNT_DISTINCT
11+
import ast
12+
from types import ModuleType
13+
try:
14+
import meta
15+
except ImportError:
16+
pass
1117

1218

1319
class ctable(bcolz.ctable):
20+
###
21+
### Overriding bcolz.ctable methods
22+
###
23+
24+
def __init__(self, *args, **kwargs):
    """Initialise the query-transformation state, then defer to bcolz.

    All positional and keyword arguments are forwarded untouched to the
    parent constructor; the two bquery attributes are assigned first.
    """
    # value handed to bcolz as `cparams` when intermediate results of
    # transformed queries are evaluated
    self.intermediary_cparams = None
    # query transformers applied to string queries; an empty list leaves
    # queries untouched
    self._transformers = []
    super(ctable, self).__init__(*args, **kwargs)
28+
29+
def eval(self, expression, **kwargs):
    """Evaluate `expression` against the columns of this table.

    String expressions are first rewritten by the configured query
    transformers (if any). An expression that is, or is rewritten to, the
    literal ``'True'`` or ``'False'`` is answered directly with a constant
    boolean array instead of being handed to `bcolz.eval`.

    Parameters
    ----------
    expression : string or other
        The expression to evaluate. Only strings are transformed; anything
        else is passed to `bcolz.eval` as is.
    **kwargs
        ``user_dict`` (extra variables addressable by name) is consumed
        here. ``out_flavor`` is consumed only for the trivial
        ``'True'``/``'False'`` case. Everything else is forwarded to
        `bcolz.eval`.

    Returns
    -------
    out : numpy.ndarray, bcolz.carray or the result of `bcolz.eval`
        For trivial expressions a boolean array with ``len(self)`` entries:
        a numpy array if the out flavor is ``'numpy'``, a `bcolz.carray`
        otherwise.
    """
    # TODO: clean this up/simplify once Blosc/bcolz#164 is resolved
    # work on a copy so that the caller's dictionary is never mutated
    user_dict = dict(kwargs.pop('user_dict', None) or {})
    is_query = isinstance(expression, str)

    if self._transformers and is_query:
        # hand the user variables to the transformers instead of
        # discarding them
        expression, user_dict = self.transform_query(expression, user_dict)

    # TODO: clean this up/simplify once Blosc/bcolz#162 is resolved
    if is_query and expression in ('True', 'False'):
        out_flavor = kwargs.pop('out_flavor', None)
        if out_flavor is None:
            out_flavor = bcolz.defaults.eval_out_flavor
        # explicit dtype keeps the result boolean even for an empty table
        constant = np.empty(len(self), dtype=np.bool_)
        constant.fill(expression == 'True')
        if out_flavor == 'numpy':
            return constant
        return bcolz.carray(constant)

    # make every column addressable by name without overriding variables
    # supplied by the caller or added by the transformers
    for name in self.cols:
        user_dict.setdefault(name, self.cols[name])
    return bcolz.eval(expression, user_dict=user_dict, **kwargs)
49+
50+
def where(self, expression, outcols=None, limit=None, skip=0, **kwargs):
    """Iterate over the rows that fulfil `expression`.

    If query transformers are configured and `expression` is a string, it
    is evaluated through :meth:`eval` (which applies the transformers) and
    the resulting boolean array is handed to the parent implementation.

    Parameters
    ----------
    expression : string or boolean array
        The selection criterion.
    outcols, limit, skip
        Passed unchanged to the parent `where`.
    cparams : optional, keyword only
        Passed as `cparams` to :meth:`eval` for the intermediate boolean
        array; defaults to :attr:`intermediary_cparams`.

    Raises
    ------
    TypeError
        If a keyword argument other than `cparams` is supplied.
    """
    cparams = kwargs.pop('cparams', self.intermediary_cparams)
    if kwargs:
        raise TypeError("where() got an unexpected keyword argument %r"
                        % sorted(kwargs)[0])

    # if query transformers are defined, let eval() transform the query;
    # transforming here as well would rewrite the query twice
    if self._transformers and isinstance(expression, str):
        expression = self.eval(expression, cparams=cparams)

    return super(ctable, self).where(expression, outcols=outcols,
                                     limit=limit, skip=skip)
60+
61+
def __getitem__(self, key):
    """Index the table, treating non-column strings as transformed queries.

    When query transformers are configured and `key` is a string that is
    not a column name, the key is transformed, evaluated and used as a
    boolean selection. Every other key is handled by the parent class.

    Raises
    ------
    IndexError
        If the evaluated query does not have a boolean dtype.
    """
    is_query = (len(self._transformers) > 0
                and isinstance(key, bcolz.py2help._strtypes)
                and key not in self.names)
    if not is_query:
        return super(ctable, self).__getitem__(key)

    # key is not a column name, try to evaluate
    query, variables = self.transform_query(key)
    # TODO: clean this up/simplify once Blosc/bcolz#164 is resolved
    for col_name in self.cols:
        variables[col_name] = self.cols[col_name]
    # NOTE: bcolz.eval is called directly from this method with depth=3;
    # do not wrap the call in a helper, as that would add a stack frame
    mask = bcolz.eval(query, user_dict=variables, depth=3,
                      cparams=self.intermediary_cparams)
    if mask.dtype.type != np.bool_:
        raise IndexError(
            "`key` %s does not represent a boolean "
            "expression" % query)
    # TODO: clean this up once Blosc/bcolz#162 is resolved
    if mask == False:
        # nothing selected: return an empty array with one field per column
        row_dtype = np.dtype([(col_name, self.cols[col_name].dtype)
                              for col_name in self.names])
        return np.empty(0, dtype=row_dtype).view(np.ndarray)
    return self._where(mask)
84+
85+
###
86+
### Extending bcolz.ctable
87+
###
88+
89+
@property
def transformers(self):
    """The list of :class:`QueryTransformer` instances that are applied
    automatically to all query strings.

    Assigning to this attribute raises :class:`RuntimeError` when the
    optional module `meta` could not be imported.
    """
    return self._transformers

@transformers.setter
def transformers(self, value):
    # Query transformation is only enabled if the required module is
    # installed. Its import at module level is optional, so the name
    # `meta` may be unbound here.
    try:
        meta_available = isinstance(meta, ModuleType)
    except NameError:
        meta_available = False
    if not meta_available:
        raise RuntimeError(
            'Query transformation requires the module `meta`.')
    self._transformers = value
107+
108+
def transform_query(self, query, user_dict=None):
    """transform_query(query, user_dict=None)

    Applies the :class:`QueryTransformer` instances configured in
    :attr:`self.transformers` to the `query`.

    Parameters
    ----------
    query : string
        A string forming a boolean expression, like
        "(col1 == 'Example') & (col2 != 'Text')".
    user_dict : dict, optional
        Dictionary that is passed to every transformer, so that they can
        read and add variables. A new dictionary is created if omitted.

    Returns
    -------
    out : (string, dict)
        A tuple containing the transformed query string and a dictionary
        where the variables added by the transformer can be found by name.
        If no transformers are configured, `query` is returned unchanged.

    Raises
    ------
    SyntaxError
        If `query` cannot be parsed.
    """
    if user_dict is None:
        user_dict = {}

    # parse first so that malformed queries are always reported, even when
    # there is nothing to transform
    ast_tree = ast.parse(query)

    # nothing to do: always return the documented tuple, and do not depend
    # on the optional `meta` module
    if not self._transformers:
        return query, user_dict

    for transformer in self._transformers:
        ast_tree = transformer.apply(self, ast_tree, user_dict)

    # `meta` converts the transformed syntax tree back into source code
    return meta.dump_python_source(ast_tree).strip(), user_dict
137+
14138
def cache_valid(self, col):
15139
"""
16140
Checks whether the column has a factorization that exists and is not older than the source

bquery/transformers.py

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
from ast import NodeTransformer
2+
from ast import (Eq, NotEq, In, NotIn, BitOr, BitAnd,
3+
Compare, BinOp,
4+
Name, Load, Str,
5+
)
6+
import ast
7+
import copy
8+
9+
import bcolz
10+
11+
# public names of this module
__all__ = [
    'standard_transformers',
    'QueryTransformer',
    'InOperatorTransformer',
    'CachedFactorOptimizer',
    'TrivialBooleanExpressionsOptimizer',
]
17+
18+
class QueryTransformer(NodeTransformer):
    """Base class for query rewriters built on :class:`ast.NodeTransformer`.

    A transformer walks the abstract syntax tree of a query and may modify
    its nodes. While it runs, two attributes are available to the visitor
    methods:

    - :attr:`self.user_dict`: the user-provided dictionary of variables
      that is passed to `bcolz.eval()`; it can be accessed and modified.
    - :attr:`self.ctable_`: the calling :class:`ctable` instance.

    Visitor methods are named ``'visit_'`` + class name of the node, e.g.
    `visit_Compare` for `Compare` nodes. If no visitor method exists for a
    node, `generic_visit` is used instead. The return value of a visitor
    replaces the visited node: return the original node to keep it, a
    different node to replace it, or ``None`` to remove it from its
    location. For statement nodes that are part of a collection of
    statements, a list of nodes may be returned as well.

    If the node you are operating on has child nodes, you must either
    transform the child nodes yourself or call :meth:`generic_visit` for
    the node first.

    Usually you use the transformer like this::
        node = QueryTransformer().apply(ctable_, node, user_dict)
    """

    def apply(self, ctable_, node, user_dict):
        """Store the context on the instance and visit `node`.

        Returns the result of visiting `node`.
        """
        self.ctable_, self.user_dict = ctable_, user_dict
        return self.visit(node)
54+
55+
56+
class InOperatorTransformer(QueryTransformer):
    """A :class:`QueryTransformer` that converts comparisons with `in` and
    `not in` operators into expressions using `==` and `!=`.

    Example:
        `my_col in ['ABC', 'DEF']` is transformed into
        `(my_col == 'ABC') | (my_col == 'DEF')`

    A comparison against an empty sequence is replaced by the name `False`
    (for `in`) or `True` (for `not in`).

    Only single comparisons against a literal list, tuple or set are
    rewritten; chained comparisons and comparisons against anything else
    (e.g. a variable) are left untouched.

    This is useful as Numexpr currently does not support `in` operators."""

    def visit_Compare(self, node):
        # first transform all child nodes if necessary
        node = self.generic_visit(node)

        # a chained comparison (`a in b == c`) cannot be expressed by the
        # rewrite below without dropping operators, so leave it alone
        if len(node.ops) != 1 or not isinstance(node.ops[0], (In, NotIn)):
            return node

        # the elements must be known literally to enumerate them
        container = node.comparators[0]
        if not isinstance(container, (ast.List, ast.Tuple, ast.Set)):
            return node

        # replace `in` comparisons with empty comparison list
        if not container.elts:
            trivial = 'False' if isinstance(node.ops[0], In) else 'True'
            return Name(id=trivial, ctx=Load())

        compare_op, binop_op = self.get_operators(node.ops[0])
        # build one `==` / `!=` comparison per element and join them using
        # the appropriate binary operator, i.e. | or &
        joined = None
        for element in container.elts:
            comparison = Compare(
                # every comparison gets its own copy of the left operand so
                # that later transformers can modify them independently
                left=copy.deepcopy(node.left),
                ops=[compare_op],
                comparators=[element])
            if joined is None:
                joined = comparison
            else:
                joined = BinOp(left=joined, op=binop_op, right=comparison)
        return joined

    def get_operators(self, op):
        """Return the (comparison, joining) operator nodes replacing `op`:
        `==` joined by `|` for `in`, `!=` joined by `&` otherwise."""
        if isinstance(op, In):
            return Eq(), BitOr()
        else:
            return NotEq(), BitAnd()
105+
106+
107+
class CachedFactorOptimizer(QueryTransformer):
    """A :class:`QueryTransformer` that converts comparisons containing
    columns with cached factors into comparisons using the factor instead.

    This potentially speeds up queries significantly:
    - By detecting queries that will not return any values without
      scanning the entire column.
    - By evaluating the comparison on the integer typed factor rather than
      a column of a datatype that is more costly to compare, e.g. String.

    Only `==` / `!=` comparisons between a column and a literal string or
    number are rewritten; everything else is returned unchanged.

    The `CachedFactorOptimizer` should be followed by the
    :class:`TrivialBooleanExpressionsOptimizer` to obtain the full benefit."""

    def visit_Compare(self, node):
        # first transform all child nodes if necessary
        node = self.generic_visit(node)

        # check we have a simple comparison
        if len(node.comparators) != 1 or len(node.ops) != 1:
            return node

        # TODO: we currently do not sort the values of the cached
        #       factors. Therefore we cannot optimize inequalities
        if not isinstance(node.ops[0], (Eq, NotEq)):
            return node

        # col_name == 'value'
        if isinstance(node.left, Name):
            var, val = node.left, node.comparators[0]
        # 'value' == col_name
        elif isinstance(node.comparators[0], Name):
            var, val = node.comparators[0], node.left
        # we can accelerate expressions that contain at least one column ref
        else:
            return node

        # only literal strings and numbers can be looked up in the cached
        # values; other operands (e.g. a second column) are left alone
        if isinstance(val, Str):
            val_field = 's'
        elif isinstance(val, ast.Num):
            val_field = 'n'
        else:
            return node

        col = var.id
        # names that are not columns of the table cannot have a factor cache
        if col not in self.ctable_.names:
            return node
        if not self.ctable_.cache_valid(col):
            return node

        # find factor id for requested value
        literal = getattr(val, val_field)
        col_values_rootdir = self.ctable_[col].rootdir + '.values'
        carray_values = bcolz.carray(rootdir=col_values_rootdir,
                                     mode='r')
        idx = next(
            (index for index, value in enumerate(carray_values.iter())
             if value == literal),
            None)

        # value not in cached factorisation
        if idx is None:
            if isinstance(node.ops[0], Eq):
                return Name(id='False', ctx=Load())
            else:
                return Name(id='True', ctx=Load())

        # found value in cached factorisation:
        # rewrite the comparison expression in place
        factor_name = 'bquery_factors_%s' % col
        setattr(val, val_field, idx)
        var.id = factor_name
        # load the factor for later use
        if factor_name not in self.user_dict:
            col_factor_rootdir = self.ctable_[col].rootdir + '.factor'
            self.user_dict[factor_name] = \
                bcolz.carray(rootdir=col_factor_rootdir, mode='r')
        return node
177+
178+
179+
class TrivialBooleanExpressionsOptimizer(QueryTransformer):
    """A :class:`QueryTransformer` that simplifies boolean expressions
    containing subparts that are trivial boolean expressions, i.e. the
    names `True` or `False`.

    Examples:
        `(my_col == 'ABC') & (False)` is transformed into
        `False`

        `(my_col == 'ABC') | (False)` is transformed into
        `(my_col == 'ABC')`

    This way, queries that can be logically determined to never return any
    entries do not have to be evaluated explicitly against the database."""

    def visit_BinOp(self, node):
        # first transform all child nodes if necessary
        node = self.generic_visit(node)

        # only optimize & and | expressions
        if not isinstance(node.op, (BitOr, BitAnd)):
            return node

        # look for a trivial operand on either side; a Name that is an
        # ordinary variable on the left must not hide a trivial right side
        if self._is_trivial(node.left):
            trivial_operand, other_operand = node.left, node.right
        elif self._is_trivial(node.right):
            trivial_operand, other_operand = node.right, node.left
        else:
            return node

        # `x | True` is `True` and `x & False` is `False`; in the remaining
        # cases (`x | False`, `x & True`) the result is simply `x`
        absorbing = 'True' if isinstance(node.op, BitOr) else 'False'
        if trivial_operand.id == absorbing:
            return trivial_operand
        return other_operand

    @staticmethod
    def _is_trivial(operand):
        """Tell whether `operand` is the name `True` or `False`."""
        return isinstance(operand, Name) and operand.id in ('True', 'False')
223+
224+
# Ready-made list of transformer instances, provided as a convenient
# short-cut for configuring a set of standard transformers. The instances
# are created once, when this module is imported.
standard_transformers = [
    InOperatorTransformer(),
    TrivialBooleanExpressionsOptimizer(),
]

0 commit comments

Comments
 (0)