|
| 1 | +from ast import NodeTransformer |
| 2 | +from ast import (Eq, NotEq, In, NotIn, BitOr, BitAnd, |
| 3 | + Compare, BinOp, |
| 4 | + Name, Load, Str, |
| 5 | + ) |
| 6 | +import ast |
| 7 | +import copy |
| 8 | + |
| 9 | +import bcolz |
| 10 | + |
# Public API of this module: the ready-made transformer list followed by the
# transformer classes themselves.
__all__ = [
    'standard_transformers',
    'QueryTransformer',
    'InOperatorTransformer',
    'CachedFactorOptimizer',
    'TrivialBooleanExpressionsOptimizer',
]
| 17 | + |
class QueryTransformer(NodeTransformer):
    """Base class for query rewriters built on :class:`ast.NodeTransformer`.

    A transformer walks the abstract syntax tree of a query and may replace
    or remove nodes on the way. While a tree is being processed, visitor
    methods have access to two attributes:

    * :attr:`self.user_dict` -- the user-provided dictionary of the variables
      in the expression that is passed to `bcolz.eval()`; it can be accessed
      and modified.
    * :attr:`self.ctable_` -- the calling :class:`ctable` instance.

    Visitor methods are named ``'visit_'`` + the class name of the node, so
    a `TryFinally` node is handled by `visit_TryFinally`. Nodes without a
    dedicated visitor are handled by `generic_visit`.

    The return value of a visitor decides what happens to the visited node:

    * ``None`` removes the node from its location,
    * the original node leaves the tree unchanged,
    * any other node replaces the original one.

    Visitors of statement nodes (nodes that are part of a collection of
    statements) may also return a list of nodes instead of a single node.

    A visitor for a node that has child nodes must either transform the
    children itself or call :meth:`generic_visit` on the node first.

    Typical usage::

        node = QueryTransformer().apply(ctable_, node, user_dict)
    """

    def apply(self, ctable_, node, user_dict):
        """Run this transformer over `node` and return the resulting tree.

        `user_dict` and `ctable_` are stored on the instance before the walk
        starts so that visitor methods can reach them.
        """
        self.user_dict, self.ctable_ = user_dict, ctable_
        transformed = self.visit(node)
        return transformed
| 54 | + |
| 55 | + |
class InOperatorTransformer(QueryTransformer):
    """A :class:`QueryTransformer` that converts comparisons with `in` and
    `not in` operators into expressions using `==` and `!=`.

    Example:
        `my_col in ['ABC', 'DEF']` is transformed into
        `(my_col == 'ABC') | (my_col == 'DEF')`

        `my_col not in ['ABC', 'DEF']` is transformed into
        `(my_col != 'ABC') & (my_col != 'DEF')`

    This is useful as Numexpr currently does not support `in` operators.

    Only simple comparisons against a literal collection (a node exposing
    its members as ``elts``) are rewritten. Chained comparisons and
    comparisons against anything else are returned unchanged.
    """

    def visit_Compare(self, node):
        """Rewrite a single `in` / `not in` comparison.

        Returns the (possibly already transformed) `node` itself when it is
        not a rewritable membership test, a ``Name`` node ``False``/``True``
        for `in` / `not in` against an empty collection, and otherwise a
        chain of `==` / `!=` comparisons joined by `|` / `&`.
        """
        # first transform all child nodes if necessary
        node = self.generic_visit(node)

        # A chained comparison (e.g. ``a in [1, 2] == b``) cannot be expanded
        # by looking at its first operator only; leave it untouched rather
        # than silently dropping the remaining operators.
        if len(node.ops) != 1 or not isinstance(node.ops[0], (In, NotIn)):
            return node

        # Only literal collections expose their members as ``elts``; for
        # anything else (e.g. ``a in some_variable``) nothing can be expanded.
        elements = getattr(node.comparators[0], 'elts', None)
        if elements is None:
            return node

        # membership in an empty collection is trivially decided
        if not elements:
            trivial = 'False' if isinstance(node.ops[0], In) else 'True'
            return Name(id=trivial, ctx=Load())

        compare_op, binop_op = self.get_operators(node.ops[0])
        # rewrite the first element in the collection using `==` / `!=`
        eq_expr = Compare(
            left=node.left,
            ops=[compare_op],
            comparators=[elements[0]])
        # join similar comparisons for all other elements using the
        # appropriate binary operator, i.e. | or &
        for element in elements[1:]:
            eq_expr = BinOp(
                left=eq_expr,
                op=binop_op,
                right=Compare(
                    # deep copy so that later in-place rewrites of one
                    # comparison cannot leak into its siblings
                    left=copy.deepcopy(node.left),
                    ops=[compare_op],
                    comparators=[element],
                ),
            )
        return eq_expr

    def get_operators(self, op):
        """Return the ``(comparison, joining)`` operator nodes for `op`.

        `in` maps to ``==`` joined by ``|``; any other operator (in practice
        `not in`) maps to ``!=`` joined by ``&``.
        """
        if isinstance(op, In):
            return Eq(), BitOr()
        return NotEq(), BitAnd()
| 105 | + |
| 106 | + |
class CachedFactorOptimizer(QueryTransformer):
    """A :class:`QueryTransformer` that converts comparisons containing
    columns with cached factors into comparisons using the factor instead.

    This potentially speeds up queries significantly:
        - By detecting queries that will not return any values without
          scanning the entire column.
        - By evaluating the comparison on the integer typed factor rather than
          a column of a datatype that is more costly to compare, e.g. String.

    The `CachedFactorOptimizer` should be followed by the
    :class:`TrivialBooleanExpressionsOptimizer` to obtain the full benefit.
    """

    # attribute carrying the raw value on the literal node classes that
    # predate ``ast.Constant``
    _LEGACY_LITERAL_FIELDS = {'Str': 's', 'Num': 'n'}

    @classmethod
    def _literal_field(cls, val):
        """Return the attribute of `val` holding its literal value.

        Returns ``None`` when `val` is not a string/number literal node.
        """
        # ``ast.Constant`` does not exist on older interpreters; an empty
        # tuple makes ``isinstance`` evaluate to False there.
        if isinstance(val, getattr(ast, 'Constant', ())):
            return 'value'
        return cls._LEGACY_LITERAL_FIELDS.get(type(val).__name__)

    def visit_Compare(self, node):
        """Rewrite ``column == literal`` / ``column != literal`` comparisons.

        The comparison is returned unchanged unless it is a simple (non
        chained) `==` / `!=` between a name and a literal and
        ``self.ctable_.cache_valid`` reports a valid cache for that name.

        Otherwise the literal is looked up in the column's cached values:

        * not found: the comparison is replaced by a ``Name`` node ``False``
          (for `==`) or ``True`` (for `!=`);
        * found: `node` is modified in place so that the name refers to
          ``bquery_factors_<column>`` and the literal becomes the index of
          the value; the factor carray is added to ``self.user_dict`` under
          that name if it is not there yet.
        """
        # first transform all child nodes if necessary
        node = self.generic_visit(node)

        # check we have a simple comparison
        if len(node.comparators) != 1 or len(node.ops) != 1:
            return node

        # TODO: we currently do not sort the values of the cached
        # factors. Therefore we cannot optimize inequalities
        if not isinstance(node.ops[0], (Eq, NotEq)):
            return node

        if isinstance(node.left, Name):
            # col_name == 'value'
            var, val = node.left, node.comparators[0]
        elif isinstance(node.comparators[0], Name):
            # 'value' == col_name
            var, val = node.comparators[0], node.left
        else:
            # we can only accelerate expressions that contain at least one
            # column reference
            return node

        # the other operand has to be a literal whose value can be looked up
        # (this rules out e.g. ``col_a == col_b``)
        val_field = self._literal_field(val)
        if val_field is None:
            return node

        col = var.id
        if not self.ctable_.cache_valid(col):
            return node

        # find factor id for requested value
        col_rootdir = self.ctable_[col].rootdir
        carray_values = bcolz.carray(rootdir=col_rootdir + '.values',
                                     mode='r')
        wanted = getattr(val, val_field)
        idx = next(
            (index for index, value in enumerate(carray_values.iter())
             if value == wanted),
            None)

        # value not in cached factorisation
        if idx is None:
            trivial = 'False' if isinstance(node.ops[0], Eq) else 'True'
            return Name(id=trivial, ctx=Load())

        # found value in cached factorisation:
        # rewrite the comparison expression
        factor_name = 'bquery_factors_%s' % col
        setattr(val, val_field, idx)
        var.id = factor_name
        # load the factor for later use
        if factor_name not in self.user_dict:
            self.user_dict[factor_name] = bcolz.carray(
                rootdir=col_rootdir + '.factor', mode='r')
        return node
| 177 | + |
| 178 | + |
class TrivialBooleanExpressionsOptimizer(QueryTransformer):
    """A :class:`QueryTransformer` that simplifies boolean expressions
    containing subparts that are trivial boolean expressions.

    Examples:
        `(my_col == 'ABC') & (False)` is transformed into
        `False`

        `(my_col == 'ABC') | (False)` is transformed into
        `my_col == 'ABC'`

    This way a query that can be logically determined to never return any
    entries is reduced to a trivial expression instead of being explicitly
    evaluated against the database.

    Trivial boolean expressions are recognised as ``Name`` nodes whose id is
    ``'True'`` or ``'False'``.
    """

    def visit_BinOp(self, node):
        """Simplify `&` / `|` expressions that have a trivial operand.

        For `|`, ``True`` absorbs the other operand and ``False`` is dropped;
        for `&`, ``False`` absorbs the other operand and ``True`` is dropped.
        Any other expression is returned unchanged (after its children have
        been visited).
        """
        # first transform all child nodes if necessary
        node = self.generic_visit(node)

        # only optimize & and | expressions
        if not isinstance(node.op, (BitOr, BitAnd)):
            return node

        # the constant that decides the whole expression on its own
        absorbing = 'True' if isinstance(node.op, BitOr) else 'False'

        # Inspect both operands: a Name on the left may be an ordinary
        # variable while the trivial boolean sits on the right.
        operand_pairs = ((node.left, node.right), (node.right, node.left))
        for operand, other_operand in operand_pairs:
            if isinstance(operand, Name) and operand.id in ('True', 'False'):
                if operand.id == absorbing:
                    return operand
                return other_operand

        # no trivial boolean operand, nothing to simplify
        return node
| 223 | + |
# Convenience preset: a ready-to-use list of transformer instances, in the
# order given here, for callers that do not want to configure their own.
standard_transformers = [
    InOperatorTransformer(),
    TrivialBooleanExpressionsOptimizer(),
]
0 commit comments