Skip to content

Commit abde142

Browse files
robbie-cCopilot
andauthored
fix: Attempt to fix bloom filter usage for property map IN clauses (#42301)
Co-authored-by: Copilot <[email protected]>
1 parent e49e8d7 commit abde142

File tree

2 files changed

+218
-61
lines changed

2 files changed

+218
-61
lines changed

posthog/hogql/printer.py

Lines changed: 43 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -724,6 +724,43 @@ def visit_lambda(self, node: ast.Lambda):
724724
def visit_order_expr(self, node: ast.OrderExpr):
725725
return f"{self.visit(node.expr)} {node.order}"
726726

727+
def __optimize_in_with_string_values(
728+
self, values: list[ast.Expr], property_source: PrintableMaterializedPropertyGroupItem
729+
) -> str | None:
730+
"""
731+
Optimizes an IN comparison against a list of values for property group bloom filter usage.
732+
Returns the optimized expression string, or None if optimization is not possible.
733+
"""
734+
# Bail on the optimisation if any value is not a Constant, is the empty string, is NULL, or is not a string
735+
for v in values:
736+
if not isinstance(v, ast.Constant):
737+
return None
738+
if v.value == "" or v.value is None or not isinstance(v.value, str):
739+
return None
740+
741+
# IN with an empty set of values is always false
742+
if len(values) == 0:
743+
return "0"
744+
745+
# A problem we run into here is that an expression like
746+
# in(events.properties_group_feature_flags['$feature/onboarding-use-case-selection'], ('control', 'test'))
747+
# does not hit the bloom filter on the key, so we need to modify the expression so that it does
748+
749+
# If only one value, switch to equality operator. Expressions like this will hit the bloom filter for both keys and values:
750+
# events.properties_group_feature_flags['$feature/onboarding-use-case-selection'] = 'control'
751+
if len(values) == 1:
752+
return f"equals({property_source.value_expr}, {self.visit(values[0])})"
753+
754+
# With transform_null_in=1 in SETTINGS (which we have by default), if there are several values, we need to
755+
# include a check for whether the key exists to hit the keys bloom filter.
756+
# Unlike the version WITHOUT mapKeys above, the following expression WILL hit the bloom filter:
757+
# and(has(mapKeys(properties_group_feature_flags), '$feature/onboarding-use-case-selection'),
758+
# in(events.properties_group_feature_flags['$feature/onboarding-use-case-selection'], ('control', 'test')))
759+
# Note that we could add a mapValues to this to use the values bloom filter
760+
# TODO to profile whether we should add mapValues. Probably no for flags, yes for properties.
761+
values_tuple = ", ".join(self.visit(v) for v in values)
762+
return f"and({property_source.has_expr}, in({property_source.value_expr}, tuple({values_tuple})))"
763+
727764
def __get_optimized_property_group_compare_operation(self, node: ast.CompareOperation) -> str | None:
728765
"""
729766
Returns a printed expression corresponding to the provided compare operation, if one of the operands is part of
@@ -816,37 +853,15 @@ def __get_optimized_property_group_compare_operation(self, node: ast.CompareOper
816853

817854
if isinstance(node.right, ast.Constant):
818855
if node.right.value is None:
819-
return "0"
820-
elif node.right.value == "":
856+
# we can't optimize here, as the unoptimized version returns true if the key doesn't exist OR the value is null
857+
return None
858+
if node.right.value == "":
821859
# If the RHS is the empty string, we need to disambiguate it from the default value for missing keys.
822860
return f"and({property_source.has_expr}, equals({property_source.value_expr}, {self.visit(node.right)}))"
823861
elif isinstance(node.right.type, ast.StringType):
824-
return f"in({property_source.value_expr}, {self.visit(node.right)})"
825-
elif isinstance(node.right, ast.Tuple):
826-
# If any of the values on the RHS are the empty string, we need to disambiguate it from the default
827-
# value for missing keys. NULLs should also be dropped, but everything else we can directly compare
828-
# (strings) can be passed through as-is
829-
default_value_expr: ast.Constant | None = None
830-
for expr in node.right.exprs[:]:
831-
if not isinstance(expr, ast.Constant):
832-
return None # only optimize constants for now, see above
833-
if expr.value is None:
834-
node.right.exprs.remove(expr)
835-
elif expr.value == "":
836-
default_value_expr = expr
837-
node.right.exprs.remove(expr)
838-
elif not isinstance(expr.type, ast.StringType):
839-
return None
840-
if len(node.right.exprs) > 0:
841-
# TODO: Check to see if it'd be faster to do equality comparison here instead?
842-
printed_expr = f"in({property_source.value_expr}, {self.visit(node.right)})"
843-
if default_value_expr is not None:
844-
printed_expr = f"or({printed_expr}, and({property_source.has_expr}, equals({property_source.value_expr}, {self.visit(default_value_expr)})))"
845-
elif default_value_expr is not None:
846-
printed_expr = f"and({property_source.has_expr}, equals({property_source.value_expr}, {self.visit(default_value_expr)}))"
847-
else:
848-
printed_expr = "0"
849-
return printed_expr
862+
return f"equals({property_source.value_expr}, {self.visit(node.right)})"
863+
elif isinstance(node.right, ast.Tuple) or isinstance(node.right, ast.Array):
864+
return self.__optimize_in_with_string_values(node.right.exprs, property_source)
850865
else:
851866
# TODO: Alias types are not resolved here (similarly to equality operations above) so some expressions
852867
# are not optimized that possibly could be if we took that additional step to determine whether or not

0 commit comments

Comments
 (0)