@@ -724,6 +724,43 @@ def visit_lambda(self, node: ast.Lambda):
724724 def visit_order_expr (self , node : ast .OrderExpr ):
725725 return f"{ self .visit (node .expr )} { node .order } "
726726
727+ def __optimize_in_with_string_values (
728+ self , values : list [ast .Expr ], property_source : PrintableMaterializedPropertyGroupItem
729+ ) -> str | None :
730+ """
731+ Optimizes an IN comparison against a list of values for property group bloom filter usage.
732+ Returns the optimized expression string, or None if optimization is not possible.
733+ """
734+ # Bail on the optimisation if any value is not a Constant, is the empty string, is NULL, or is not a string
735+ for v in values :
736+ if not isinstance (v , ast .Constant ):
737+ return None
738+ if v .value == "" or v .value is None or not isinstance (v .value , str ):
739+ return None
740+
741+ # IN with an empty set of values is always false
742+ if len (values ) == 0 :
743+ return "0"
744+
745+ # A problem we run into here is that an expression like
746+ # in(events.properties_group_feature_flags['$feature/onboarding-use-case-selection'], ('control', 'test'))
747+ # does not hit the bloom filter on the key, so we need to modify the expression so that it does
748+
749+ # If only one value, switch to equality operator. Expressions like this will hit the bloom filter for both keys and values:
750+ # events.properties_group_feature_flags['$feature/onboarding-use-case-selection'] = 'control'
751+ if len (values ) == 1 :
752+ return f"equals({ property_source .value_expr } , { self .visit (values [0 ])} )"
753+
754+ # With transform_null_in=1 in SETTINGS (which we have by default), if there are several values, we need to
755+ # include a check for whether the key exists to hit the keys bloom filter.
756+ # Unlike the version WITHOUT mapKeys above, the following expression WILL hit the bloom filter:
757+ # and(has(mapKeys(properties_group_feature_flags), '$feature/onboarding-use-case-selection'),
758+ # in(events.properties_group_feature_flags['$feature/onboarding-use-case-selection'], ('control', 'test')))
759+ # Note that we could add a mapValues to this to use the values bloom filter
760+ # TODO to profile whether we should add mapValues. Probably no for flags, yes for properties.
761+ values_tuple = ", " .join (self .visit (v ) for v in values )
762+ return f"and({ property_source .has_expr } , in({ property_source .value_expr } , tuple({ values_tuple } )))"
763+
727764 def __get_optimized_property_group_compare_operation (self , node : ast .CompareOperation ) -> str | None :
728765 """
729766 Returns a printed expression corresponding to the provided compare operation, if one of the operands is part of
@@ -816,37 +853,15 @@ def __get_optimized_property_group_compare_operation(self, node: ast.CompareOper
816853
817854 if isinstance (node .right , ast .Constant ):
818855 if node .right .value is None :
819- return "0"
820- elif node .right .value == "" :
856+ # we can't optimize here, as the unoptimized version returns true if the key doesn't exist OR the value is null
857+ return None
858+ if node .right .value == "" :
821859 # If the RHS is the empty string, we need to disambiguate it from the default value for missing keys.
822860 return f"and({ property_source .has_expr } , equals({ property_source .value_expr } , { self .visit (node .right )} ))"
823861 elif isinstance (node .right .type , ast .StringType ):
824- return f"in({ property_source .value_expr } , { self .visit (node .right )} )"
825- elif isinstance (node .right , ast .Tuple ):
826- # If any of the values on the RHS are the empty string, we need to disambiguate it from the default
827- # value for missing keys. NULLs should also be dropped, but everything else we can directly compare
828- # (strings) can be passed through as-is
829- default_value_expr : ast .Constant | None = None
830- for expr in node .right .exprs [:]:
831- if not isinstance (expr , ast .Constant ):
832- return None # only optimize constants for now, see above
833- if expr .value is None :
834- node .right .exprs .remove (expr )
835- elif expr .value == "" :
836- default_value_expr = expr
837- node .right .exprs .remove (expr )
838- elif not isinstance (expr .type , ast .StringType ):
839- return None
840- if len (node .right .exprs ) > 0 :
841- # TODO: Check to see if it'd be faster to do equality comparison here instead?
842- printed_expr = f"in({ property_source .value_expr } , { self .visit (node .right )} )"
843- if default_value_expr is not None :
844- printed_expr = f"or({ printed_expr } , and({ property_source .has_expr } , equals({ property_source .value_expr } , { self .visit (default_value_expr )} )))"
845- elif default_value_expr is not None :
846- printed_expr = f"and({ property_source .has_expr } , equals({ property_source .value_expr } , { self .visit (default_value_expr )} ))"
847- else :
848- printed_expr = "0"
849- return printed_expr
862+ return f"equals({ property_source .value_expr } , { self .visit (node .right )} )"
863+ elif isinstance (node .right , ast .Tuple ) or isinstance (node .right , ast .Array ):
864+ return self .__optimize_in_with_string_values (node .right .exprs , property_source )
850865 else :
851866 # TODO: Alias types are not resolved here (similarly to equality operations above) so some expressions
852867 # are not optimized that possibly could be if we took that additional step to determine whether or not
0 commit comments