1515// specific language governing permissions and limitations
1616// under the License.
1717
18- use datafusion_common:: { DataFusionError , Result , ScalarValue } ;
18+ use datafusion_common:: tree_node:: Transformed ;
19+ use datafusion_common:: { DataFusionError , Result } ;
1920use datafusion_expr:: { BinaryExpr , Expr , Like , Operator , lit} ;
2021use regex_syntax:: hir:: { Capture , Hir , HirKind , Literal , Look } ;
2122
23+ use crate :: simplify_expressions:: expr_simplifier:: StringScalar ;
24+
2225/// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions.
2326const MAX_REGEX_ALTERNATIONS_EXPANSION : usize = 4 ;
2427
@@ -43,52 +46,70 @@ pub fn simplify_regex_expr(
4346 left : Box < Expr > ,
4447 op : Operator ,
4548 right : Box < Expr > ,
46- ) -> Result < Expr > {
47- let mode = OperatorMode :: new ( & op) ;
49+ ) -> Result < Transformed < Expr > > {
50+ // Check if the right operand is a supported string literal
51+ let Some ( string_scalar) = StringScalar :: try_from_expr ( right. as_ref ( ) ) else {
52+ return Ok ( Transformed :: no ( Expr :: BinaryExpr ( BinaryExpr {
53+ left,
54+ op,
55+ right,
56+ } ) ) ) ;
57+ } ;
58+ let pattern = string_scalar. as_str ( ) ;
59+ let Some ( pattern) = pattern else {
60+ return Ok ( Transformed :: no ( Expr :: BinaryExpr ( BinaryExpr {
61+ left,
62+ op,
63+ right,
64+ } ) ) ) ;
65+ } ;
4866
49- if let Expr :: Literal ( ScalarValue :: Utf8 ( Some ( pattern ) ) , _ ) = right . as_ref ( ) {
50- // Handle the special case for ".*" pattern
51- if pattern == ANY_CHAR_REGEX_PATTERN {
52- let new_expr = if mode. not {
53- // not empty
54- let empty_lit = Box :: new ( lit ( "" ) ) ;
55- Expr :: BinaryExpr ( BinaryExpr {
56- left,
57- op : Operator :: Eq ,
58- right : empty_lit,
59- } )
60- } else {
61- // not null
62- left. is_not_null ( )
63- } ;
64- return Ok ( new_expr) ;
65- }
67+ let mode = OperatorMode :: new ( & op ) ;
68+ // Handle the special case for ".*" pattern
69+ if pattern == ANY_CHAR_REGEX_PATTERN {
70+ let new_expr = if mode. not {
71+ // not empty
72+ let empty_lit = Box :: new ( string_scalar . to_expr ( "" ) ) ;
73+ Expr :: BinaryExpr ( BinaryExpr {
74+ left,
75+ op : Operator :: Eq ,
76+ right : empty_lit,
77+ } )
78+ } else {
79+ // not null
80+ left. is_not_null ( )
81+ } ;
82+ return Ok ( Transformed :: yes ( new_expr) ) ;
83+ }
6684
67- match regex_syntax:: Parser :: new ( ) . parse ( pattern) {
68- Ok ( hir) => {
69- let kind = hir. kind ( ) ;
70- if let HirKind :: Alternation ( alts) = kind {
71- if alts. len ( ) <= MAX_REGEX_ALTERNATIONS_EXPANSION
72- && let Some ( expr) = lower_alt ( & mode, & left, alts)
73- {
74- return Ok ( expr) ;
75- }
76- } else if let Some ( expr) = lower_simple ( & mode, & left, & hir) {
77- return Ok ( expr) ;
85+ match regex_syntax:: Parser :: new ( ) . parse ( pattern) {
86+ Ok ( hir) => {
87+ let kind = hir. kind ( ) ;
88+ if let HirKind :: Alternation ( alts) = kind {
89+ if alts. len ( ) <= MAX_REGEX_ALTERNATIONS_EXPANSION
90+ && let Some ( expr) = lower_alt ( & mode, & left, alts, & string_scalar)
91+ {
92+ return Ok ( Transformed :: yes ( expr) ) ;
7893 }
79- }
80- Err ( e) => {
81- // error out early since the execution may fail anyways
82- return Err ( DataFusionError :: Context (
83- "Invalid regex" . to_owned ( ) ,
84- Box :: new ( DataFusionError :: External ( Box :: new ( e) ) ) ,
85- ) ) ;
94+ } else if let Some ( expr) = lower_simple ( & mode, & left, & hir, & string_scalar) {
95+ return Ok ( Transformed :: yes ( expr) ) ;
8696 }
8797 }
98+ Err ( e) => {
99+ // error out early since the execution may fail anyways
100+ return Err ( DataFusionError :: Context (
101+ "Invalid regex" . to_owned ( ) ,
102+ Box :: new ( DataFusionError :: External ( Box :: new ( e) ) ) ,
103+ ) ) ;
104+ }
88105 }
89106
90107 // Leave untouched if optimization didn't work
91- Ok ( Expr :: BinaryExpr ( BinaryExpr { left, op, right } ) )
108+ Ok ( Transformed :: no ( Expr :: BinaryExpr ( BinaryExpr {
109+ left,
110+ op,
111+ right,
112+ } ) ) )
92113}
93114
94115#[ derive( Debug ) ]
@@ -117,11 +138,11 @@ impl OperatorMode {
117138 }
118139
119140 /// Creates an [`LIKE`](Expr::Like) from the given `LIKE` pattern.
120- fn expr ( & self , expr : Box < Expr > , pattern : String ) -> Expr {
141+ fn expr ( & self , expr : Box < Expr > , pattern : Box < Expr > ) -> Expr {
121142 let like = Like {
122143 negated : self . not ,
123144 expr,
124- pattern : Box :: new ( Expr :: Literal ( ScalarValue :: from ( pattern ) , None ) ) ,
145+ pattern,
125146 escape_char : None ,
126147 case_insensitive : self . i ,
127148 } ;
@@ -311,14 +332,24 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
311332}
312333
313334/// Tries to lower (transform) a simple regex pattern to a LIKE expression.
314- fn lower_simple ( mode : & OperatorMode , left : & Expr , hir : & Hir ) -> Option < Expr > {
335+ fn lower_simple (
336+ mode : & OperatorMode ,
337+ left : & Expr ,
338+ hir : & Hir ,
339+ string_scalar : & StringScalar ,
340+ ) -> Option < Expr > {
315341 match hir. kind ( ) {
316342 HirKind :: Empty => {
317- return Some ( mode. expr ( Box :: new ( left. clone ( ) ) , "%" . to_owned ( ) ) ) ;
343+ return Some (
344+ mode. expr ( Box :: new ( left. clone ( ) ) , Box :: new ( string_scalar. to_expr ( "%" ) ) ) ,
345+ ) ;
318346 }
319347 HirKind :: Literal ( l) => {
320348 let s = like_str_from_literal ( l) ?;
321- return Some ( mode. expr ( Box :: new ( left. clone ( ) ) , format ! ( "%{s}%" ) ) ) ;
349+ return Some ( mode. expr (
350+ Box :: new ( left. clone ( ) ) ,
351+ Box :: new ( string_scalar. to_expr ( & format ! ( "%{s}%" ) ) ) ,
352+ ) ) ;
322353 }
323354 HirKind :: Concat ( inner) if is_anchored_literal ( inner) => {
324355 return anchored_literal_to_expr ( inner) . map ( |right| {
@@ -333,7 +364,10 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
333364 if let Some ( pattern) = partial_anchored_literal_to_like ( inner)
334365 . or_else ( || collect_concat_to_like_string ( inner) )
335366 {
336- return Some ( mode. expr ( Box :: new ( left. clone ( ) ) , pattern) ) ;
367+ return Some ( mode. expr (
368+ Box :: new ( left. clone ( ) ) ,
369+ Box :: new ( string_scalar. to_expr ( & pattern) ) ,
370+ ) ) ;
337371 }
338372 }
339373 _ => { }
@@ -344,11 +378,16 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
344378/// Calls [`lower_simple`] for each alternative and combines the results with `or` or `and`
345379/// based on [`OperatorMode`]. Any failed attempt to lower an alternative makes this
346380/// function return `None`.
347- fn lower_alt ( mode : & OperatorMode , left : & Expr , alts : & [ Hir ] ) -> Option < Expr > {
381+ fn lower_alt (
382+ mode : & OperatorMode ,
383+ left : & Expr ,
384+ alts : & [ Hir ] ,
385+ string_scalar : & StringScalar ,
386+ ) -> Option < Expr > {
348387 let mut accu: Option < Expr > = None ;
349388
350389 for part in alts {
351- if let Some ( expr) = lower_simple ( mode, left, part) {
390+ if let Some ( expr) = lower_simple ( mode, left, part, string_scalar ) {
352391 accu = match accu {
353392 Some ( accu) => {
354393 if mode. not {
0 commit comments