Skip to content

Commit 8e73844

Browse files
pepijnve and alamb authored
#17838 Rewrite regexp_like calls as ~ and ~* operator expressions when possible (#17839)
* #17838 Add simplify implementation for regexp_like that rewrites as operator expressions when possible
* Avoid cloning Exprs
* Tweak code comments
* Add some more sqllogictests

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent d76a1e0 commit 8e73844

File tree

3 files changed

+131
-3
lines changed

3 files changed

+131
-3
lines changed

datafusion/functions/src/regex/regexplike.rs

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,14 @@ use datafusion_common::{
2727
ScalarValue,
2828
};
2929
use datafusion_expr::{
30-
Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature,
31-
TypeSignatureClass, Volatility,
30+
binary_expr, cast, Coercion, ColumnarValue, Documentation, Expr, ScalarUDFImpl,
31+
Signature, TypeSignature, TypeSignatureClass, Volatility,
3232
};
3333
use datafusion_macros::user_doc;
3434

35+
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
36+
use datafusion_expr_common::operator::Operator;
37+
use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer;
3538
use std::any::Any;
3639
use std::sync::Arc;
3740

@@ -153,11 +156,76 @@ impl ScalarUDFImpl for RegexpLikeFunc {
153156
}
154157
}
155158

159+
fn simplify(
160+
&self,
161+
mut args: Vec<Expr>,
162+
info: &dyn SimplifyInfo,
163+
) -> Result<ExprSimplifyResult> {
164+
// Try to simplify regexp_like usage to one of the builtin operators since those have
165+
// optimized code paths for the case where the regular expression pattern is a scalar.
166+
// Additionally, the expression simplification optimization pass will attempt to further
167+
// simplify regular expression patterns used in operator expressions.
168+
let Some(op) = derive_operator(&args) else {
169+
return Ok(ExprSimplifyResult::Original(args));
170+
};
171+
172+
let string_type = info.get_data_type(&args[0])?;
173+
let regexp_type = info.get_data_type(&args[1])?;
174+
let binary_type_coercer = BinaryTypeCoercer::new(&string_type, &op, &regexp_type);
175+
let Ok((coerced_string_type, coerced_regexp_type)) =
176+
binary_type_coercer.get_input_types()
177+
else {
178+
return Ok(ExprSimplifyResult::Original(args));
179+
};
180+
181+
// regexp_like(str, regexp [, flags])
182+
let regexp = args.swap_remove(1);
183+
let string = args.swap_remove(0);
184+
185+
Ok(ExprSimplifyResult::Simplified(binary_expr(
186+
if string_type != coerced_string_type {
187+
cast(string, coerced_string_type)
188+
} else {
189+
string
190+
},
191+
op,
192+
if regexp_type != coerced_regexp_type {
193+
cast(regexp, coerced_regexp_type)
194+
} else {
195+
regexp
196+
},
197+
)))
198+
}
199+
156200
/// Returns the documentation for this function by delegating to `self.doc()`.
// NOTE(review): `doc()` is not defined in the visible code; presumably it is
// generated by the `user_doc` macro imported above — confirm.
fn documentation(&self) -> Option<&Documentation> {
157201
self.doc()
158202
}
159203
}
160204

205+
fn derive_operator(args: &[Expr]) -> Option<Operator> {
206+
match args.len() {
207+
// regexp_like(str, regexp, flags)
208+
3 => {
209+
match &args[2] {
210+
Expr::Literal(ScalarValue::Utf8(Some(flags)), _) => {
211+
match flags.as_str() {
212+
"i" => Some(Operator::RegexIMatch),
213+
"" => Some(Operator::RegexMatch),
214+
// Any flags besides 'i' have no operator equivalent
215+
_ => None,
216+
}
217+
}
218+
// `flags` is not a literal, so we can't derive the correct operator statically
219+
_ => None,
220+
}
221+
}
222+
// regexp_like(str, regexp)
223+
2 => Some(Operator::RegexMatch),
224+
// Should never happen, but just in case
225+
_ => None,
226+
}
227+
}
228+
161229
/// Tests a string using a regular expression returning true if at
162230
/// least one match, false otherwise.
163231
///

datafusion/sqllogictest/test_files/regexp/regexp_like.slt

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,3 +277,63 @@ drop table strings
277277

278278
statement ok
279279
drop table dict_table
280+
281+
# Ensure that regexp_like is rewritten to use the (more optimized) regex operators
282+
statement ok
283+
create table regexp_test as values
284+
('foobar', 'i'),
285+
('Foo', 'i'),
286+
('bar', 'mi') ;
287+
288+
# Expressions that can be rewritten to use the ~ operator (which is more optimized)
289+
# (expect the plans to use the ~ / ~* operators, not the REGEXP_LIKE function)
290+
query TT
291+
explain select
292+
regexp_like(column1, 'fo.*'),
293+
regexp_like(column1, 'fo.*', 'i'),
294+
from regexp_test;
295+
----
296+
logical_plan
297+
01)Projection: regexp_test.column1 ~ Utf8("fo.*") AS regexp_like(regexp_test.column1,Utf8("fo.*")), regexp_test.column1 ~* Utf8("fo.*") AS regexp_like(regexp_test.column1,Utf8("fo.*"),Utf8("i"))
298+
02)--TableScan: regexp_test projection=[column1]
299+
physical_plan
300+
01)ProjectionExec: expr=[column1@0 ~ fo.* as regexp_like(regexp_test.column1,Utf8("fo.*")), column1@0 ~* fo.* as regexp_like(regexp_test.column1,Utf8("fo.*"),Utf8("i"))]
301+
02)--DataSourceExec: partitions=1, partition_sizes=[1]
302+
303+
query BB
304+
select
305+
regexp_like(column1, 'fo.*'),
306+
regexp_like(column1, 'fo.*', 'i'),
307+
from regexp_test;
308+
----
309+
true true
310+
false true
311+
false false
312+
313+
# Expressions that can not be rewritten to use the ~ / ~* operators
314+
# (expect the plans to use the REGEXP_LIKE function)
315+
query TT
316+
explain select
317+
regexp_like(column1, 'f.*r', 'mi'), -- args
318+
regexp_like(column1, 'f.*r', column2) -- non scalar flags
319+
from regexp_test;
320+
----
321+
logical_plan
322+
01)Projection: regexp_like(regexp_test.column1, Utf8("f.*r"), Utf8("mi")), regexp_like(regexp_test.column1, Utf8("f.*r"), regexp_test.column2)
323+
02)--TableScan: regexp_test projection=[column1, column2]
324+
physical_plan
325+
01)ProjectionExec: expr=[regexp_like(column1@0, f.*r, mi) as regexp_like(regexp_test.column1,Utf8("f.*r"),Utf8("mi")), regexp_like(column1@0, f.*r, column2@1) as regexp_like(regexp_test.column1,Utf8("f.*r"),regexp_test.column2)]
326+
02)--DataSourceExec: partitions=1, partition_sizes=[1]
327+
328+
query BB
329+
select
330+
regexp_like(column1, 'f.*r', 'mi'), -- args
331+
regexp_like(column1, 'f.*r', column2) -- non scalar flags
332+
from regexp_test;
333+
----
334+
true true
335+
false false
336+
false false
337+
338+
statement ok
339+
drop table if exists regexp_test;

datafusion/sqllogictest/test_files/string/string_view.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -784,7 +784,7 @@ EXPLAIN SELECT
784784
FROM test;
785785
----
786786
logical_plan
787-
01)Projection: regexp_like(test.column1_utf8view, Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k
787+
01)Projection: test.column1_utf8view ~ Utf8View("^https?://(?:www\.)?([^/]+)/.*$") AS k
788788
02)--TableScan: test projection=[column1_utf8view]
789789

790790
## Ensure no casts for REGEXP_MATCH

0 commit comments

Comments
 (0)