Skip to content

Commit a3aa630

Browse files
committed
Implement regexp optimization for .*foo.* patterns
1 parent dfb3080 commit a3aa630

File tree

5 files changed

+113
-0
lines changed

5 files changed

+113
-0
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/optimizer/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ chrono = { workspace = true }
4949
datafusion-common = { workspace = true, default-features = true }
5050
datafusion-expr = { workspace = true }
5151
datafusion-expr-common = { workspace = true }
52+
datafusion-functions = { workspace = true }
5253
datafusion-physical-expr = { workspace = true }
5354
indexmap = { workspace = true }
5455
itertools = { workspace = true }

datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2366,6 +2366,7 @@ mod tests {
23662366
interval_arithmetic::Interval,
23672367
*,
23682368
};
2369+
use datafusion_functions::expr_fn::contains as contains_fn;
23692370
use datafusion_functions_window_common::field::WindowUDFFieldArgs;
23702371
use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
23712372
use datafusion_physical_expr::PhysicalExpr;
@@ -3381,6 +3382,19 @@ mod tests {
33813382
col("c1").like(lit("%foo%")),
33823383
);
33833384

3385+
// regular expression that matches a substring
3386+
assert_change(
3387+
regex_match(col("c1"), lit(".*foo.*")),
3388+
contains_fn(col("c1"), lit("foo")),
3389+
);
3390+
3391+
assert_change(
3392+
regex_not_match(col("c1"), lit(".*foo.*")),
3393+
Expr::Not(Box::new(contains_fn(col("c1"), lit("foo")))),
3394+
);
3395+
3396+
assert_change(regex_match(col("c1"), lit(".*.*")), col("c1").is_not_null());
3397+
33843398
// regular expressions that match an exact literal
33853399
assert_change(regex_match(col("c1"), lit("^$")), col("c1").eq(lit("")));
33863400
assert_change(

datafusion/optimizer/src/simplify_expressions/regex.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
use datafusion_common::tree_node::Transformed;
1919
use datafusion_common::{DataFusionError, Result};
2020
use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit};
21+
use datafusion_functions::expr_fn::contains;
2122
use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look};
2223

2324
use crate::simplify_expressions::expr_simplifier::StringScalar;
@@ -34,6 +35,8 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*";
3435
///
3536
/// Typical cases this function can simplify:
3637
/// - empty regex pattern to `LIKE '%'`
38+
/// - case-sensitive regex match (`~`) against `.*foo.*`, where `foo` is a plain literal, to `contains(left, "foo")`
39+
/// - case-sensitive regex not-match (`!~`) against `.*foo.*`, where `foo` is a plain literal, to `NOT contains(left, "foo")`
3740
/// - literal regex patterns to `LIKE '%foo%'`
3841
/// - full anchored regex patterns (e.g. `^foo$`) to `= 'foo'`
3942
/// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'`
@@ -82,6 +85,32 @@ pub fn simplify_regex_expr(
8285
return Ok(Transformed::yes(new_expr));
8386
}
8487

88+
// Convert patterns of the form ".*foo.*" to `contains(left, "foo")`
89+
if !mode.i
90+
&& let Some(inner) = pattern
91+
// If pattern starts and ends with ".*"
92+
.strip_prefix(ANY_CHAR_REGEX_PATTERN)
93+
.and_then(|rest| rest.strip_suffix(ANY_CHAR_REGEX_PATTERN))
94+
// If inner is all non-special characters
95+
&& inner.chars().all(|x| !is_special_character(x))
96+
{
97+
let new_expr = match (mode.not, inner.is_empty()) {
98+
// contains(left, inner)
99+
(false, false) => contains(*left, lit(inner)),
100+
(false, true) => left.is_not_null(),
101+
// not (contains(left, inner))
102+
(true, false) => Expr::Not(Box::new(contains(*left, lit(inner)))),
103+
(true, true) => {
104+
return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
105+
left,
106+
op,
107+
right,
108+
})));
109+
}
110+
};
111+
return Ok(Transformed::yes(new_expr));
112+
}
113+
85114
match regex_syntax::Parser::new().parse(pattern) {
86115
Ok(hir) => {
87116
let kind = hir.kind();
@@ -202,6 +231,25 @@ fn is_safe_for_like(c: char) -> bool {
202231
(c != '%') && (c != '_')
203232
}
204233

234+
/// Returns `true` if `c` is one of the regex metacharacters
/// `. * + ? | ( ) [ ] { } ^ $ \`.
///
/// The `.*literal.*` rewrite uses this to check that the text between the
/// leading and trailing `.*` is a plain literal: if any character of that
/// text is special, the pattern may mean more than "contains this exact
/// text" and must not be turned into a substring test.
fn is_special_character(c: char) -> bool {
    matches!(
        c,
        '.' | '*'
            | '+'
            | '?'
            | '|'
            | '('
            | ')'
            | '['
            | ']'
            | '{'
            | '}'
            | '^'
            | '$'
            | '\\'
    )
}
252+
205253
/// Returns true if the elements in a `Concat` pattern are:
206254
/// - `[Look::Start, Look::End]`
207255
/// - `[Look::Start, Literal(_), Look::End]`

datafusion/sqllogictest/test_files/simplify_expr.slt

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,56 @@ c
5858

5959
query T
6060
select b from t where b !~ '.*'
61+
62+
# test regex .*literal.* simplifies to contains()
63+
query TT
64+
explain select b from t where b ~ '.*a.*'
65+
----
66+
logical_plan
67+
01)Filter: contains(t.b, Utf8("a"))
68+
02)--TableScan: t projection=[b]
69+
physical_plan
70+
01)FilterExec: contains(b@0, a)
71+
02)--DataSourceExec: partitions=1, partition_sizes=[1]
72+
73+
query T
74+
select b from t where b ~ '.*a.*'
6175
----
76+
a
77+
78+
query TT
79+
explain select b from t where b !~ '.*a.*'
80+
----
81+
logical_plan
82+
01)Filter: NOT contains(t.b, Utf8("a"))
83+
02)--TableScan: t projection=[b]
84+
physical_plan
85+
01)FilterExec: NOT contains(b@0, a)
86+
02)--DataSourceExec: partitions=1, partition_sizes=[1]
87+
88+
query T
89+
select b from t where b !~ '.*a.*'
90+
----
91+
c
92+
93+
query TT
94+
explain select b from t where b ~ '.*.*'
95+
----
96+
logical_plan
97+
01)Filter: t.b IS NOT NULL
98+
02)--TableScan: t projection=[b]
99+
physical_plan
100+
01)FilterExec: b@0 IS NOT NULL
101+
02)--DataSourceExec: partitions=1, partition_sizes=[1]
102+
103+
query T
104+
select b from t where b ~ '.*.*'
105+
----
106+
a
107+
c
108+
109+
query T
110+
select b from t where b !~ '.*.*'
62111

63112
query TT
64113
explain select * from t where a = a;

0 commit comments

Comments
 (0)