Skip to content

Commit dfb3080

Browse files
committed
Copy over changes from PR #20581
1 parent acec058 commit dfb3080

File tree

4 files changed

+146
-73
lines changed

4 files changed

+146
-73
lines changed

datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs

Lines changed: 55 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1660,17 +1660,19 @@ impl TreeNodeRewriter for Simplifier<'_> {
16601660
left,
16611661
op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch),
16621662
right,
1663-
}) => Transformed::yes(simplify_regex_expr(left, op, right)?),
1663+
}) => simplify_regex_expr(left, op, right)?,
16641664

16651665
// Rules for Like
16661666
Expr::Like(like) => {
16671667
// `\` is implicit escape, see https://github.com/apache/datafusion/issues/13291
16681668
let escape_char = like.escape_char.unwrap_or('\\');
1669-
match as_string_scalar(&like.pattern) {
1670-
Some((data_type, pattern_str)) => {
1669+
1670+
match StringScalar::try_from_expr(&like.pattern) {
1671+
Some(string_scalar) => {
1672+
let pattern_str = string_scalar.as_str();
16711673
match pattern_str {
16721674
None => return Ok(Transformed::yes(lit_bool_null())),
1673-
Some(pattern_str) if pattern_str == "%" => {
1675+
Some("%") => {
16741676
// exp LIKE '%' is
16751677
// - when exp is not NULL, it's true
16761678
// - when exp is NULL, it's NULL
@@ -1702,10 +1704,9 @@ impl TreeNodeRewriter for Simplifier<'_> {
17021704
.replace_all(pattern_str, "%")
17031705
.to_string();
17041706
Transformed::yes(Expr::Like(Like {
1705-
pattern: Box::new(to_string_scalar(
1706-
&data_type,
1707-
Some(simplified_pattern),
1708-
)),
1707+
pattern: Box::new(
1708+
string_scalar.to_expr(&simplified_pattern),
1709+
),
17091710
..like
17101711
}))
17111712
}
@@ -2126,21 +2127,54 @@ fn is_literal_or_literal_cast(expr: &Expr) -> bool {
21262127
}
21272128
}
21282129

2129-
fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> {
2130-
match expr {
2131-
Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)),
2132-
Expr::Literal(ScalarValue::LargeUtf8(s), _) => Some((DataType::LargeUtf8, s)),
2133-
Expr::Literal(ScalarValue::Utf8View(s), _) => Some((DataType::Utf8View, s)),
2134-
_ => None,
2135-
}
2130+
/// Helper for working with string scalar values (Utf8, LargeUtf8, Utf8View)
2131+
pub(crate) enum StringScalar<'a> {
2132+
Utf8(&'a ScalarValue),
2133+
LargeUtf8(&'a ScalarValue),
2134+
Utf8View(&'a ScalarValue),
21362135
}
21372136

2138-
fn to_string_scalar(data_type: &DataType, value: Option<String>) -> Expr {
2139-
match data_type {
2140-
DataType::Utf8 => Expr::Literal(ScalarValue::Utf8(value), None),
2141-
DataType::LargeUtf8 => Expr::Literal(ScalarValue::LargeUtf8(value), None),
2142-
DataType::Utf8View => Expr::Literal(ScalarValue::Utf8View(value), None),
2143-
_ => unreachable!(),
2137+
impl<'a> StringScalar<'a> {
2138+
/// Create a `StringScalar` view from an `Expr` if it is a supported string literal.
2139+
/// Returns `None` if the expression is not a string literal.
2140+
pub(crate) fn try_from_expr(expr: &'a Expr) -> Option<Self> {
2141+
match expr {
2142+
Expr::Literal(scalar, _) => Self::try_from_scalar(scalar),
2143+
_ => None,
2144+
}
2145+
}
2146+
2147+
/// Create a `StringScalar` view from a `ScalarValue` if it is a supported string type.
2148+
/// Returns `None` if the scalar value is not a supported string type.
2149+
fn try_from_scalar(scalar: &'a ScalarValue) -> Option<Self> {
2150+
match scalar {
2151+
ScalarValue::Utf8(_) => Some(Self::Utf8(scalar)),
2152+
ScalarValue::LargeUtf8(_) => Some(Self::LargeUtf8(scalar)),
2153+
ScalarValue::Utf8View(_) => Some(Self::Utf8View(scalar)),
2154+
_ => None,
2155+
}
2156+
}
2157+
2158+
/// Returns the underlying string slice.
2159+
pub(crate) fn as_str(&self) -> Option<&'a str> {
2160+
match self {
2161+
Self::Utf8(scalar) | Self::LargeUtf8(scalar) | Self::Utf8View(scalar) => {
2162+
scalar.try_as_str().flatten()
2163+
}
2164+
}
2165+
}
2166+
2167+
/// Build a new `Expr` of the same string type with the given value.
2168+
pub(crate) fn to_expr(&self, val: &str) -> Expr {
2169+
match self {
2170+
Self::Utf8(_) => Expr::Literal(ScalarValue::Utf8(Some(val.to_owned())), None),
2171+
Self::LargeUtf8(_) => {
2172+
Expr::Literal(ScalarValue::LargeUtf8(Some(val.to_owned())), None)
2173+
}
2174+
Self::Utf8View(_) => {
2175+
Expr::Literal(ScalarValue::Utf8View(Some(val.to_owned())), None)
2176+
}
2177+
}
21442178
}
21452179
}
21462180

datafusion/optimizer/src/simplify_expressions/regex.rs

Lines changed: 86 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,13 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use datafusion_common::{DataFusionError, Result, ScalarValue};
18+
use datafusion_common::tree_node::Transformed;
19+
use datafusion_common::{DataFusionError, Result};
1920
use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit};
2021
use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look};
2122

23+
use crate::simplify_expressions::expr_simplifier::StringScalar;
24+
2225
/// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions.
2326
const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;
2427

@@ -43,52 +46,70 @@ pub fn simplify_regex_expr(
4346
left: Box<Expr>,
4447
op: Operator,
4548
right: Box<Expr>,
46-
) -> Result<Expr> {
47-
let mode = OperatorMode::new(&op);
49+
) -> Result<Transformed<Expr>> {
50+
// Check if the right operand is a supported string literal
51+
let Some(string_scalar) = StringScalar::try_from_expr(right.as_ref()) else {
52+
return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
53+
left,
54+
op,
55+
right,
56+
})));
57+
};
58+
let pattern = string_scalar.as_str();
59+
let Some(pattern) = pattern else {
60+
return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
61+
left,
62+
op,
63+
right,
64+
})));
65+
};
4866

49-
if let Expr::Literal(ScalarValue::Utf8(Some(pattern)), _) = right.as_ref() {
50-
// Handle the special case for ".*" pattern
51-
if pattern == ANY_CHAR_REGEX_PATTERN {
52-
let new_expr = if mode.not {
53-
// not empty
54-
let empty_lit = Box::new(lit(""));
55-
Expr::BinaryExpr(BinaryExpr {
56-
left,
57-
op: Operator::Eq,
58-
right: empty_lit,
59-
})
60-
} else {
61-
// not null
62-
left.is_not_null()
63-
};
64-
return Ok(new_expr);
65-
}
67+
let mode = OperatorMode::new(&op);
68+
// Handle the special case for ".*" pattern
69+
if pattern == ANY_CHAR_REGEX_PATTERN {
70+
let new_expr = if mode.not {
71+
// not empty
72+
let empty_lit = Box::new(string_scalar.to_expr(""));
73+
Expr::BinaryExpr(BinaryExpr {
74+
left,
75+
op: Operator::Eq,
76+
right: empty_lit,
77+
})
78+
} else {
79+
// not null
80+
left.is_not_null()
81+
};
82+
return Ok(Transformed::yes(new_expr));
83+
}
6684

67-
match regex_syntax::Parser::new().parse(pattern) {
68-
Ok(hir) => {
69-
let kind = hir.kind();
70-
if let HirKind::Alternation(alts) = kind {
71-
if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION
72-
&& let Some(expr) = lower_alt(&mode, &left, alts)
73-
{
74-
return Ok(expr);
75-
}
76-
} else if let Some(expr) = lower_simple(&mode, &left, &hir) {
77-
return Ok(expr);
85+
match regex_syntax::Parser::new().parse(pattern) {
86+
Ok(hir) => {
87+
let kind = hir.kind();
88+
if let HirKind::Alternation(alts) = kind {
89+
if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION
90+
&& let Some(expr) = lower_alt(&mode, &left, alts, &string_scalar)
91+
{
92+
return Ok(Transformed::yes(expr));
7893
}
79-
}
80-
Err(e) => {
81-
// error out early since the execution may fail anyways
82-
return Err(DataFusionError::Context(
83-
"Invalid regex".to_owned(),
84-
Box::new(DataFusionError::External(Box::new(e))),
85-
));
94+
} else if let Some(expr) = lower_simple(&mode, &left, &hir, &string_scalar) {
95+
return Ok(Transformed::yes(expr));
8696
}
8797
}
98+
Err(e) => {
99+
// error out early since the execution may fail anyways
100+
return Err(DataFusionError::Context(
101+
"Invalid regex".to_owned(),
102+
Box::new(DataFusionError::External(Box::new(e))),
103+
));
104+
}
88105
}
89106

90107
// Leave untouched if optimization didn't work
91-
Ok(Expr::BinaryExpr(BinaryExpr { left, op, right }))
108+
Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
109+
left,
110+
op,
111+
right,
112+
})))
92113
}
93114

94115
#[derive(Debug)]
@@ -117,11 +138,11 @@ impl OperatorMode {
117138
}
118139

119140
/// Creates an [`LIKE`](Expr::Like) from the given `LIKE` pattern.
120-
fn expr(&self, expr: Box<Expr>, pattern: String) -> Expr {
141+
fn expr(&self, expr: Box<Expr>, pattern: Box<Expr>) -> Expr {
121142
let like = Like {
122143
negated: self.not,
123144
expr,
124-
pattern: Box::new(Expr::Literal(ScalarValue::from(pattern), None)),
145+
pattern,
125146
escape_char: None,
126147
case_insensitive: self.i,
127148
};
@@ -311,14 +332,24 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
311332
}
312333

313334
/// Tries to lower (transform) a simple regex pattern to a LIKE expression.
314-
fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
335+
fn lower_simple(
336+
mode: &OperatorMode,
337+
left: &Expr,
338+
hir: &Hir,
339+
string_scalar: &StringScalar,
340+
) -> Option<Expr> {
315341
match hir.kind() {
316342
HirKind::Empty => {
317-
return Some(mode.expr(Box::new(left.clone()), "%".to_owned()));
343+
return Some(
344+
mode.expr(Box::new(left.clone()), Box::new(string_scalar.to_expr("%"))),
345+
);
318346
}
319347
HirKind::Literal(l) => {
320348
let s = like_str_from_literal(l)?;
321-
return Some(mode.expr(Box::new(left.clone()), format!("%{s}%")));
349+
return Some(mode.expr(
350+
Box::new(left.clone()),
351+
Box::new(string_scalar.to_expr(&format!("%{s}%"))),
352+
));
322353
}
323354
HirKind::Concat(inner) if is_anchored_literal(inner) => {
324355
return anchored_literal_to_expr(inner).map(|right| {
@@ -333,7 +364,10 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
333364
if let Some(pattern) = partial_anchored_literal_to_like(inner)
334365
.or_else(|| collect_concat_to_like_string(inner))
335366
{
336-
return Some(mode.expr(Box::new(left.clone()), pattern));
367+
return Some(mode.expr(
368+
Box::new(left.clone()),
369+
Box::new(string_scalar.to_expr(&pattern)),
370+
));
337371
}
338372
}
339373
_ => {}
@@ -344,11 +378,16 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
344378
/// Calls [`lower_simple`] for each alternative and combine the results with `or` or `and`
345379
/// based on [`OperatorMode`]. Any fail attempt to lower an alternative will makes this
346380
/// function to return `None`.
347-
fn lower_alt(mode: &OperatorMode, left: &Expr, alts: &[Hir]) -> Option<Expr> {
381+
fn lower_alt(
382+
mode: &OperatorMode,
383+
left: &Expr,
384+
alts: &[Hir],
385+
string_scalar: &StringScalar,
386+
) -> Option<Expr> {
348387
let mut accu: Option<Expr> = None;
349388

350389
for part in alts {
351-
if let Some(expr) = lower_simple(mode, left, part) {
390+
if let Some(expr) = lower_simple(mode, left, part, string_scalar) {
352391
accu = match accu {
353392
Some(accu) => {
354393
if mode.not {

datafusion/sqllogictest/test_files/simplify_expr.slt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,20 @@ query TT
3434
explain select b from t where b ~ '.*'
3535
----
3636
logical_plan
37-
01)Filter: t.b ~ Utf8View(".*")
37+
01)Filter: t.b IS NOT NULL
3838
02)--TableScan: t projection=[b]
3939
physical_plan
40-
01)FilterExec: b@0 ~ .*
40+
01)FilterExec: b@0 IS NOT NULL
4141
02)--DataSourceExec: partitions=1, partition_sizes=[1]
4242

4343
query TT
4444
explain select b from t where b !~ '.*'
4545
----
4646
logical_plan
47-
01)Filter: t.b !~ Utf8View(".*")
47+
01)Filter: t.b = Utf8View("")
4848
02)--TableScan: t projection=[b]
4949
physical_plan
50-
01)FilterExec: b@0 !~ .*
50+
01)FilterExec: b@0 =
5151
02)--DataSourceExec: partitions=1, partition_sizes=[1]
5252

5353
query T

datafusion/sqllogictest/test_files/string/string_view.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1100,7 +1100,7 @@ EXPLAIN SELECT
11001100
FROM test;
11011101
----
11021102
logical_plan
1103-
01)Projection: test.column1_utf8view ~ Utf8View("an") AS c1
1103+
01)Projection: test.column1_utf8view LIKE Utf8View("%an%") AS c1
11041104
02)--TableScan: test projection=[column1_utf8view]
11051105

11061106
# `~*` operator (regex match case-insensitive)

0 commit comments

Comments
 (0)