Skip to content

Commit aab2010

Browse files
authored
feat: Strings with Unicode Escapes (#140)
1 parent 824a1a3 commit aab2010

File tree

7 files changed

+91
-5
lines changed

7 files changed

+91
-5
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion-cli/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/common/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,4 @@ cranelift-module = { version = "0.82.0", optional = true }
4444
ordered-float = "2.10"
4545
parquet = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "9f2e2862f3f5e5efb1f83364b3ac8492f776a92d", features = ["arrow"], optional = true }
4646
pyo3 = { version = "0.16", optional = true }
47-
sqlparser = { git = 'https://github.com/cube-js/sqlparser-rs.git', rev = "dca8d9081fd8ad7c01302c112d204edc74e67170" }
47+
sqlparser = { git = 'https://github.com/cube-js/sqlparser-rs.git', rev = "b7f265a4590049f8274cef8411f63eddb5b4bf87" }

datafusion/core/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ pin-project-lite= "^0.2.7"
7979
pyo3 = { version = "0.16", optional = true }
8080
rand = "0.8"
8181
smallvec = { version = "1.6", features = ["union"] }
82-
sqlparser = { git = 'https://github.com/cube-js/sqlparser-rs.git', rev = "dca8d9081fd8ad7c01302c112d204edc74e67170" }
82+
sqlparser = { git = 'https://github.com/cube-js/sqlparser-rs.git', rev = "b7f265a4590049f8274cef8411f63eddb5b4bf87" }
8383
tempfile = "3"
8484
tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] }
8585
tokio-stream = "0.1"

datafusion/core/src/sql/planner.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1706,6 +1706,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
17061706
SQLExpr::Value(Value::Number(n, _)) => parse_sql_number(&n),
17071707
SQLExpr::Value(Value::SingleQuotedString(ref s)) => Ok(lit(s.clone())),
17081708
SQLExpr::Value(Value::EscapedStringLiteral(ref s)) => Ok(lit(s.clone())),
1709+
SQLExpr::Value(Value::UnicodeEscapedStringLiteral(ref s)) => parse_unicode_escaped_string(s, '\\'),
17091710
SQLExpr::Value(Value::Boolean(n)) => Ok(lit(n)),
17101711
SQLExpr::Value(Value::Null) => Ok(Expr::Literal(ScalarValue::Null)),
17111712
SQLExpr::Extract { field, expr } => Ok(Expr::ScalarFunction {
@@ -2859,6 +2860,53 @@ fn parse_sql_number(n: &str) -> Result<Expr> {
28592860
}
28602861
}
28612862

2863+
fn parse_unicode_escaped_string(s: &str, delimiter: char) -> Result<Expr> {
2864+
let mut result = String::new();
2865+
let mut chars = s.char_indices().peekable();
2866+
while let Some((i, c)) = chars.next() {
2867+
if c == delimiter {
2868+
if let Some((_, next)) = chars.peek() {
2869+
if next == &delimiter {
2870+
result.push(delimiter);
2871+
chars.next();
2872+
} else {
2873+
let (parsed, len) =
2874+
parse_unicode_escaped_point(&s[i + 1..], delimiter)?;
2875+
result.push(parsed);
2876+
chars.nth(len - 1);
2877+
}
2878+
} else {
2879+
return Err(invalid_unicode_escape_error(s, delimiter));
2880+
}
2881+
} else {
2882+
result.push(c)
2883+
}
2884+
}
2885+
Ok(lit(result))
2886+
}
2887+
2888+
fn parse_unicode_escaped_point(s: &str, delimiter: char) -> Result<(char, usize)> {
2889+
let (point_start, point_end) = if s.starts_with('+') { (1, 7) } else { (0, 4) };
2890+
if point_end <= s.len() {
2891+
let byte = u32::from_str_radix(&s[point_start..point_end], 16)
2892+
.map_err(|_| invalid_unicode_escape_error(s, delimiter))?;
2893+
if let Some(c) = char::from_u32(byte) {
2894+
Ok((c, point_end))
2895+
} else {
2896+
Err(invalid_unicode_escape_error(s, delimiter))
2897+
}
2898+
} else {
2899+
Err(invalid_unicode_escape_error(s, delimiter))
2900+
}
2901+
}
2902+
2903+
fn invalid_unicode_escape_error(s: &str, delimiter: char) -> DataFusionError {
2904+
DataFusionError::SQL(ParserError(format!(
2905+
"Invalid Unicode escape in {}. Unicode escapes must be {}XXXX or {}+XXXXXX",
2906+
s, delimiter, delimiter,
2907+
)))
2908+
}
2909+
28622910
#[cfg(test)]
28632911
mod tests {
28642912
use crate::datasource::empty::EmptyTable;
@@ -2867,6 +2915,36 @@ mod tests {
28672915

28682916
use super::*;
28692917

2918+
/// Exercises `parse_unicode_escaped_string` with valid and invalid inputs.
///
/// Valid cases are listed as `(input, delimiter, expected)`; invalid cases as
/// `(input, delimiter)` and must produce an error.
#[test]
fn test_parse_unicode_escaped_string() {
    // Wraps the expected text the same way the parser's result is wrapped.
    fn utf8(expected: &str) -> Expr {
        Expr::Literal(ScalarValue::Utf8(Some(expected.to_string())))
    }

    let valid = [
        // No escapes at all.
        ("pppp", '\\', "pppp"),
        ("", '\\', ""),
        // Four-digit and six-digit forms.
        ("d\\0061t\\+000061", '\\', "data"),
        // A doubled delimiter is a literal delimiter.
        ("d\\0061\\\\t\\+000061", '\\', "da\\ta"),
        // A custom delimiter leaves backslashes untouched.
        ("d!0061t\\!+000061\\", '!', "dat\\a\\"),
        ("!!d!0061!!t\\!+000061\\", '!', "!da!t\\a\\"),
        // Non-ASCII text ahead of an escape is passed through.
        ("h\u{e9}llo\\0061", '\\', "h\u{e9}lloa"),
    ];
    for (input, delimiter, expected) in valid.iter() {
        assert_eq!(
            parse_unicode_escaped_string(input, *delimiter).unwrap(),
            utf8(expected),
            "input: {:?}, delimiter: {:?}",
            input,
            delimiter
        );
    }

    let invalid = [
        // Trailing lone delimiter.
        ("d\\0061t\\+000061\\", '\\'),
        // Too few digits after `+`.
        ("d\\0061t\\+061", '\\'),
        // Non-hexadecimal digit.
        ("d\\H061t\\+061", '\\'),
        // Surrogate code point is not a valid character.
        ("\\D800", '\\'),
    ];
    for (input, delimiter) in invalid.iter() {
        assert!(
            parse_unicode_escaped_string(input, *delimiter).is_err(),
            "input: {:?}, delimiter: {:?}",
            input,
            delimiter
        );
    }
}
28702948
#[test]
28712949
fn select_no_relation() {
28722950
quick_test(

datafusion/core/tests/sql/expr.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -721,6 +721,14 @@ async fn test_like() -> Result<()> {
721721
Ok(())
722722
}
723723

724+
#[tokio::test]
725+
async fn test_unicode_escaped() -> Result<()> {
726+
test_expression!("U&'pppp'", "pppp");
727+
test_expression!("U&'d\\0061t\\+000061\'", "data");
728+
729+
Ok(())
730+
}
731+
724732
#[tokio::test]
725733
async fn test_ilike() -> Result<()> {
726734
test_expression!("'test' ILIKE '%Es%'", "true");

datafusion/expr/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@ path = "src/lib.rs"
3838
ahash = { version = "0.7", default-features = false }
3939
arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "9f2e2862f3f5e5efb1f83364b3ac8492f776a92d", features = ["prettyprint"] }
4040
datafusion-common = { path = "../common", version = "7.0.0" }
41-
sqlparser = { git = 'https://github.com/cube-js/sqlparser-rs.git', rev = "dca8d9081fd8ad7c01302c112d204edc74e67170" }
41+
sqlparser = { git = 'https://github.com/cube-js/sqlparser-rs.git', rev = "b7f265a4590049f8274cef8411f63eddb5b4bf87" }

0 commit comments

Comments
 (0)