Skip to content

Commit 0016c16

Browse files
authored
Merge branch 'main' into sqllogic-test-split
2 parents b9f274c + b8563d2 commit 0016c16

File tree

18 files changed

+299
-25
lines changed

18 files changed

+299
-25
lines changed

.github/actions/test_compat_client_cluster/action.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ runs:
2525
- name: Setup Go
2626
uses: actions/setup-go@v5
2727
with:
28-
go-version: "1.24"
28+
go-version: "1.25"
2929

3030
- name: checkout databend-jdbc
3131
uses: actions/checkout@v4

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/common/hashtable/src/README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# Hashtable implementation in rust
22

3-
4-
This package is a port from its cpp implementation from [ClickHouse](https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/HashTable.h).
3+
This package is a port from its cpp implementation from [ClickHouse].
54
It is quite difficult to implement a high-performance hashtable from scratch. After comparing many implementations of hashtables, we found that ClickHouse's hashtable is the most suitable for OLAP systems. Therefore, we rewrote it into this Rust library.

src/common/io/src/cursor_ext/cursor_read_datetime_ext.rs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,11 @@ fn get_local_time(tz: &TimeZone, d: &Date, times: &mut Vec<u32>) -> Result<Zoned
437437
if let Some(zoned) = fast_local_to_zoned(tz, d, hour, minute, second, 0) {
438438
return Ok(zoned);
439439
}
440-
tz.to_zoned(d.at(hour as i8, minute as i8, second as i8, 0))
440+
let time = Time::new(hour as i8, minute as i8, second as i8, 0)
441+
.map_err_to_code(ErrorCode::BadBytes, || {
442+
format!("Invalid time {:02}:{:02}:{:02}", hour, minute, second)
443+
})?;
444+
tz.to_zoned(d.to_datetime(time))
441445
.map_err_to_code(ErrorCode::BadBytes, || {
442446
format!("Invalid time provided in times: {:?}", times)
443447
})
@@ -490,9 +494,23 @@ fn build_zoned_from_components(
490494
if let Some(zoned) = fast_local_to_zoned(tz, date, hour, minute, second, micro) {
491495
return Ok(zoned);
492496
}
497+
let time = Time::new(hour as i8, minute as i8, second as i8, 0).map_err_to_code(
498+
ErrorCode::BadBytes,
499+
|| {
500+
format!(
501+
"Invalid local time {:04}-{:02}-{:02} {:02}:{:02}:{:02}",
502+
i32::from(date.year()),
503+
date.month(),
504+
date.day(),
505+
hour,
506+
minute,
507+
second
508+
)
509+
},
510+
)?;
493511

494512
let base = tz
495-
.to_zoned(date.at(hour as i8, minute as i8, second as i8, 0))
513+
.to_zoned(date.to_datetime(time))
496514
.map_err_to_code(ErrorCode::BadBytes, || {
497515
format!(
498516
"Invalid local time {:04}-{:02}-{:02} {:02}:{:02}:{:02}",

src/query/ast/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "databend-common-ast"
3-
version = "0.2.4"
3+
version = "0.2.5"
44
publish = true
55
description = "SQL parser for Databend"
66
authors = { workspace = true }

src/query/ast/src/ast/quote.rs

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,12 @@ use std::iter::Peekable;
1818
use std::str::FromStr;
1919

2020
use crate::parser::Dialect;
21+
use crate::parser::token::is_ident_continue;
22+
use crate::parser::token::is_ident_start;
2123

22-
// In ANSI SQL, it does not need to quote an identifier if the identifier matches
23-
// the following regular expression: [A-Za-z_][A-Za-z0-9_$]*.
24+
// An identifier does not need quoting if it matches the rules defined by
25+
// is_ident_start / is_ident_continue (ASCII letters, underscore, digits, `$`,
26+
// plus any Unicode Alphabetic character — CJK, Cyrillic, etc.).
2427
//
2528
// There are also two known special cases in Databend which do not require quoting:
2629
// - "~" is a valid stage name
@@ -37,12 +40,12 @@ pub fn ident_needs_quote(ident: &str) -> bool {
3740

3841
let mut chars = ident.chars();
3942
let first = chars.next().unwrap();
40-
if !first.is_ascii_alphabetic() && first != '_' {
43+
if !is_ident_start(first) {
4144
return true;
4245
}
4346

4447
for c in chars {
45-
if !c.is_ascii_alphanumeric() && c != '_' && c != '$' {
48+
if !is_ident_continue(c) {
4649
return true;
4750
}
4851
}
@@ -57,7 +60,11 @@ pub fn display_ident(
5760
dialect: Dialect,
5861
) -> String {
5962
// Db-s -> "Db-s" ; dbs -> dbs
60-
if name.chars().any(|c| c.is_ascii_uppercase()) && quoted_ident_case_sensitive
63+
// Quote the identifier if it would change under to_lowercase(), so that
64+
// round-tripping through normalize_identifier (which lowercases unquoted
65+
// idents) preserves the original name. This covers both uppercase (Lu)
66+
// and titlecase (Lt) Unicode characters — e.g. Dž (U+01C5).
67+
if name != name.to_lowercase() && quoted_ident_case_sensitive
6168
|| ident_needs_quote(name)
6269
|| force_quoted_ident
6370
{
@@ -73,7 +80,7 @@ pub fn ident_opt_quote(
7380
quoted_ident_case_sensitive: bool,
7481
dialect: Dialect,
7582
) -> Option<char> {
76-
if name.chars().any(|c| c.is_ascii_uppercase()) && quoted_ident_case_sensitive
83+
if name != name.to_lowercase() && quoted_ident_case_sensitive
7784
|| ident_needs_quote(name)
7885
|| force_quoted_ident
7986
{

src/query/ast/src/parser/token.rs

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,45 @@ impl<'a> Iterator for Tokenizer<'a> {
137137
}
138138
}
139139

140+
/// Returns true if `c` can start an unquoted identifier.
141+
///
142+
/// ASCII letters and underscore are accepted, plus any Unicode character with
143+
/// the Alphabetic derived property (`char::is_alphabetic()` — covers CJK,
144+
/// Cyrillic, Arabic, etc. while excluding separators, punctuation, and symbols).
145+
pub fn is_ident_start(c: char) -> bool {
146+
c == '_' || c.is_alphabetic()
147+
}
148+
149+
/// Returns true if `c` can continue an unquoted identifier.
150+
///
151+
/// Same as `is_ident_start`, plus ASCII digits and `$`.
152+
/// Non-ASCII digits are also accepted via `char::is_alphanumeric()`.
153+
pub fn is_ident_continue(c: char) -> bool {
154+
c == '_' || c == '$' || c.is_alphanumeric()
155+
}
156+
157+
fn bump_ident_continue(lex: &mut Lexer<TokenKind>) {
158+
loop {
159+
match lex.remainder().chars().next() {
160+
Some(c) if is_ident_continue(c) => lex.bump(c.len_utf8()),
161+
_ => break,
162+
}
163+
}
164+
}
165+
166+
fn lex_ascii_ident(lex: &mut Lexer<TokenKind>) -> logos::FilterResult<()> {
167+
bump_ident_continue(lex);
168+
logos::FilterResult::Emit(())
169+
}
170+
171+
fn lex_unicode_ident(lex: &mut Lexer<TokenKind>) -> logos::FilterResult<()> {
172+
if !is_ident_start(lex.slice().chars().next().unwrap()) {
173+
return logos::FilterResult::Error;
174+
}
175+
bump_ident_continue(lex);
176+
logos::FilterResult::Emit(())
177+
}
178+
140179
fn lex_comment_block(lex: &mut Lexer<TokenKind>) -> logos::FilterResult<()> {
141180
let remainder = lex.remainder().as_bytes();
142181

@@ -167,7 +206,8 @@ pub enum TokenKind {
167206
#[token("/*", lex_comment_block)]
168207
CommentBlock,
169208

170-
#[regex(r#"[_a-zA-Z][_$a-zA-Z0-9]*"#)]
209+
#[regex(r#"[_a-zA-Z][_$a-zA-Z0-9]*"#, lex_ascii_ident)]
210+
#[regex(r"[^\x00-\x7f]", lex_unicode_ident)]
171211
Ident,
172212

173213
#[regex(r#"\$[_a-zA-Z][_$a-zA-Z0-9]*"#)]

src/query/ast/tests/it/parser.rs

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1738,7 +1738,7 @@ fn test_quote() {
17381738
("a\\\"b", "\"a\\\"\"b\""),
17391739
("12", "\"12\""),
17401740
("🍣", "\"🍣\""),
1741-
("価格", "\"価格\""),
1741+
("価格", "価格"),
17421742
("\t", "\"\t\""),
17431743
("complex \"string\"", "\"complex \"\"string\"\"\""),
17441744
("\"\"\"", "\"\"\"\"\"\"\"\""),
@@ -1759,3 +1759,43 @@ fn test_quote() {
17591759
};
17601760
}
17611761
}
1762+
1763+
#[test]
1764+
fn test_unicode_ident_tokenize() {
1765+
let cases = &[
1766+
("中文", vec![(TokenKind::Ident, "中文")]),
1767+
("価格", vec![(TokenKind::Ident, "価格")]),
1768+
("SELECT 'a' AS 中文", vec![
1769+
(TokenKind::SELECT, "SELECT"),
1770+
(TokenKind::LiteralString, "'a'"),
1771+
(TokenKind::AS, "AS"),
1772+
(TokenKind::Ident, "中文"),
1773+
]),
1774+
// Mixed ASCII and Unicode
1775+
("abc中文", vec![(TokenKind::Ident, "abc中文")]),
1776+
("abc中文123", vec![(TokenKind::Ident, "abc中文123")]),
1777+
("_中文", vec![(TokenKind::Ident, "_中文")]),
1778+
("_列名_1", vec![(TokenKind::Ident, "_列名_1")]),
1779+
// Non-BMP but Alphabetic (CJK Extension B U+20000) — still valid
1780+
("𠀀", vec![(TokenKind::Ident, "𠀀")]),
1781+
("abc𠀀123", vec![(TokenKind::Ident, "abc𠀀123")]),
1782+
("_𠀀$1", vec![(TokenKind::Ident, "_𠀀$1")]),
1783+
];
1784+
1785+
for (input, expected) in cases {
1786+
let tokens: Vec<_> = Tokenizer::new(input)
1787+
.map(|t| t.unwrap())
1788+
.filter(|t| t.kind != TokenKind::EOI)
1789+
.map(|t| (t.kind, t.text().to_string()))
1790+
.collect();
1791+
let tokens: Vec<_> = tokens.iter().map(|(k, s)| (*k, s.as_str())).collect();
1792+
assert_eq!(tokens, *expected, "input: {input}");
1793+
}
1794+
1795+
// Emoji is not Alphabetic, tokenizer must return an error
1796+
let err_cases = &["🍣"];
1797+
for input in err_cases {
1798+
let result: std::result::Result<Vec<_>, _> = Tokenizer::new(input).collect();
1799+
assert!(result.is_err(), "expected error for input: {input}");
1800+
}
1801+
}

src/query/expression/src/types/timestamp_tz.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use databend_common_exception::ErrorCode;
2121
use databend_common_exception::Result;
2222
use databend_common_io::datetime::parse_standard_timestamp as parse_iso_timestamp;
2323
use jiff::civil::Date;
24+
use jiff::civil::Time;
2425
use jiff::fmt;
2526
use jiff::tz;
2627
use jiff::tz::TimeZone;
@@ -196,8 +197,10 @@ fn build_timestamp_tz_from_components(
196197
year, month, day
197198
))
198199
})?;
200+
let time = Time::new(hour as i8, minute as i8, second as i8, 0)
201+
.map_err(|err| ErrorCode::BadBytes(format!("Invalid time value: {}", err)))?;
199202
let mut zoned = date
200-
.at(hour as i8, minute as i8, second as i8, 0)
203+
.to_datetime(time)
201204
.to_zoned(TimeZone::UTC)
202205
.map_err(|err| ErrorCode::BadBytes(format!("Invalid time value: {}", err)))?;
203206
if micro > 0 {

src/query/service/src/physical_plans/physical_mutation.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -281,14 +281,20 @@ impl PhysicalPlanBuilder {
281281
}
282282
}
283283
}
284+
// Not-matched expressions are evaluated on the not-matched branch directly.
285+
// That branch may bypass RowFetch, so these columns must be in the mutation input.
286+
let mut unmatched_required = BTreeSet::new();
284287
for unmatched_evaluator in unmatched_evaluators {
285288
if let Some(condition) = &unmatched_evaluator.condition {
286289
maybe_udfs.extend(condition.used_columns());
290+
unmatched_required.extend(condition.used_columns());
287291
}
288292
for value in &unmatched_evaluator.values {
289293
maybe_udfs.extend(value.used_columns());
294+
unmatched_required.extend(value.used_columns());
290295
}
291296
}
297+
required.extend(unmatched_required);
292298
for filter_value in direct_filter {
293299
maybe_udfs.extend(filter_value.used_columns());
294300
}
@@ -486,17 +492,20 @@ impl PhysicalPlanBuilder {
486492
);
487493

488494
for item in unmatched_evaluators {
495+
// The not-matched branch may bypass RowFetch (see RowFetch on MutationSplit),
496+
// so expressions must be bound against the original mutation input schema.
489497
let filter = if let Some(filter_expr) = &item.condition {
490-
Some(self.scalar_expr_to_remote_expr(filter_expr, output_schema.clone())?)
498+
Some(self.scalar_expr_to_remote_expr(filter_expr, mutation_input_schema.clone())?)
491499
} else {
492500
None
493501
};
494502

495503
let mut values_exprs = Vec::<RemoteExpr>::with_capacity(item.values.len());
496504

497505
for scalar_expr in &item.values {
498-
values_exprs
499-
.push(self.scalar_expr_to_remote_expr(scalar_expr, output_schema.clone())?)
506+
values_exprs.push(
507+
self.scalar_expr_to_remote_expr(scalar_expr, mutation_input_schema.clone())?,
508+
)
500509
}
501510

502511
unmatched.push((item.source_schema.clone(), filter, values_exprs))

0 commit comments

Comments
 (0)