Merge branch 'main' into sqllogic-test-split

dqhl76 · web-flow · commit 0016c161d8fc · 2026-03-12T16:44:29.000+08:00
diff --git a/.github/actions/test_compat_client_cluster/action.yml b/.github/actions/test_compat_client_cluster/action.yml
@@ -25,7 +25,7 @@ runs:
     - name: Setup Go
       uses: actions/setup-go@v5
       with:
-        go-version: "1.24"
+        go-version: "1.25"
 
     - name: checkout databend-jdbc
       uses: actions/checkout@v4
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/src/common/hashtable/src/README.md b/src/common/hashtable/src/README.md
@@ -1,5 +1,4 @@
 # Hashtable implementation in rust
 
-
-This package is a port from its cpp implementation from [ClickHouse](https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/HashTable.h).
+This package is a port from its cpp implementation from [ClickHouse].
 It is quite difficult to implement a high-performance hashtable from scratch. After comparing many implementations of hashtables, we found that ClickHouse's hashtable is the most suitable for OLAP systems. Therefore, we rewrote it into this Rust library.
diff --git a/src/common/io/src/cursor_ext/cursor_read_datetime_ext.rs b/src/common/io/src/cursor_ext/cursor_read_datetime_ext.rs
@@ -437,7 +437,11 @@ fn get_local_time(tz: &TimeZone, d: &Date, times: &mut Vec<u32>) -> Result<Zoned
     if let Some(zoned) = fast_local_to_zoned(tz, d, hour, minute, second, 0) {
         return Ok(zoned);
     }
-    tz.to_zoned(d.at(hour as i8, minute as i8, second as i8, 0))
+    let time = Time::new(hour as i8, minute as i8, second as i8, 0)
+        .map_err_to_code(ErrorCode::BadBytes, || {
+            format!("Invalid time {:02}:{:02}:{:02}", hour, minute, second)
+        })?;
+    tz.to_zoned(d.to_datetime(time))
         .map_err_to_code(ErrorCode::BadBytes, || {
             format!("Invalid time provided in times: {:?}", times)
         })
@@ -490,9 +494,23 @@ fn build_zoned_from_components(
     if let Some(zoned) = fast_local_to_zoned(tz, date, hour, minute, second, micro) {
         return Ok(zoned);
     }
+    let time = Time::new(hour as i8, minute as i8, second as i8, 0).map_err_to_code(
+        ErrorCode::BadBytes,
+        || {
+            format!(
+                "Invalid local time {:04}-{:02}-{:02} {:02}:{:02}:{:02}",
+                i32::from(date.year()),
+                date.month(),
+                date.day(),
+                hour,
+                minute,
+                second
+            )
+        },
+    )?;
 
     let base = tz
-        .to_zoned(date.at(hour as i8, minute as i8, second as i8, 0))
+        .to_zoned(date.to_datetime(time))
         .map_err_to_code(ErrorCode::BadBytes, || {
             format!(
                 "Invalid local time {:04}-{:02}-{:02} {:02}:{:02}:{:02}",
diff --git a/src/query/ast/Cargo.toml b/src/query/ast/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "databend-common-ast"
-version = "0.2.4"
+version = "0.2.5"
 publish = true
 description = "SQL parser for Databend"
 authors = { workspace = true }
diff --git a/src/query/ast/src/ast/quote.rs b/src/query/ast/src/ast/quote.rs
@@ -18,9 +18,12 @@ use std::iter::Peekable;
 use std::str::FromStr;
 
 use crate::parser::Dialect;
+use crate::parser::token::is_ident_continue;
+use crate::parser::token::is_ident_start;
 
-// In ANSI SQL, it does not need to quote an identifier if the identifier matches
-// the following regular expression: [A-Za-z_][A-Za-z0-9_$]*.
+// An identifier does not need quoting if its first character satisfies
+// is_ident_start (underscore or Unicode Alphabetic — ASCII, CJK, Cyrillic, etc.)
+// and each later one satisfies is_ident_continue (those plus `$` and Unicode Numeric).
 //
 // There are also two known special cases in Databend which do not require quoting:
 // - "~" is a valid stage name
@@ -37,12 +40,12 @@ pub fn ident_needs_quote(ident: &str) -> bool {
 
     let mut chars = ident.chars();
     let first = chars.next().unwrap();
-    if !first.is_ascii_alphabetic() && first != '_' {
+    if !is_ident_start(first) {
         return true;
     }
 
     for c in chars {
-        if !c.is_ascii_alphanumeric() && c != '_' && c != '$' {
+        if !is_ident_continue(c) {
             return true;
         }
     }
@@ -57,7 +60,11 @@ pub fn display_ident(
     dialect: Dialect,
 ) -> String {
     // Db-s -> "Db-s" ; dbs -> dbs
-    if name.chars().any(|c| c.is_ascii_uppercase()) && quoted_ident_case_sensitive
+    // Quote the identifier if it would change under to_lowercase(), so that
+    // round-tripping through normalize_identifier (which lowercases unquoted
+    // idents) preserves the original name.  This covers both uppercase (Lu)
+    // and titlecase (Lt) Unicode characters — e.g. ǅ (U+01C5).
+    if name != name.to_lowercase() && quoted_ident_case_sensitive
         || ident_needs_quote(name)
         || force_quoted_ident
     {
@@ -73,7 +80,7 @@ pub fn ident_opt_quote(
     quoted_ident_case_sensitive: bool,
     dialect: Dialect,
 ) -> Option<char> {
-    if name.chars().any(|c| c.is_ascii_uppercase()) && quoted_ident_case_sensitive
+    if name != name.to_lowercase() && quoted_ident_case_sensitive
         || ident_needs_quote(name)
         || force_quoted_ident
     {
diff --git a/src/query/ast/src/parser/token.rs b/src/query/ast/src/parser/token.rs
@@ -137,6 +137,45 @@ impl<'a> Iterator for Tokenizer<'a> {
     }
 }
 
+/// Returns true if `c` can start an unquoted identifier.
+///
+/// Accepts underscore plus any character with the Unicode Alphabetic derived
+/// property (`char::is_alphabetic()`): ASCII letters, CJK, Cyrillic, Arabic, etc.
+/// ASCII digits, `$`, whitespace, and emoji such as 🍣 are rejected.
+pub fn is_ident_start(c: char) -> bool {
+    c == '_' || c.is_alphabetic()
+}
+
+/// Returns true if `c` can continue an unquoted identifier.
+///
+/// Accepts underscore, `$`, and anything `char::is_alphanumeric()` accepts: Alphabetic
+/// or Numeric (Nd, Nl, No), so characters like `٣`, `¾` and `①` pass, not just 0-9.
+pub fn is_ident_continue(c: char) -> bool {
+    c == '_' || c == '$' || c.is_alphanumeric()
+}
+
+fn bump_ident_continue(lex: &mut Lexer<TokenKind>) { // extend the current token over identifier chars
+    loop {
+        match lex.remainder().chars().next() { // peek at the next unconsumed char
+            Some(c) if is_ident_continue(c) => lex.bump(c.len_utf8()), // bump takes a byte count
+            _ => break, // end of input or a non-identifier char ends the token
+        }
+    }
+}
+
+fn lex_ascii_ident(lex: &mut Lexer<TokenKind>) -> logos::FilterResult<()> { // callback: ASCII-start ident
+    bump_ident_continue(lex); // the ASCII-only regex stops at a non-ASCII char; absorb the rest
+    logos::FilterResult::Emit(())
+}
+
+fn lex_unicode_ident(lex: &mut Lexer<TokenKind>) -> logos::FilterResult<()> { // callback: non-ASCII start
+    if !is_ident_start(lex.slice().chars().next().unwrap()) { // regex matched one char, slice is non-empty
+        return logos::FilterResult::Error; // non-ASCII char that may not start an identifier
+    }
+    bump_ident_continue(lex);
+    logos::FilterResult::Emit(())
+}
+
 fn lex_comment_block(lex: &mut Lexer<TokenKind>) -> logos::FilterResult<()> {
     let remainder = lex.remainder().as_bytes();
 
@@ -167,7 +206,8 @@ pub enum TokenKind {
     #[token("/*", lex_comment_block)]
     CommentBlock,
 
-    #[regex(r#"[_a-zA-Z][_$a-zA-Z0-9]*"#)]
+    #[regex(r#"[_a-zA-Z][_$a-zA-Z0-9]*"#, lex_ascii_ident)]
+    #[regex(r"[^\x00-\x7f]", lex_unicode_ident)]
     Ident,
 
     #[regex(r#"\$[_a-zA-Z][_$a-zA-Z0-9]*"#)]
diff --git a/src/query/ast/tests/it/parser.rs b/src/query/ast/tests/it/parser.rs
@@ -1738,7 +1738,7 @@ fn test_quote() {
         ("a\\\"b", "\"a\\\"\"b\""),
         ("12", "\"12\""),
         ("🍣", "\"🍣\""),
-        ("価格", "\"価格\""),
+        ("価格", "価格"),
         ("\t", "\"\t\""),
         ("complex \"string\"", "\"complex \"\"string\"\"\""),
         ("\"\"\"", "\"\"\"\"\"\"\"\""),
@@ -1759,3 +1759,43 @@ fn test_quote() {
         };
     }
 }
+
+#[test]
+fn test_unicode_ident_tokenize() {
+    let cases = &[
+        ("中文", vec![(TokenKind::Ident, "中文")]),
+        ("価格", vec![(TokenKind::Ident, "価格")]),
+        ("SELECT 'a' AS 中文", vec![
+            (TokenKind::SELECT, "SELECT"),
+            (TokenKind::LiteralString, "'a'"),
+            (TokenKind::AS, "AS"),
+            (TokenKind::Ident, "中文"),
+        ]),
+        // Mixed ASCII and Unicode: each input must come back as a single Ident token
+        ("abc中文", vec![(TokenKind::Ident, "abc中文")]),
+        ("abc中文123", vec![(TokenKind::Ident, "abc中文123")]),
+        ("_中文", vec![(TokenKind::Ident, "_中文")]),
+        ("_列名_1", vec![(TokenKind::Ident, "_列名_1")]),
+        // Non-BMP but Alphabetic (CJK Extension B, U+20000): a 4-byte UTF-8 char is still valid
+        ("𠀀", vec![(TokenKind::Ident, "𠀀")]),
+        ("abc𠀀123", vec![(TokenKind::Ident, "abc𠀀123")]),
+        ("_𠀀$1", vec![(TokenKind::Ident, "_𠀀$1")]),
+    ];
+
+    for (input, expected) in cases {
+        let tokens: Vec<_> = Tokenizer::new(input)
+            .map(|t| t.unwrap()) // every case in this table must tokenize without error
+            .filter(|t| t.kind != TokenKind::EOI) // drop EOI so only the tokens under test are compared
+            .map(|t| (t.kind, t.text().to_string()))
+            .collect();
+        let tokens: Vec<_> = tokens.iter().map(|(k, s)| (*k, s.as_str())).collect(); // compare as &str
+        assert_eq!(tokens, *expected, "input: {input}");
+    }
+
+    // Emoji is non-ASCII but not Alphabetic, so the tokenizer must return an error
+    let err_cases = &["🍣"];
+    for input in err_cases {
+        let result: std::result::Result<Vec<_>, _> = Tokenizer::new(input).collect(); // Err if any token fails
+        assert!(result.is_err(), "expected error for input: {input}");
+    }
+}
diff --git a/src/query/expression/src/types/timestamp_tz.rs b/src/query/expression/src/types/timestamp_tz.rs
@@ -21,6 +21,7 @@ use databend_common_exception::ErrorCode;
 use databend_common_exception::Result;
 use databend_common_io::datetime::parse_standard_timestamp as parse_iso_timestamp;
 use jiff::civil::Date;
+use jiff::civil::Time;
 use jiff::fmt;
 use jiff::tz;
 use jiff::tz::TimeZone;
@@ -196,8 +197,10 @@ fn build_timestamp_tz_from_components(
             year, month, day
         ))
     })?;
+    let time = Time::new(hour as i8, minute as i8, second as i8, 0)
+        .map_err(|err| ErrorCode::BadBytes(format!("Invalid time value: {}", err)))?;
     let mut zoned = date
-        .at(hour as i8, minute as i8, second as i8, 0)
+        .to_datetime(time)
         .to_zoned(TimeZone::UTC)
         .map_err(|err| ErrorCode::BadBytes(format!("Invalid time value: {}", err)))?;
     if micro > 0 {
diff --git a/src/query/service/src/physical_plans/physical_mutation.rs b/src/query/service/src/physical_plans/physical_mutation.rs
@@ -281,14 +281,20 @@ impl PhysicalPlanBuilder {
                 }
             }
         }
+        // Not-matched expressions are evaluated on the not-matched branch directly.
+        // That branch may bypass RowFetch, so these columns must be in the mutation input.
+        let mut unmatched_required = BTreeSet::new();
         for unmatched_evaluator in unmatched_evaluators {
             if let Some(condition) = &unmatched_evaluator.condition {
                 maybe_udfs.extend(condition.used_columns());
+                unmatched_required.extend(condition.used_columns());
             }
             for value in &unmatched_evaluator.values {
                 maybe_udfs.extend(value.used_columns());
+                unmatched_required.extend(value.used_columns());
             }
         }
+        required.extend(unmatched_required);
         for filter_value in direct_filter {
             maybe_udfs.extend(filter_value.used_columns());
         }
@@ -486,17 +492,20 @@ impl PhysicalPlanBuilder {
             );
 
         for item in unmatched_evaluators {
+            // The not-matched branch may bypass RowFetch (see RowFetch on MutationSplit),
+            // so expressions must be bound against the original mutation input schema.
             let filter = if let Some(filter_expr) = &item.condition {
-                Some(self.scalar_expr_to_remote_expr(filter_expr, output_schema.clone())?)
+                Some(self.scalar_expr_to_remote_expr(filter_expr, mutation_input_schema.clone())?)
             } else {
                 None
             };
 
             let mut values_exprs = Vec::<RemoteExpr>::with_capacity(item.values.len());
 
             for scalar_expr in &item.values {
-                values_exprs
-                    .push(self.scalar_expr_to_remote_expr(scalar_expr, output_schema.clone())?)
+                values_exprs.push(
+                    self.scalar_expr_to_remote_expr(scalar_expr, mutation_input_schema.clone())?,
+                )
             }
 
             unmatched.push((item.source_schema.clone(), filter, values_exprs))
diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join.rs
@@ -111,7 +111,11 @@ impl Join for OuterLeftHashJoin {
                 .map(|x| x.data_type().clone())
                 .collect::<Vec<_>>();
 
-            let build_block = null_block(&types, data.num_rows());
+            let build_block = match null_block(&types, data.num_rows()) {
+                None => None,
+                Some(data_block) => Some(data_block.project(&self.desc.build_projection)),
+            };
+
             let probe_block = Some(data.project(&self.desc.probe_projection));
             let result_block = final_result_block(&self.desc, probe_block, build_block, num_rows);
             return Ok(Box::new(OneBlockJoinStream(Some(result_block))));
diff --git a/src/query/service/tests/it/indexes/aggregating_index/index_refresh.rs b/src/query/service/tests/it/indexes/aggregating_index/index_refresh.rs
@@ -535,10 +535,21 @@ async fn test_sync_agg_index_after_copy_into() -> anyhow::Result<()> {
     let ctx = fixture.new_query_ctx().await?;
     let index_id0 = create_index(ctx, index_name, original_query, query.as_str(), true).await?;
 
-    // Copy into data
-    let _ =fixture.execute_query(
-        "COPY INTO books FROM 'https://datafuse-1253727613.cos.ap-hongkong.myqcloud.com/data/books.csv' FILE_FORMAT = (TYPE = CSV);",
-    )
+    // Create a stage and populate it with CSV data, then COPY INTO the table.
+    fixture.execute_command("CREATE STAGE books_stage").await?;
+    fixture
+        .execute_command(
+            "COPY INTO @books_stage FROM (SELECT * FROM (VALUES \
+             ('Transaction Processing', 'Jim Gray', '1992'), \
+             ('Readings in Database Systems', 'Michael Stonebraker', 'NULL'), \
+             ('Three Body', 'NULL-liucixin', '2019')) t(title, author, date)) \
+             FILE_FORMAT = (TYPE = CSV)",
+        )
+        .await?;
+    fixture
+        .execute_command(
+            "COPY INTO books FROM @books_stage FILE_FORMAT = (TYPE = CSV) PURGE = true FORCE = true",
+        )
         .await?;
 
     let root = fixture.storage_root();
diff --git a/src/query/sql/src/planner/semantic/name_resolution.rs b/src/query/sql/src/planner/semantic/name_resolution.rs
@@ -41,7 +41,7 @@ pub enum NameResolutionSuggest {
 
 impl NameResolutionContext {
     pub fn not_found_suggest(&self, ident: &Identifier) -> Option<NameResolutionSuggest> {
-        if !ident.name.chars().any(|c| c.is_ascii_uppercase()) {
+        if !ident.name.chars().any(|c| c.is_uppercase()) {
             return None;
         }
         match (
diff --git a/src/query/sql/src/planner/semantic/tests/identifier_quote_test.rs b/src/query/sql/src/planner/semantic/tests/identifier_quote_test.rs
@@ -15,6 +15,7 @@
 mod tests {
 
     use databend_common_ast::ast::Identifier;
+    use databend_common_ast::ast::quote::display_ident;
     use databend_common_ast::parser::Dialect;
     use databend_common_ast::parser::parse_sql;
     use databend_common_ast::parser::tokenize_sql;
@@ -185,4 +186,38 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_display_ident_quotes_unicode_uppercase_when_case_sensitive() -> anyhow::Result<()> {
+        let context = NameResolutionContext {
+            unquoted_ident_case_sensitive: false,
+            quoted_ident_case_sensitive: true,
+            deny_column_reference: false,
+        };
+
+        let rendered = display_ident(
+            "Ж", // non-ASCII uppercase letter
+            false, // presumably force_quoted_ident — TODO confirm against display_ident's signature
+            context.quoted_ident_case_sensitive,
+            Dialect::PostgreSQL,
+        );
+        assert_eq!(
+            rendered, r#""Ж""#,
+            "Unicode uppercase identifiers must stay quoted in output"
+        );
+
+        let reparsed_unquoted = Identifier { // the same name as it would look if emitted without quotes
+            span: Default::default(),
+            name: "Ж".to_string(),
+            quote: None,
+            ident_type: Default::default(),
+        };
+        let normalized = normalize_identifier(&reparsed_unquoted, &context);
+        assert_eq!(
+            normalized.name, "ж",
+            "Unquoted reparsing lowercases the identifier, so output must preserve quotes"
+        );
+
+        Ok(())
+    }
 }
diff --git a/tests/sqllogictests/suites/base/03_common/03_0052_join_with_fast_returuning.test b/tests/sqllogictests/suites/base/03_common/03_0052_join_with_fast_returuning.test
diff --git a/tests/sqllogictests/suites/base/issues/issue_16885.test b/tests/sqllogictests/suites/base/issues/issue_16885.test
diff --git a/tests/sqllogictests/suites/query/alias/unicode_ident.test b/tests/sqllogictests/suites/query/alias/unicode_ident.test
diff --git a/tests/sqllogictests/suites/query/functions/02_0012_function_datetimes.test b/tests/sqllogictests/suites/query/functions/02_0012_function_datetimes.test

Original file line number	Diff line number	Diff line change
`@@ -281,14 +281,20 @@ impl PhysicalPlanBuilder {`
`281`	`281`	`}`
`282`	`282`	`}`
`283`	`283`	`}`
	`284`	`+ // Not-matched expressions are evaluated on the not-matched branch directly.`
	`285`	`+ // That branch may bypass RowFetch, so these columns must be in the mutation input.`
	`286`	`+ let mut unmatched_required = BTreeSet::new();`
`284`	`287`	`for unmatched_evaluator in unmatched_evaluators {`
`285`	`288`	`if let Some(condition) = &unmatched_evaluator.condition {`
`286`	`289`	`maybe_udfs.extend(condition.used_columns());`
	`290`	`+ unmatched_required.extend(condition.used_columns());`
`287`	`291`	`}`
`288`	`292`	`for value in &unmatched_evaluator.values {`
`289`	`293`	`maybe_udfs.extend(value.used_columns());`
	`294`	`+ unmatched_required.extend(value.used_columns());`
`290`	`295`	`}`
`291`	`296`	`}`
	`297`	`+ required.extend(unmatched_required);`
`292`	`298`	`for filter_value in direct_filter {`
`293`	`299`	`maybe_udfs.extend(filter_value.used_columns());`
`294`	`300`	`}`
`@@ -486,17 +492,20 @@ impl PhysicalPlanBuilder {`
`486`	`492`	`);`
`487`	`493`
`488`	`494`	`for item in unmatched_evaluators {`
	`495`	`+ // The not-matched branch may bypass RowFetch (see RowFetch on MutationSplit),`
	`496`	`+ // so expressions must be bound against the original mutation input schema.`
`489`	`497`	`let filter = if let Some(filter_expr) = &item.condition {`
`490`		`- Some(self.scalar_expr_to_remote_expr(filter_expr, output_schema.clone())?)`
	`498`	`+ Some(self.scalar_expr_to_remote_expr(filter_expr, mutation_input_schema.clone())?)`
`491`	`499`	`} else {`
`492`	`500`	`None`
`493`	`501`	`};`
`494`	`502`
`495`	`503`	`let mut values_exprs = Vec::<RemoteExpr>::with_capacity(item.values.len());`
`496`	`504`
`497`	`505`	`for scalar_expr in &item.values {`
`498`		`- values_exprs`
`499`		`- .push(self.scalar_expr_to_remote_expr(scalar_expr, output_schema.clone())?)`
	`506`	`+ values_exprs.push(`
	`507`	`+ self.scalar_expr_to_remote_expr(scalar_expr, mutation_input_schema.clone())?,`
	`508`	`+ )`
`500`	`509`	`}`
`501`	`510`
`502`	`511`	`unmatched.push((item.source_schema.clone(), filter, values_exprs))`