From dc80721f88805dbff85b84ae1f097d75977a6446 Mon Sep 17 00:00:00 2001 From: Joey Hain Date: Tue, 23 Sep 2025 16:59:51 -0700 Subject: [PATCH 1/3] Correctly tokenize nested comments in Databricks --- src/dialect/databricks.rs | 5 ++ src/tokenizer.rs | 141 ++++++++++++++++++-------------------- 2 files changed, 70 insertions(+), 76 deletions(-) diff --git a/src/dialect/databricks.rs b/src/dialect/databricks.rs index 261133d19..74e379891 100644 --- a/src/dialect/databricks.rs +++ b/src/dialect/databricks.rs @@ -79,4 +79,9 @@ impl Dialect for DatabricksDialect { fn supports_struct_literal(&self) -> bool { true } + + // https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-syntax-comment + fn supports_nested_comments(&self) -> bool { + true + } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 428bb148b..3740405c0 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -2497,7 +2497,7 @@ mod tests { use crate::dialect::{ BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect, }; - use crate::test_utils::all_dialects_where; + use crate::test_utils::{all_dialects_except, all_dialects_where}; use core::fmt::Debug; #[test] @@ -3247,90 +3247,79 @@ mod tests { #[test] fn tokenize_nested_multiline_comment() { - let dialect = GenericDialect {}; - let test_cases = vec![ - ( - "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1", - vec![ - Token::Number("0".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment( - "multi-line\n* \n/* comment \n /*comment*/*/ ".into(), - )), - Token::Whitespace(Whitespace::Space), - Token::Div, - Token::Word(Word { - value: "comment".to_string(), - quote_style: None, - keyword: Keyword::COMMENT, - }), - Token::Mul, - Token::Div, - Token::Number("1".to_string(), false), - ], - ), - ( - "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1", - vec![ - Token::Number("0".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment( - "multi-line\n* \n/* 
comment \n /*comment/**/ */ /comment*/".into(), - )), - Token::Number("1".to_string(), false), - ], - ), - ( - "SELECT 1/* a /* b */ c */0", - vec![ - Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), - Token::Number("1".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())), - Token::Number("0".to_string(), false), - ], - ), - ]; + all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to( + "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1", + vec![ + Token::Number("0".to_string(), false), + Token::Whitespace(Whitespace::MultiLineComment( + "multi-line\n* \n/* comment \n /*comment*/*/ ".into(), + )), + Token::Whitespace(Whitespace::Space), + Token::Div, + Token::Word(Word { + value: "comment".to_string(), + quote_style: None, + keyword: Keyword::COMMENT, + }), + Token::Mul, + Token::Div, + Token::Number("1".to_string(), false), + ], + ); - for (sql, expected) in test_cases { - let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); - compare(expected, tokens); - } + all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to( + "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1", + vec![ + Token::Number("0".to_string(), false), + Token::Whitespace(Whitespace::MultiLineComment( + "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(), + )), + Token::Number("1".to_string(), false), + ], + ); + + all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to( + "SELECT 1/* a /* b */ c */0", + vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Number("1".to_string(), false), + Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())), + Token::Number("0".to_string(), false), + ], + ); } #[test] fn tokenize_nested_multiline_comment_empty() { - let sql = "select 1/*/**/*/0"; - - let dialect = GenericDialect {}; - let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); - let 
expected = vec![ - Token::make_keyword("select"), - Token::Whitespace(Whitespace::Space), - Token::Number("1".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())), - Token::Number("0".to_string(), false), - ]; - - compare(expected, tokens); + all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to( + "select 1/*/**/*/0", + vec![ + Token::make_keyword("select"), + Token::Whitespace(Whitespace::Space), + Token::Number("1".to_string(), false), + Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())), + Token::Number("0".to_string(), false), + ], + ); } #[test] fn tokenize_nested_comments_if_not_supported() { - let dialect = SQLiteDialect {}; - let sql = "SELECT 1/*/* nested comment */*/0"; - let tokens = Tokenizer::new(&dialect, sql).tokenize(); - let expected = vec![ - Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), - Token::Number("1".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment( - "/* nested comment ".to_string(), - )), - Token::Mul, - Token::Div, - Token::Number("0".to_string(), false), - ]; - - compare(expected, tokens.unwrap()); + all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to( + "SELECT 1/*/* nested comment */*/0", + vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Number("1".to_string(), false), + Token::Whitespace(Whitespace::MultiLineComment( + "/* nested comment ".to_string(), + )), + Token::Mul, + Token::Div, + Token::Number("0".to_string(), false), + ], + ); } #[test] From e81ee582801bd292e59156edd0c00926e133dd6d Mon Sep 17 00:00:00 2001 From: Joey Hain Date: Tue, 23 Sep 2025 17:08:28 -0700 Subject: [PATCH 2/3] clickhouse too --- src/dialect/clickhouse.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/dialect/clickhouse.rs b/src/dialect/clickhouse.rs index f5e70c309..aeeef6d64 100644 --- a/src/dialect/clickhouse.rs +++ b/src/dialect/clickhouse.rs @@ -94,4 +94,10 @@ impl Dialect 
for ClickHouseDialect { fn supports_group_by_with_modifier(&self) -> bool { true } + + // Supported since 2020. + // See + fn supports_nested_comments(&self) -> bool { + true + } } From 71865d03755de3f36a3d23c602d89eb53d2f041b Mon Sep 17 00:00:00 2001 From: Joey Hain Date: Tue, 23 Sep 2025 17:12:36 -0700 Subject: [PATCH 3/3] believe it or not, ansi --- src/dialect/ansi.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/dialect/ansi.rs b/src/dialect/ansi.rs index 32ba7b32a..ce1755a34 100644 --- a/src/dialect/ansi.rs +++ b/src/dialect/ansi.rs @@ -33,4 +33,9 @@ impl Dialect for AnsiDialect { fn require_interval_qualifier(&self) -> bool { true } + + // The SQL standard explicitly states that block comments nest. + fn supports_nested_comments(&self) -> bool { + true + } }