From f5f916bafaf24430affb2a0be4431bd63e688fce Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Wed, 8 Jan 2025 09:46:03 +0100
Subject: [PATCH] Start new line if `\r` and dialect is postgres

Currently the tokenizer throws an error for

```sql
insert into cats_2 (petname) values ('foo'),--\r(version()||'\n');
```

This is because Postgres treats a lone `\r` as a newline character, see
https://github.com/postgres/postgres/blob/master/src/backend/parser/scan.l

> In order to make the world safe for Windows and Mac clients as well as
> Unix ones, we accept either \n or \r as a newline.
> A DOS-style \r\n sequence will be seen as two successive newlines, but
> that doesn't cause any problems.

> non_newline [^\n\r]
> comment ("--"{non_newline}*)

Let's make sure we start a new line if we encounter a `\r` when
tokenizing a comment.
---
 src/tokenizer.rs | 63 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 6 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 9103f22ea..54f3d3316 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1390,11 +1390,17 @@ impl<'a> Tokenizer<'a> {
 
     // Consume characters until newline
     fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
-        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
+        let mut comment = peeking_take_while(chars, |ch| match ch {
+            '\n' => false, // Always stop at \n
+            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
+            _ => true, // Keep consuming for other characters
+        });
+
         if let Some(ch) = chars.next() {
-            assert_eq!(ch, '\n');
+            assert!(ch == '\n' || ch == '\r');
             comment.push(ch);
         }
+
         comment
     }
 
@@ -2425,17 +2431,62 @@ mod tests {
 
     #[test]
     fn tokenize_comment() {
-        let sql = String::from("0--this is a comment\n1");
+        let test_cases = vec![
+            (
+                String::from("0--this is a comment\n1"),
+                vec![
+                    Token::Number("0".to_string(), false),
+                    Token::Whitespace(Whitespace::SingleLineComment {
+                        prefix: "--".to_string(),
+                        comment: "this is a comment\n".to_string(),
+                    }),
+                    Token::Number("1".to_string(), false),
+                ],
+            ),
+            (
+                String::from("0--this is a comment\r1"),
+                vec![
+                    Token::Number("0".to_string(), false),
+                    Token::Whitespace(Whitespace::SingleLineComment {
+                        prefix: "--".to_string(),
+                        comment: "this is a comment\r1".to_string(),
+                    }),
+                ],
+            ),
+            (
+                String::from("0--this is a comment\r\n1"),
+                vec![
+                    Token::Number("0".to_string(), false),
+                    Token::Whitespace(Whitespace::SingleLineComment {
+                        prefix: "--".to_string(),
+                        comment: "this is a comment\r\n".to_string(),
+                    }),
+                    Token::Number("1".to_string(), false),
+                ],
+            ),
+        ];
 
         let dialect = GenericDialect {};
+
+        for (sql, expected) in test_cases {
+            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+            compare(expected, tokens);
+        }
+    }
+
+    #[test]
+    fn tokenize_comment_postgres() {
+        let sql = String::from("1--\r0");
+
+        let dialect = PostgreSqlDialect {};
         let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
         let expected = vec![
-            Token::Number("0".to_string(), false),
+            Token::Number("1".to_string(), false),
             Token::Whitespace(Whitespace::SingleLineComment {
                 prefix: "--".to_string(),
-                comment: "this is a comment\n".to_string(),
+                comment: "\r".to_string(),
             }),
-            Token::Number("1".to_string(), false),
+            Token::Number("0".to_string(), false),
         ];
         compare(expected, tokens);
     }
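
For context, a minimal standalone sketch (not part of the patch; it assumes the
crate's public `Tokenizer::new(&dialect, &sql).tokenize()` API and module paths
as exercised by the tests above) showing the statement from the commit message
tokenizing cleanly under the Postgres dialect after this change:

```rust
use sqlparser::dialect::PostgreSqlDialect;
use sqlparser::tokenizer::Tokenizer;

fn main() {
    // The statement from the commit message; \r and \n are the actual
    // control characters. Before this patch the single-line comment ran
    // past the \r and up to the \n inside '\n', swallowing the opening
    // quote and leaving an unterminated string literal.
    let sql = "insert into cats_2 (petname) values ('foo'),--\r(version()||'\n');";

    let dialect = PostgreSqlDialect {};
    // With this patch the comment token ends at the \r, so tokenizing
    // resumes with (version()||'\n'); as the next "line".
    let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
    for token in tokens {
        println!("{token:?}");
    }
}
```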