Add support for C-style comments

altmannmarcelo · altmannmarcelo · commit 4123881952e2 · 2025-09-17T12:26:44.000-03:00
This commit adds support for MySQL's C-style `/*! ... */` comments: their
contents are emitted as a word token instead of being discarded as a comment.
The tokenizer consumes the `!` and the optional version number that follows it.
diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs
@@ -156,6 +156,10 @@ impl Dialect for GenericDialect {
         true
     }
 
+    fn supports_c_style_comments(&self) -> bool {
+        true
+    }
+
     fn supports_user_host_grantee(&self) -> bool {
         true
     }
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
@@ -898,6 +898,11 @@ pub trait Dialect: Debug + Any {
         false
     }
 
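+    /// Returns true if the dialect supports C-style comments of the form `/*! ... */` (optionally `/*!NNNNN ... */` with a version number), whose contents are tokenized instead of skipped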
+    fn supports_c_style_comments(&self) -> bool {
+        false
+    }
+
     /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
     /// as an alias assignment operator, rather than a boolean expression.
     /// For example: the following statements are equivalent for such a dialect:
diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs
@@ -84,6 +84,11 @@ impl Dialect for MySqlDialect {
         true
     }
 
+    /// MySQL supports C-style `/*! ... */` comments; see <https://dev.mysql.com/doc/refman/8.4/en/comments.html>
+    fn supports_c_style_comments(&self) -> bool {
+        true
+    }
+
     fn parse_infix(
         &self,
         parser: &mut crate::parser::Parser,
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
@@ -2107,8 +2107,9 @@ impl<'a> Tokenizer<'a> {
     ) -> Result<Option<Token>, TokenizerError> {
         let mut s = String::new();
         let mut nested = 1;
+        let mut c_style_comments = false;
         let supports_nested_comments = self.dialect.supports_nested_comments();
-
+        let supports_c_style_comments = self.dialect.supports_c_style_comments();
         loop {
             match chars.next() {
                 Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
@@ -2117,10 +2118,37 @@ impl<'a> Tokenizer<'a> {
                     s.push('*');
                     nested += 1;
                 }
+                Some('!') if supports_c_style_comments => {
+                    c_style_comments = true;
+                    while let Some('0') | Some('1') | Some('2') | Some('3') | Some('4')
+                    | Some('5') | Some('6') | Some('7') | Some('8') | Some('9') = chars.peek()
+                    {
+                        chars.next(); // consume the digit
+                    }
+                }
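+                // Whitespace inside a C-style comment is dropped. NOTE(review): the loop below peeks without advancing, so two consecutive whitespace chars loop forever, and its '*/' test can never be true.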
+                Some(ch) if ch.is_whitespace() && c_style_comments => {
+                    let mut ws_count = 0;
+                    while let Some(&c) = chars.peek() {
+                        if c.is_whitespace() {
+                            ws_count += 1;
+                        } else if c == '*' && chars.peek() == Some(&'/') {
+                            for _ in 0..ws_count {
+                                chars.next();
+                            }
+                            break;
+                        } else {
+                            break;
+                        }
+                    }
+                }
                 Some('*') if matches!(chars.peek(), Some('/')) => {
                     chars.next(); // consume the '/'
                     nested -= 1;
                     if nested == 0 {
+                        if c_style_comments {
+                            break Ok(Some(Token::make_word(&s, None)));
+                        }
                         break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                     }
                     s.push('*');
@@ -4070,4 +4098,39 @@ mod tests {
             panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
         }
     }
+    #[test]
+    fn tokenize_multiline_comment_with_c_style_comment() {
+        let sql = String::from("0/*! word */1");
+
+        let dialect = MySqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+        let expected = vec![
+            Token::Number("0".to_string(), false),
+            Token::Word(Word {
+                value: "word".to_string(),
+                quote_style: None,
+                keyword: Keyword::NoKeyword,
+            }),
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+    }
+
+    #[test]
+    fn tokenize_multiline_comment_with_c_style_comment_and_version() {
+        let sql = String::from("0/*!8000000 word */1");
+
+        let dialect = MySqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+        let expected = vec![
+            Token::Number("0".to_string(), false),
+            Token::Word(Word {
+                value: "word".to_string(),
+                quote_style: None,
+                keyword: Keyword::NoKeyword,
+            }),
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+    }
 }

Original file line number	Diff line number	Diff line change
`@@ -156,6 +156,10 @@ impl Dialect for GenericDialect {`
`156`	`156`	`true`
`157`	`157`	`}`
`158`	`158`
	`159`	`+ fn supports_c_style_comments(&self) -> bool {`
	`160`	`+ true`
	`161`	`+ }`
	`162`	`+`
`159`	`163`	`fn supports_user_host_grantee(&self) -> bool {`
`160`	`164`	`true`
`161`	`165`	`}`