-
Notifications
You must be signed in to change notification settings - Fork 653
Add support for C-style comments #2034
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -898,6 +898,12 @@ pub trait Dialect: Debug + Any { | |||||||||||||
false | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
/// Returns true if the dialect supports hint and C-style comments | ||||||||||||||
/// e.g. `/*! hint */` | ||||||||||||||
fn supports_c_style_hints(&self) -> bool { | ||||||||||||||
Comment on lines
+901
to
+903
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah to clarify the suggested change, c-style comment I think is rather understood as |
||||||||||||||
false | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
/// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem` | ||||||||||||||
/// as an alias assignment operator, rather than a boolean expression. | ||||||||||||||
/// For example: the following statements are equivalent for such a dialect: | ||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -810,6 +810,8 @@ pub struct Tokenizer<'a> { | |
/// If true (the default), the tokenizer will un-escape literal | ||
/// SQL strings See [`Tokenizer::with_unescape`] for more details. | ||
unescape: bool, | ||
/// Tokens injected back into the stream (e.g. from MySQL C-style hints) | ||
pending_tokens: Vec<Token>, | ||
} | ||
|
||
impl<'a> Tokenizer<'a> { | ||
|
@@ -834,6 +836,7 @@ impl<'a> Tokenizer<'a> { | |
dialect, | ||
query, | ||
unescape: true, | ||
pending_tokens: Vec::new(), | ||
} | ||
} | ||
|
||
|
@@ -936,10 +939,16 @@ impl<'a> Tokenizer<'a> { | |
|
||
/// Get the next token or return None | ||
fn next_token( | ||
&self, | ||
&mut self, | ||
chars: &mut State, | ||
prev_token: Option<&Token>, | ||
) -> Result<Option<Token>, TokenizerError> { | ||
// Return any previously injected tokens first | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. impl wise I'm thinking we can do something like the following that would be less invasive to the tokenizer: If we take as an example Then the idea would be to essentially re-tokenize that string and add to the tokens buffer. So something like this: // we check if this is a token containing optimizer hints
match token {
Token::Whitespace(Whitespace::MultiLineComment(comment)) if self.dialect.supports_multiline_comment_hints() && s.starts_with("!") => {
// re-tokenize the hints
buf.extend(tokenize_comment_hints(comment, span)?);
token => {
buf.push(TokenWithSpan{ token, span })
}
}
// here we reuse existing tokenizer machinery to re-tokenize the hints.
fn tokenize_comment_hints(&self, hints: String, span: Span, tokens: &mut Vec<TokenWithSpan>) -> Result<()> {
let mut state = State {
peekable: hints.chars().peekable(),
line: span.start.line,
col: span.start.column,
};
while let Some(token) = self.next_token(&mut state, None)? {
tokens.push(token);
}
Ok(())
} |
||
{ | ||
if let Some(tok) = self.pending_tokens.pop() { | ||
return Ok(Some(tok)); | ||
} | ||
} | ||
match chars.peek() { | ||
Some(&ch) => match ch { | ||
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)), | ||
|
@@ -2102,13 +2111,14 @@ impl<'a> Tokenizer<'a> { | |
} | ||
|
||
fn tokenize_multiline_comment( | ||
&self, | ||
&mut self, | ||
chars: &mut State, | ||
) -> Result<Option<Token>, TokenizerError> { | ||
let mut s = String::new(); | ||
let mut nested = 1; | ||
let mut c_style_comments = false; | ||
let supports_nested_comments = self.dialect.supports_nested_comments(); | ||
|
||
let supports_c_style_comments = self.dialect.supports_c_style_hints(); | ||
loop { | ||
match chars.next() { | ||
Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => { | ||
|
@@ -2117,10 +2127,24 @@ impl<'a> Tokenizer<'a> { | |
s.push('*'); | ||
nested += 1; | ||
} | ||
Some('!') if supports_c_style_comments => { | ||
c_style_comments = true; | ||
// consume only version digits (leave following whitespace/content intact) | ||
while let Some(&c) = chars.peek() { | ||
if c.is_ascii_digit() { | ||
chars.next(); | ||
} else { | ||
break; | ||
} | ||
} | ||
} | ||
Some('*') if matches!(chars.peek(), Some('/')) => { | ||
chars.next(); // consume the '/' | ||
nested -= 1; | ||
if nested == 0 { | ||
if c_style_comments { | ||
break self.inject_tokens_from_c_style_hints_and_return_first(s); | ||
} | ||
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s)))); | ||
} | ||
s.push('*'); | ||
|
@@ -2139,6 +2163,26 @@ impl<'a> Tokenizer<'a> { | |
} | ||
} | ||
|
||
/// Tokenize the given string using the same dialect/unescape settings and inject | ||
/// the resulting tokens back into this tokenizer so they are returned before | ||
/// any further characters from the main stream. Returns the first injected token. | ||
fn inject_tokens_from_c_style_hints_and_return_first( | ||
&mut self, | ||
inner_sql: String, | ||
) -> Result<Option<Token>, TokenizerError> { | ||
let trimmed = inner_sql.trim(); | ||
if trimmed.is_empty() { | ||
return Ok(None); | ||
} | ||
let mut inner = Tokenizer::new(self.dialect, trimmed).with_unescape(self.unescape); | ||
let tokens = inner.tokenize()?; | ||
// push in reverse so we can pop from the end efficiently | ||
for t in tokens.into_iter().rev() { | ||
self.pending_tokens.push(t); | ||
} | ||
Ok(self.pending_tokens.pop()) | ||
} | ||
|
||
fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) { | ||
let mut last_char = None; | ||
let mut s = String::new(); | ||
|
@@ -4070,4 +4114,44 @@ mod tests { | |
panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); | ||
} | ||
} | ||
#[test] | ||
fn tokenize_multiline_comment_with_c_style_comment() { | ||
let sql = String::from("0/*! word */1"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking at their docs, I'm wondering if/how we support these examples? SELECT /*! STRAIGHT_JOIN */ col1 FROM table1,table2
/*!50110 KEY_BLOCK_SIZE=1024 */
SELECT /*! BKA(t1) */ FROM T There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @iffyio - Parsing the c_style comment unblocks sqlparser to not discard those as if they were a normal comment. Support for each hint will have to be added in a case by case bases. For example #2033 - MySQL adds a c-style comment if you run SHOW CREATE TABLE:
Without the current patch, the invisible keyword will be discarded. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah so to clarify I'm rather wondering regarding the parser behavior for hints that aren't singe words e.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @iffyio thanks flagging this. I have fixed the issue and now we properly return individual tokens inside a C-style hint comment. |
||
|
||
let dialect = MySqlDialect {}; | ||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); | ||
let expected = vec![ | ||
Token::Number("0".to_string(), false), | ||
Token::Word(Word { | ||
value: "word".to_string(), | ||
quote_style: None, | ||
keyword: Keyword::NoKeyword, | ||
}), | ||
Token::Number("1".to_string(), false), | ||
]; | ||
compare(expected, tokens); | ||
} | ||
|
||
#[test] | ||
fn tokenize_multiline_comment_with_c_style_comment_and_version() { | ||
let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1"); | ||
let dialect = MySqlDialect {}; | ||
let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap(); | ||
let expected = vec![ | ||
Token::Number("0".to_string(), false), | ||
Token::Whitespace(Whitespace::Space), | ||
Token::Word(Word { | ||
value: "KEY_BLOCK_SIZE".to_string(), | ||
quote_style: None, | ||
keyword: Keyword::KEY_BLOCK_SIZE, | ||
}), | ||
Token::Whitespace(Whitespace::Space), | ||
Token::Eq, | ||
Token::Whitespace(Whitespace::Space), | ||
Token::Number("1024".to_string(), false), | ||
Token::Whitespace(Whitespace::Space), | ||
Token::Number("1".to_string(), false), | ||
]; | ||
compare(expected, tokens); | ||
} | ||
} |
Uh oh!
There was an error while loading. Please reload this page.