diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 4aab9cee7..10716f3f2 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -65,7 +65,7 @@ jobs:
         with:
           crate: cargo-tarpaulin
           version: 0.14.2
-          use-tool-cache: true
+          use-tool-cache: false
       - name: Test
         run: cargo test --all-features

@@ -83,7 +83,7 @@ jobs:
         with:
           crate: cargo-tarpaulin
           version: 0.14.2
-          use-tool-cache: true
+          use-tool-cache: false
       - name: Coverage
         run: cargo tarpaulin -o Lcov --output-dir ./coverage
       - name: Coveralls
diff --git a/README.md b/README.md
index 3226b9549..6ecaccfd2 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,11 @@
+# Patching SQL Parser for LakeSail
+
+1. Use `dev` as the base branch when creating PRs in the fork.
+2. Please confirm the base repository when creating PRs, and manually choose `lakehq/sqlparser-rs` when proposing changes to the fork.
+3. For patching, use a squash commit to merge the PR. This ensures that each patch appears as a single commit in the `dev` branch of the fork.
+4. For merging from upstream, use a merge commit to merge the PR. This ensures that the upstream history is preserved in the `dev` branch of the fork.
+5. Please avoid mixing code changes and upstream merges in a single PR.
+
 # Extensible SQL Lexer and Parser for Rust
 
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 17c6202db..d7f1d5ace 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -706,7 +706,9 @@ impl<'a> Tokenizer<'a> {
                 // BigQuery uses b or B for byte string literal
                 b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                     chars.next(); // consume
-                    match chars.peek() {
+                    match peeking_skip_whitespace_take_if(chars, |ch| {
+                        matches!(ch, '\'') || matches!(ch, '\"')
+                    }) {
                         Some('\'') => {
                             if self.dialect.supports_triple_quoted_string() {
                                 return self
@@ -745,7 +747,9 @@ impl<'a> Tokenizer<'a> {
                 // BigQuery uses r or R for raw string literal
                 b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                     chars.next(); // consume
-                    match chars.peek() {
+                    match peeking_skip_whitespace_take_if(chars, |ch| {
+                        matches!(ch, '\'') || matches!(ch, '\"')
+                    }) {
                         Some('\'') => self
                             .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                 chars,
@@ -772,12 +776,19 @@ impl<'a> Tokenizer<'a> {
                 // Redshift uses lower case n for national string literal
                 n @ 'N' | n @ 'n' => {
                     chars.next(); // consume, to check the next char
-                    match chars.peek() {
+                    match peeking_skip_whitespace_take_if(chars, |ch| {
+                        matches!(ch, '\'') || matches!(ch, '\"')
+                    }) {
                         Some('\'') => {
                             // N'...' - a <national character string literal>
                             let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                             Ok(Some(Token::NationalStringLiteral(s)))
                         }
+                        Some('\"') => {
+                            // N"..." - a <national character string literal>
+                            let s = self.tokenize_single_quoted_string(chars, '\"', true)?;
+                            Ok(Some(Token::NationalStringLiteral(s)))
+                        }
                         _ => {
                             // regular identifier starting with an "N"
                             let s = self.tokenize_word(n, chars);
@@ -789,7 +800,7 @@ impl<'a> Tokenizer<'a> {
                 x @ 'e' | x @ 'E' => {
                     let starting_loc = chars.location();
                     chars.next(); // consume, to check the next char
-                    match chars.peek() {
+                    match peeking_skip_whitespace_take_if(chars, |ch| matches!(ch, '\'')) {
                         Some('\'') => {
                             let s =
                                 self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
@@ -823,12 +834,19 @@ impl<'a> Tokenizer<'a> {
                 // string, but PostgreSQL, at least, allows a lowercase 'x' too.
                 x @ 'x' | x @ 'X' => {
                     chars.next(); // consume, to check the next char
-                    match chars.peek() {
+                    match peeking_skip_whitespace_take_if(chars, |ch| {
+                        matches!(ch, '\'') || matches!(ch, '\"')
+                    }) {
                         Some('\'') => {
                             // X'...' - a <hexadecimal character string literal>
                             let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                             Ok(Some(Token::HexStringLiteral(s)))
                         }
+                        Some('\"') => {
+                            // X"..." - a <hexadecimal character string literal>
+                            let s = self.tokenize_single_quoted_string(chars, '\"', true)?;
+                            Ok(Some(Token::HexStringLiteral(s)))
+                        }
                         _ => {
                             // regular identifier starting with an "X"
                             let s = self.tokenize_word(x, chars);
@@ -1674,6 +1692,47 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> boo
     s
 }
 
+/// Peek ahead in a clone of `chars.peekable`, skipping whitespace,
+/// until `predicate` returns `true` or a non-whitespace character is encountered.
+/// If a character matching the predicate is found:
+/// - Advance the original iterator by the number of whitespace characters skipped
+/// - Return the peeked character matching the predicate
+///
+/// If a non-whitespace character not matching the predicate is encountered, or EOF is reached,
+/// return `chars.peek()` without advancing the iterator.
+///
+/// Note: This function may advance the original iterator if a match is found after skipping whitespace.
+fn peeking_skip_whitespace_take_if(
+    chars: &mut State,
+    mut predicate: impl FnMut(char) -> bool,
+) -> Option<char> {
+    // Check if the next character is a match to avoid unnecessary cloning.
+    if let Some(&ch) = chars.peek() {
+        if predicate(ch) {
+            return Some(ch);
+        }
+    }
+
+    let mut chars_clone = chars.peekable.clone();
+    let mut next_count = 0;
+    loop {
+        match chars_clone.peek() {
+            Some(&ch) if predicate(ch) => {
+                // Advance the original iterator past the skipped whitespace.
+                for _ in 0..next_count {
+                    chars.next();
+                }
+                return chars.peek().copied();
+            }
+            Some(ch) if ch.is_whitespace() => {
+                next_count += 1;
+                chars_clone.next();
+            }
+            _ => return chars.peek().copied(),
+        }
+    }
+}
+
 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
     Unescape::new(chars).unescape()
 }
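
Note on the tokenizer patch above: the new `peeking_skip_whitespace_take_if` helper lets a string-literal prefix (`B`/`R`/`N`/`E`/`X`) be separated from the opening quote by whitespace without the prefix falling back to a plain identifier. The sketch below is only an illustration of that peek-ahead idea on a plain `Peekable<Chars>`; the name `peek_past_whitespace_if` and the sample input are made up for this note and are not part of the sqlparser-rs API, which operates on the tokenizer's `State` type.

```rust
use std::iter::Peekable;
use std::str::Chars;

/// Illustrative stand-in for the patched helper: look past whitespace and,
/// only if the next non-whitespace character satisfies `predicate`, consume
/// the skipped whitespace in the original iterator and report that character.
/// Otherwise leave the iterator untouched and return its current peek.
fn peek_past_whitespace_if(
    chars: &mut Peekable<Chars<'_>>,
    predicate: impl Fn(char) -> bool,
) -> Option<char> {
    let mut lookahead = chars.clone();
    let mut skipped = 0;
    while let Some(&ch) = lookahead.peek() {
        if predicate(ch) {
            // Commit: advance the original iterator past the whitespace only.
            for _ in 0..skipped {
                chars.next();
            }
            return Some(ch);
        } else if ch.is_whitespace() {
            skipped += 1;
            lookahead.next();
        } else {
            break;
        }
    }
    chars.peek().copied()
}

fn main() {
    // Hypothetical input: a national string literal with whitespace between
    // the `N` prefix and the opening quote.
    let mut chars = "N 'abc'".chars().peekable();
    chars.next(); // consume the `N` prefix, as the tokenizer does
    assert_eq!(peek_past_whitespace_if(&mut chars, |c| c == '\''), Some('\''));
    assert_eq!(chars.peek(), Some(&'\'')); // the whitespace has been consumed
}
```

The design point, as stated in the helper's doc comment, is that the original iterator is advanced only when the look-ahead actually finds a matching quote; if the character after the whitespace does not match, the tokenizer state is left untouched.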