4 changes: 2 additions & 2 deletions .github/workflows/rust.yml
@@ -65,7 +65,7 @@ jobs:
with:
crate: cargo-tarpaulin
version: 0.14.2
use-tool-cache: true
use-tool-cache: false
- name: Test
run: cargo test --all-features

@@ -83,7 +83,7 @@ jobs:
with:
crate: cargo-tarpaulin
version: 0.14.2
use-tool-cache: true
use-tool-cache: false
- name: Coverage
run: cargo tarpaulin -o Lcov --output-dir ./coverage
- name: Coveralls
8 changes: 8 additions & 0 deletions README.md
@@ -1,3 +1,11 @@
# Patching SQL Parser for LakeSail

1. Use `dev` as the base branch when creating PRs in the fork.
2. Please confirm the base repository when creating PRs. You should manually choose `lakehq/sqlparser-rs` when proposing changes to the fork.
3. For patching, use a squash commit to merge the PR. This ensures that each patch appears as a single commit in the `dev` branch of the fork.
4. For merging from upstream, use a merge commit to merge the PR. This ensures that the upstream history is kept in the `dev` branch of the fork.
5. Please avoid mixing code changes and upstream merges in a single PR.

# Extensible SQL Lexer and Parser for Rust

[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
69 changes: 64 additions & 5 deletions src/tokenizer.rs
@@ -706,7 +706,9 @@ impl<'a> Tokenizer<'a> {
// BigQuery uses b or B for byte string literal
b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
chars.next(); // consume
match chars.peek() {
match peeking_skip_whitespace_take_if(chars, |ch| {
matches!(ch, '\'') || matches!(ch, '\"')
}) {
Some('\'') => {
if self.dialect.supports_triple_quoted_string() {
return self
@@ -745,7 +747,9 @@ impl<'a> Tokenizer<'a> {
// BigQuery uses r or R for raw string literal
b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
chars.next(); // consume
match chars.peek() {
match peeking_skip_whitespace_take_if(chars, |ch| {
matches!(ch, '\'') || matches!(ch, '\"')
}) {
Some('\'') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
@@ -772,12 +776,19 @@ impl<'a> Tokenizer<'a> {
// Redshift uses lower case n for national string literal
n @ 'N' | n @ 'n' => {
chars.next(); // consume, to check the next char
match chars.peek() {
match peeking_skip_whitespace_take_if(chars, |ch| {
matches!(ch, '\'') || matches!(ch, '\"')
}) {
Some('\'') => {
// N'...' - a <national character string literal>
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
Ok(Some(Token::NationalStringLiteral(s)))
}
Some('\"') => {
// N"..." - a <national character string literal>
let s = self.tokenize_single_quoted_string(chars, '\"', true)?;
Ok(Some(Token::NationalStringLiteral(s)))
}
_ => {
// regular identifier starting with an "N"
let s = self.tokenize_word(n, chars);
@@ -789,7 +800,7 @@ impl<'a> Tokenizer<'a> {
x @ 'e' | x @ 'E' => {
let starting_loc = chars.location();
chars.next(); // consume, to check the next char
match chars.peek() {
match peeking_skip_whitespace_take_if(chars, |ch| matches!(ch, '\'')) {
Some('\'') => {
let s =
self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
@@ -823,12 +834,19 @@ impl<'a> Tokenizer<'a> {
// string, but PostgreSQL, at least, allows a lowercase 'x' too.
x @ 'x' | x @ 'X' => {
chars.next(); // consume, to check the next char
match chars.peek() {
match peeking_skip_whitespace_take_if(chars, |ch| {
matches!(ch, '\'') || matches!(ch, '\"')
}) {
Some('\'') => {
// X'...' - a <binary string literal>
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
Ok(Some(Token::HexStringLiteral(s)))
}
Some('\"') => {
// X"..." - a <binary string literal>
let s = self.tokenize_single_quoted_string(chars, '\"', true)?;
Ok(Some(Token::HexStringLiteral(s)))
}
_ => {
// regular identifier starting with an "X"
let s = self.tokenize_word(x, chars);
@@ -1674,6 +1692,47 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
s
}

/// Peek ahead in a clone of `chars.peekable`, skipping whitespace,
/// until `predicate` returns `true` or a non-whitespace character is encountered.
/// If a character matching the predicate is found:
/// - Advance the original iterator by the number of whitespace characters skipped
/// - Return the peeked character matching the predicate
///
/// If a non-whitespace character not matching the predicate is encountered, or EOF is reached,
/// return `chars.peek()` without advancing the iterator.
///
/// Note: This function may advance the original iterator if a match is found after skipping whitespace.
fn peeking_skip_whitespace_take_if(
chars: &mut State,
mut predicate: impl FnMut(char) -> bool,
) -> Option<char> {
// Check if the next character is a match to avoid unnecessary cloning.
if let Some(&ch) = chars.peek() {
if predicate(ch) {
return Some(ch);
}
}

let mut chars_clone = chars.peekable.clone();
let mut next_count = 0;
loop {
match chars_clone.peek() {
Some(&ch) if predicate(ch) => {
// Advance the original iterator
for _ in 0..next_count {
chars.next();
}
return chars.peek().copied();
}
Some(ch) if ch.is_whitespace() => {
next_count += 1;
chars_clone.next();
}
_ => return chars.peek().copied(),
}
}
}

fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
Unescape::new(chars).unescape()
}
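
Taken together, the hunks above route the prefix lookahead for `B`/`b`, `R`/`r`, `N`/`n`, `E`/`e`, and `X`/`x` literals through the new `peeking_skip_whitespace_take_if` helper, so whitespace between the prefix letter and the opening quote no longer splits the input into a bare word followed by an ordinary string. Below is a minimal sketch of the intended end-to-end effect; it is not part of this diff and assumes the crate's public `Tokenizer` API with `GenericDialect`:

```rust
use sqlparser::dialect::GenericDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let dialect = GenericDialect {};
    // Whitespace separates the prefix letters from the opening quotes.
    let sql = "SELECT N 'National Chars', X 'deadBEEF'";

    let mut tokenizer = Tokenizer::new(&dialect, sql);
    let tokens = tokenizer.tokenize().unwrap();

    // With the patched lookahead these are expected to come out as single
    // prefixed-literal tokens rather than a bare `N`/`X` word followed by a
    // plain string literal.
    assert!(tokens
        .iter()
        .any(|t| matches!(t, Token::NationalStringLiteral(s) if s == "National Chars")));
    assert!(tokens
        .iter()
        .any(|t| matches!(t, Token::HexStringLiteral(s) if s == "deadBEEF")));
}
```

Because the helper only advances the original iterator when a character matching the predicate is actually found, a prefix letter followed by anything other than a quote (for example the identifier `Name`) still tokenizes as a regular word, as before.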