Skip to content

Commit 2752c55

Browse files
committed
Fix: handle Unicode heredoc tags & Rust multi-byte characters (count chars, not bytes) properly
1 parent 27a76cd commit 2752c55

File tree

3 files changed

+15
-6
lines changed

3 files changed

+15
-6
lines changed

sqlglot/tokens.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1421,7 +1421,11 @@ def _scan_string(self, start: str) -> bool:
14211421
raise_unmatched=not self.HEREDOC_TAG_IS_IDENTIFIER,
14221422
)
14231423

1424-
if tag and self.HEREDOC_TAG_IS_IDENTIFIER and (self._end or not tag.isidentifier()):
1424+
if (
1425+
tag
1426+
and self.HEREDOC_TAG_IS_IDENTIFIER
1427+
and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
1428+
):
14251429
if not self._end:
14261430
self._advance(-1)
14271431

sqlglotrs/src/tokenizer.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,7 @@ impl<'a> TokenizerState<'a> {
450450
self.advance(-1)?;
451451
}
452452

453-
self.advance(-(tag.len() as isize))?;
453+
self.advance(-(tag.chars().count() as isize))?;
454454
self.add(self.token_types.heredoc_string_alternative, None)?;
455455
return Ok(true);
456456
}
@@ -687,9 +687,10 @@ impl<'a> TokenizerState<'a> {
687687
continue;
688688
}
689689
}
690-
if self.chars(delimiter.len()) == delimiter {
691-
if delimiter.len() > 1 {
692-
self.advance((delimiter.len() - 1) as isize)?;
690+
let delimiter_char_count = delimiter.chars().count();
691+
if self.chars(delimiter_char_count) == delimiter {
692+
if delimiter_char_count > 1 {
693+
self.advance((delimiter_char_count - 1) as isize)?;
693694
}
694695
break;
695696
}
@@ -723,7 +724,7 @@ impl<'a> TokenizerState<'a> {
723724
fn is_identifier(&self, s: &str) -> bool {
724725
s.chars().enumerate().all(|(i, c)| {
725726
if i == 0 {
726-
self.is_alphabetic_or_underscore(c)
727+
! (c.is_ascii_digit() || c.is_whitespace())
727728
} else {
728729
self.is_alphabetic_or_underscore(c) || c.is_ascii_digit()
729730
}

tests/dialects/test_duckdb.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,10 @@ def test_duckdb(self):
363363
self.validate_identity(
364364
"""SELECT '{ "family": "anatidae", "species": [ "duck", "goose", "swan", null ] }' ->> ['$.family', '$.species']""",
365365
)
366+
self.validate_identity(
367+
"SELECT $🦆$foo$🦆$",
368+
"SELECT 'foo'",
369+
)
366370
self.validate_identity(
367371
"SELECT * FROM t PIVOT(FIRST(t) AS t, FOR quarter IN ('Q1', 'Q2'))",
368372
"SELECT * FROM t PIVOT(FIRST(t) AS t FOR quarter IN ('Q1', 'Q2'))",

0 commit comments

Comments
 (0)