Skip to content

Commit 3ba8f21

Browse files
committed
fix!: invert STRING_ESCAPES_NOOP to blacklist mode
1 parent 65a784f commit 3ba8f21

File tree

6 files changed

+38
-27
lines changed

6 files changed

+38
-27
lines changed

sqlglot/dialects/mysql.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,8 @@ class Tokenizer(tokens.Tokenizer):
202202
STRING_ESCAPES = ["'", '"', "\\"]
203203
BIT_STRINGS = [("b'", "'"), ("B'", "'"), ("0b", "")]
204204
HEX_STRINGS = [("x'", "'"), ("X'", "'"), ("0x", "")]
205-
STRING_ESCAPES_NOOP = ['"', "'"]
205+
# https://dev.mysql.com/doc/refman/8.4/en/string-literals.html
206+
ESCAPE_FOLLOW_CHARS = ["0", "b", "n", "r", "t", "Z", "%", "_"]
206207

207208
NESTED_COMMENTS = False
208209

sqlglot/tokens.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,7 @@ def _quotes_to_format(
548548
}
549549

550550
klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
551-
klass._STRING_ESCAPES_NOOP = set(klass.STRING_ESCAPES_NOOP)
551+
klass._ESCAPE_FOLLOW_CHARS = set(klass.ESCAPE_FOLLOW_CHARS)
552552
klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
553553
klass._COMMENTS = {
554554
**dict(
@@ -600,7 +600,7 @@ def _quotes_to_format(
600600
tokens_preceding_hint={
601601
_TOKEN_TYPE_TO_INDEX[v] for v in klass.TOKENS_PRECEDING_HINT
602602
},
603-
string_escapes_noop=klass._STRING_ESCAPES_NOOP,
603+
escape_follow_chars=klass._ESCAPE_FOLLOW_CHARS,
604604
)
605605
token_types = RsTokenTypeSettings(
606606
bit_string=_TOKEN_TYPE_TO_INDEX[TokenType.BIT_STRING],
@@ -670,7 +670,7 @@ class Tokenizer(metaclass=_Tokenizer):
670670
QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
671671
STRING_ESCAPES = ["'"]
672672
VAR_SINGLE_TOKENS: t.Set[str] = set()
673-
STRING_ESCAPES_NOOP: t.List[str] = []
673+
ESCAPE_FOLLOW_CHARS: t.List[str] = []
674674

675675
# The strings in this list can always be used as escapes, regardless of the surrounding
676676
# identifier delimiters. By default, the closing delimiter is assumed to also act as an
@@ -701,7 +701,7 @@ class Tokenizer(metaclass=_Tokenizer):
701701
_STRING_ESCAPES: t.Set[str] = set()
702702
_KEYWORD_TRIE: t.Dict = {}
703703
_RS_TOKENIZER: t.Optional[t.Any] = None
704-
_STRING_ESCAPES_NOOP: t.Set[str] = set()
704+
_ESCAPE_FOLLOW_CHARS: t.Set[str] = set()
705705

706706
KEYWORDS: t.Dict[str, TokenType] = {
707707
**{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
@@ -1514,19 +1514,21 @@ def _extract_string(
15141514
text += unescaped_sequence
15151515
continue
15161516

1517+
is_valid_custom_escape = (
1518+
self.ESCAPE_FOLLOW_CHARS
1519+
and self._char == "\\"
1520+
and self._peek not in self.ESCAPE_FOLLOW_CHARS
1521+
)
1522+
15171523
if (
15181524
(self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS or not raw_string)
15191525
and self._char in escapes
1520-
and (
1521-
self._peek == delimiter
1522-
or self._peek in escapes
1523-
or self._peek in self.STRING_ESCAPES_NOOP
1524-
)
1526+
and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
15251527
and (self._char not in self._QUOTES or self._char == self._peek)
15261528
):
15271529
if self._peek == delimiter:
15281530
text += self._peek
1529-
elif self._peek in self.STRING_ESCAPES_NOOP and self._char != self._peek:
1531+
elif is_valid_custom_escape and self._char != self._peek:
15301532
text += self._peek
15311533
else:
15321534
text += self._char + self._peek
(NOTE: the file-name header for this changed file was lost in extraction — it is the sixth changed file, a tokenizer-settings JSON fixture, not part of sqlglot/tokens.py above)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"white_space":{"\n":55,"\t":54,"\r":55," ":54},"single_tokens":{"\"":320,",":6,".":7,"[":2,"*":14,":":10,"]":3,"'":320,"(":0,")":1,"?":311,"-":8,"@":47,"$":46},"keywords":{"..":7},"numeric_literals":{},"identifiers":{"\"":"\""},"identifier_escapes":["\\"],"string_escapes":["\\"],"quotes":{"'":"'"},"format_strings":{"N'":["'",70],"n'":["'",70]},"has_bit_strings":false,"has_hex_strings":false,"comments":{"{#":"#}","--":null,"/*":"*/"},"var_single_tokens":[],"commands":[237,341,205,234,324],"command_prefix_tokens":[13,197],"tokens_preceding_hint":[261,334,221,361],"heredoc_tag_is_identifier":false,"string_escapes_allowed_in_raw_strings":true,"nested_comments":true,"hint_start":"/*+","string_escapes_noop":[]}
1+
{"white_space":{"\n":55,"\t":54,"\r":55," ":54},"single_tokens":{"\"":320,",":6,".":7,"[":2,"*":14,":":10,"]":3,"'":320,"(":0,")":1,"?":311,"-":8,"@":47,"$":46},"keywords":{"..":7},"numeric_literals":{},"identifiers":{"\"":"\""},"identifier_escapes":["\\"],"string_escapes":["\\"],"quotes":{"'":"'"},"format_strings":{"N'":["'",70],"n'":["'",70]},"has_bit_strings":false,"has_hex_strings":false,"comments":{"{#":"#}","--":null,"/*":"*/"},"var_single_tokens":[],"commands":[237,341,205,234,324],"command_prefix_tokens":[13,197],"tokens_preceding_hint":[261,334,221,361],"heredoc_tag_is_identifier":false,"string_escapes_allowed_in_raw_strings":true,"nested_comments":true,"hint_start":"/*+","escape_follow_chars":[]}

sqlglotrs/src/settings.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ pub struct TokenizerSettings {
104104
pub string_escapes_allowed_in_raw_strings: bool,
105105
pub nested_comments: bool,
106106
pub hint_start: String,
107-
pub string_escapes_noop: HashSet<char>,
107+
pub escape_follow_chars: HashSet<char>,
108108
}
109109

110110
#[pymethods]
@@ -131,7 +131,7 @@ impl TokenizerSettings {
131131
string_escapes_allowed_in_raw_strings: bool,
132132
nested_comments: bool,
133133
hint_start: String,
134-
string_escapes_noop: HashSet<String>,
134+
escape_follow_chars: HashSet<String>,
135135
) -> Self {
136136
let to_char = |v: &String| {
137137
if v.len() == 1 {
@@ -160,7 +160,7 @@ impl TokenizerSettings {
160160
identifier_escapes.iter().map(&to_char).collect();
161161

162162
let string_escapes_native: HashSet<char> = string_escapes.iter().map(&to_char).collect();
163-
let string_escapes_noop_native: HashSet<char> = string_escapes_noop.iter().map(&to_char).collect();
163+
let escape_follow_chars_native: HashSet<char> = escape_follow_chars.iter().map(&to_char).collect();
164164

165165
let var_single_tokens_native: HashSet<char> =
166166
var_single_tokens.iter().map(&to_char).collect();
@@ -186,7 +186,7 @@ impl TokenizerSettings {
186186
string_escapes_allowed_in_raw_strings,
187187
nested_comments,
188188
hint_start,
189-
string_escapes_noop: string_escapes_noop_native,
189+
escape_follow_chars: escape_follow_chars_native,
190190
};
191191

192192
#[cfg(feature = "profiling")]

sqlglotrs/src/tokenizer.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -665,14 +665,18 @@ impl<'a> TokenizerState<'a> {
665665
{
666666
let peek_char_str = self.peek_char.to_string();
667667
let equal_delimiter = delimiter == peek_char_str;
668-
668+
let is_valid_custom_escape =
669+
self.current_char == '\\'
670+
&& !self.settings.escape_follow_chars.is_empty()
671+
&& !self.settings.escape_follow_chars.contains(&self.peek_char);
672+
669673
if equal_delimiter
670674
|| escapes.contains(&self.peek_char)
671-
|| self.settings.string_escapes_noop.contains(&self.peek_char)
675+
|| is_valid_custom_escape
672676
{
673677
if equal_delimiter {
674678
text.push(self.peek_char);
675-
} else if self.settings.string_escapes_noop.contains(&self.peek_char)
679+
} else if is_valid_custom_escape
676680
&& self.current_char != self.peek_char
677681
{
678682
text.push(self.peek_char);

tests/dialects/test_mysql.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -432,17 +432,21 @@ def test_escape(self):
432432
},
433433
)
434434

435-
self.validate_all(
435+
self.validate_identity(
436436
r"'\"'",
437-
write={
438-
"mysql": """\'"\'""",
439-
},
437+
"""\'"\'""",
440438
)
441-
self.validate_all(
439+
self.validate_identity(
442440
"'\\\\\"a'",
443-
write={
444-
"mysql": "'\\\\\"a'",
445-
},
441+
"'\\\\\"a'",
442+
)
443+
self.validate_identity(
444+
"'\t'",
445+
"'\\t'",
446+
)
447+
self.validate_identity(
448+
"'\j'",
449+
"'j'",
446450
)
447451

448452
def test_introducers(self):

0 commit comments

Comments (0)