Skip to content

Commit f483458

Browse files
authored
Read escape sequences in string literals; read char literals inside other forms (#192)
* Character literals can now be read in other forms; strings can contain escape sequences * Disallow arbitrary escape sequences in non-regex string literals
1 parent 2b1936c commit f483458

File tree

2 files changed

+64
-16
lines changed

2 files changed

+64
-16
lines changed

basilisp/reader.py

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from basilisp.util import Maybe
2626

2727
ns_name_chars = re.compile(r'\w|-|\+|\*|\?|/|\=|\\|!|&|%|>|<')
28+
alphanumeric_chars = re.compile(r'\w')
2829
begin_num_chars = re.compile(r'[0-9\-]')
2930
num_chars = re.compile('[0-9]')
3031
whitespace_chars = re.compile(r'[\s,]')
@@ -534,23 +535,41 @@ def _read_num(ctx: ReaderContext) -> MaybeNumber: # noqa: C901 # pylint: disab
534535
return int(s)
535536

536537

# Maps the character that follows a backslash inside a string literal to the
# character that escape sequence denotes.
_STR_ESCAPE_CHARS = {
    '"': '"',
    '\\': '\\',
    'a': '\a',
    'b': '\b',
    'f': '\f',
    'n': '\n',
    'r': '\r',
    't': '\t',
    'v': '\v'
}


def _read_str(ctx: ReaderContext, allow_arbitrary_escapes: bool = False) -> str:
    """Return a string from the input stream.

    Characters are pulled one at a time with ``ctx.reader.next_token()``
    until an unescaped double quote is found. A backslash followed by one of
    the keys of ``_STR_ESCAPE_CHARS`` is replaced by the mapped character.

    If allow_arbitrary_escapes is True, do not throw a SyntaxError if an
    unknown escape sequence is encountered; instead the backslash and the
    following character are kept verbatim (this is what regex literals need).

    Raises SyntaxError if the input ends before the closing quote, or if an
    unknown escape sequence is found while allow_arbitrary_escapes is False.
    """
    s: List[str] = []
    reader = ctx.reader
    while True:
        token = reader.next_token()
        if token == '':
            raise SyntaxError("Unexpected EOF in string")

        if token == "\\":
            token = reader.next_token()
            if token == '':
                # A trailing backslash is a truncated string, not an
                # "unknown escape"; report it as such.
                raise SyntaxError("Unexpected EOF in string")

            escape_char = _STR_ESCAPE_CHARS.get(token)
            if escape_char is not None:
                s.append(escape_char)
                continue

            if not allow_arbitrary_escapes:
                # The f-prefix matters: without it the message would contain
                # the literal text "{token}" rather than the bad character.
                raise SyntaxError(f"Unknown escape sequence: \\{token}")

            # Unknown escape kept as-is. The token cannot be a double quote
            # here because '"' is a known escape handled above.
            s.append("\\")
            s.append(token)
            continue

        if token == '"':
            # Step past the closing quote before handing back the contents.
            reader.next_token()
            return ''.join(s)

        s.append(token)
@@ -833,11 +852,14 @@ def _read_character(ctx: ReaderContext) -> str:
833852

834853
s: List[str] = []
835854
reader = ctx.reader
855+
token = reader.peek()
836856
while True:
837-
token = reader.advance()
838857
if token == '' or whitespace_chars.match(token):
839858
break
859+
if not alphanumeric_chars.match(token):
860+
break
840861
s.append(token)
862+
token = reader.next_token()
841863

842864
char = ''.join(s)
843865
special = _SPECIAL_CHARS.get(char, None)
@@ -852,14 +874,14 @@ def _read_character(ctx: ReaderContext) -> str:
852874
raise SyntaxError(f"Unsupported character \\u{char}") from None
853875

854876
if len(char) > 1:
855-
raise SyntaxError(f"Unsupportred character \\{char}")
877+
raise SyntaxError(f"Unsupported character \\{char}")
856878

857879
return char
858880

859881

860882
def _read_regex(ctx: ReaderContext) -> Pattern:
861883
"""Read a regex reader macro from the input stream."""
862-
s = _read_str(ctx)
884+
s = _read_str(ctx, allow_arbitrary_escapes=True)
863885
try:
864886
return langutil.regex_from_str(s)
865887
except re.error:

tests/reader_test.py

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -291,12 +291,28 @@ def test_symbol():
291291

292292

293293
def test_str():
294-
assert read_str_first('""') == ''
295-
assert read_str_first('"Regular string"') == 'Regular string'
296-
assert read_str_first(
297-
'"String with \'inner string\'"') == "String with 'inner string'"
298-
assert read_str_first(
299-
r'"String with \"inner string\""') == 'String with "inner string"'
294+
assert '' == read_str_first('""')
295+
296+
assert "\"" == read_str_first(r'"\""')
297+
assert "\\" == read_str_first(r'"\\"')
298+
assert "\a" == read_str_first(r'"\a"')
299+
assert "\b" == read_str_first(r'"\b"')
300+
assert "\f" == read_str_first(r'"\f"')
301+
assert "\n" == read_str_first(r'"\n"')
302+
assert "\r" == read_str_first(r'"\r"')
303+
assert "\t" == read_str_first(r'"\t"')
304+
assert "\v" == read_str_first(r'"\v"')
305+
306+
with pytest.raises(reader.SyntaxError):
307+
read_str_first(r'"\q"')
308+
309+
assert "Hello,\nmy name is\tChris." == read_str_first(r'"Hello,\nmy name is\tChris."')
310+
311+
assert 'Regular string' == read_str_first('"Regular string"')
312+
assert "String with 'inner string'" == read_str_first(
313+
'"String with \'inner string\'"')
314+
assert 'String with "inner string"' == read_str_first(
315+
r'"String with \"inner string\""')
300316

301317
with pytest.raises(reader.SyntaxError):
302318
read_str_first('"Start of a string')
@@ -728,6 +744,16 @@ def test_character_literal():
728744
assert "\f" == read_str_first('\\formfeed')
729745
assert "\r" == read_str_first('\\return')
730746

747+
assert vec.v("a") == read_str_first('[\\a]')
748+
assert vec.v("Ω") == read_str_first('[\\Ω]')
749+
750+
assert llist.l(sym.symbol("str"), "Ω") == read_str_first('(str \\u03A9)')
751+
752+
assert vec.v(" ") == read_str_first('[\\space]')
753+
assert vec.v("\n") == read_str_first('[\\newline]')
754+
assert vec.v("\t") == read_str_first('[\\tab]')
755+
assert llist.l(sym.symbol("str"), "\b", "\f", "\r") == read_str_first('(str \\backspace \\formfeed \\return)')
756+
731757
with pytest.raises(reader.SyntaxError):
732758
read_str_first('\\u03A9zzz')
733759

0 commit comments

Comments
 (0)