Skip to content

Commit 581933d

Browse files
authored
Support for character literals (#147)
* Support for character literals * Catch integer overflow for hex unicode literals
1 parent 9f12bb1 commit 581933d

File tree

2 files changed

+76
-0
lines changed

2 files changed

+76
-0
lines changed

basilisp/reader.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
# Pre-compiled patterns used by the reader. Patterns containing regex escapes
# are written as raw strings so Python does not treat `\s` / `\w` as (invalid)
# string escape sequences.
whitespace_chars = re.compile(r'[\s,]')  # a comma is matched alongside whitespace
newline_chars = re.compile('(\r\n|\r|\n)')
fn_macro_args = re.compile('(%)(&|[0-9])?')
# Body of a `u`-prefixed character literal; group 1 is the text after the `u`.
unicode_char = re.compile(r'u(\w+)')
3132

3233
# Type alias: a mapping from string keys to symbols.
# NOTE(review): presumably tracks generated symbol names while reading — confirm against usage.
GenSymEnvironment = Dict[str, symbol.Symbol]
3334
# Type alias: a callable that takes a symbol and returns a symbol.
Resolver = Callable[[symbol.Symbol], symbol.Symbol]
@@ -719,6 +720,56 @@ def _read_deref(ctx: ReaderContext) -> LispForm:
719720
return llist.l(_DEREF, next_form)
720721

721722

723+
_SPECIAL_CHARS = {
724+
'newline': '\n',
725+
'space': ' ',
726+
'tab': '\t',
727+
'formfeed': '\f',
728+
'backspace': '\b',
729+
'return': '\r'
730+
}
731+
732+
733+
def _read_character(ctx: ReaderContext) -> str:
    """Read a character literal from the input stream.

    Character literals may appear as:
      - \\a \\b \\c etc will yield 'a', 'b', and 'c' respectively

      - \\newline, \\space, \\tab, \\formfeed, \\backspace, \\return yield
        the named characters

      - \\uXXXX yields the Unicode character corresponding to the code
        point named by the hex digits XXXX

    The literal runs from the leading backslash until the reader yields
    either an empty token or a token matching `whitespace_chars`; that
    terminating token is consumed.

    Returns the character as a one-character string.

    Raises SyntaxError if the text after the backslash is longer than one
    character and is neither a named character nor a valid `u`-prefixed
    hex code point."""
    reader = ctx.reader
    start = reader.advance()
    assert start == "\\"

    # Accumulate everything up to the terminating token.
    s: List[str] = []
    while True:
        token = reader.advance()
        if token == '' or whitespace_chars.match(token):
            break
        s.append(token)

    # NOTE(review): if the backslash is immediately followed by the
    # terminating token, `char` is '' and is returned as-is below --
    # confirm whether that case should be a SyntaxError instead.
    char = ''.join(s)
    special = _SPECIAL_CHARS.get(char, None)
    if special is not None:
        return special

    # fullmatch (rather than match) so that trailing non-word characters are
    # not silently discarded; such tokens fall through to the length check.
    match = unicode_char.fullmatch(char)
    if match is not None:
        try:
            return chr(int(f"0x{match.group(1)}", 16))
        except (ValueError, OverflowError):
            # `char` already starts with the 'u' prefix, so it is not repeated
            # in the message.
            raise SyntaxError(f"Unsupported character \\{char}") from None

    if len(char) > 1:
        raise SyntaxError(f"Unsupported character \\{char}")

    return char
771+
772+
722773
def _read_regex(ctx: ReaderContext) -> Pattern:
723774
"""Read a regex reader macro from the input stream."""
724775
s = _read_str(ctx)
@@ -823,6 +874,8 @@ def _read_next(ctx: ReaderContext) -> LispForm: # noqa: C901
823874
return _read_str(ctx)
824875
elif token == "'":
825876
return _read_quoted(ctx)
877+
elif token == '\\':
878+
return _read_character(ctx)
826879
elif ns_name_chars.match(token):
827880
return _read_sym(ctx)
828881
elif token == '#':

tests/reader_test.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -673,6 +673,29 @@ def test_deref():
673673
assert read_str_first("@(atom {})") == llist.l(reader._DEREF, llist.l(sym.symbol('atom'), lmap.Map.empty()))
674674

675675

676+
def test_character_literal():
    """Character literals read as one-character strings; malformed ones raise."""
    # (source text, expected character) pairs: plain characters, a hex code
    # point, and each of the named characters.
    valid_cases = [
        ('\\a', "a"),
        ('\\Ω', "Ω"),
        ('\\u03A9', "Ω"),
        ('\\space', " "),
        ('\\newline', "\n"),
        ('\\tab', "\t"),
        ('\\backspace', "\b"),
        ('\\formfeed', "\f"),
        ('\\return', "\r"),
    ]
    for source, expected in valid_cases:
        assert expected == read_str_first(source)

    # Non-hex digits, an out-of-range code point, and an unknown name.
    invalid_sources = ('\\u03A9zzz', '\\uFFFFFFFF', '\\blah')
    for source in invalid_sources:
        with pytest.raises(reader.SyntaxError):
            read_str_first(source)
697+
698+
676699
def test_regex_reader_literal():
677700
assert read_str_first('#"hi"') == langutil.regex_from_str("hi")
678701
assert read_str_first('#"\s"') == langutil.regex_from_str(r"\s")

0 commit comments

Comments
 (0)