Support for character literals (#147)

chrisrink10 · web-flow · commit 581933d81e5a · 2018-09-05T11:05:09.000-04:00
* Support for character literals

* Catch integer overflow for hex unicode literals
diff --git a/basilisp/reader.py b/basilisp/reader.py
@@ -28,6 +28,7 @@
 whitespace_chars = re.compile('[\s,]')
 newline_chars = re.compile('(\r\n|\r|\n)')
 fn_macro_args = re.compile('(%)(&|[0-9])?')
+unicode_char = re.compile(r'u(\w+)')  # "u" followed by code point digits
 
 GenSymEnvironment = Dict[str, symbol.Symbol]
 Resolver = Callable[[symbol.Symbol], symbol.Symbol]
@@ -719,6 +720,56 @@ def _read_deref(ctx: ReaderContext) -> LispForm:
     return llist.l(_DEREF, next_form)
 
 
+# Names accepted after the backslash of a character literal (as in
+# \newline), each mapped to the single character it denotes.
+_SPECIAL_CHARS = {
+    'backspace': '\b',
+    'formfeed': '\f',
+    'newline': '\n', 'return': '\r',
+    'space': ' ', 'tab': '\t',
+}
+
+
+def _read_character(ctx: ReaderContext) -> str:
+    """Read a character literal from the input stream.
+
+    Everything after the backslash up to whitespace or end of input is read:
+      - \\a \\b \\c etc. yield 'a', 'b', and 'c' respectively
+      - \\newline, \\space, \\tab, \\formfeed, \\backspace, \\return yield
+        the named characters
+      - \\uXXXX yields the Unicode character whose code point is named
+        by the hex digits XXXX
+
+    Raise SyntaxError for any other multi-character literal or for a
+    \\u literal that does not name a valid hexadecimal code point."""
+    reader = ctx.reader
+    start = reader.advance()
+    assert start == "\\"
+
+    s: List[str] = []
+    while True:
+        token = reader.advance()
+        if token == '' or whitespace_chars.match(token):
+            break
+        s.append(token)
+
+    char = ''.join(s)
+    if char in _SPECIAL_CHARS:
+        return _SPECIAL_CHARS[char]
+
+    match = unicode_char.fullmatch(char)  # no trailing junk, e.g. \u03A9-x
+    if match is not None:
+        try:
+            return chr(int(f"0x{match.group(1)}", 16))
+        except (ValueError, OverflowError):
+            raise SyntaxError(f"Unsupported character \\{char}") from None
+
+    if len(char) > 1:
+        raise SyntaxError(f"Unsupported character \\{char}")
+
+    return char
+
+
 def _read_regex(ctx: ReaderContext) -> Pattern:
     """Read a regex reader macro from the input stream."""
     s = _read_str(ctx)
@@ -823,6 +874,8 @@ def _read_next(ctx: ReaderContext) -> LispForm:  # noqa: C901
         return _read_str(ctx)
     elif token == "'":
         return _read_quoted(ctx)
+    elif token == '\\':
+        return _read_character(ctx)
     elif ns_name_chars.match(token):
         return _read_sym(ctx)
     elif token == '#':
diff --git a/tests/reader_test.py b/tests/reader_test.py
@@ -673,6 +673,29 @@ def test_deref():
     assert read_str_first("@(atom {})") == llist.l(reader._DEREF, llist.l(sym.symbol('atom'), lmap.Map.empty()))
 
 
+def test_character_literal():
+    """Check that character literals are read as one-character strings."""
+    expected_by_literal = {
+        '\\a': "a",
+        '\\Ω': "Ω",
+        '\\u03A9': "Ω",
+        '\\space': " ",
+        '\\newline': "\n",
+        '\\tab': "\t",
+        '\\backspace': "\b",
+        '\\formfeed': "\f",
+        '\\return': "\r",
+    }
+    for literal, expected in expected_by_literal.items():
+        assert expected == read_str_first(literal)
+
+    # Trailing junk after hex digits, an out-of-range code point, and an
+    # unknown character name must all be rejected by the reader.
+    for invalid in ('\\u03A9zzz', '\\uFFFFFFFF', '\\blah'):
+        with pytest.raises(reader.SyntaxError):
+            read_str_first(invalid)
+
+
 def test_regex_reader_literal():
     assert read_str_first('#"hi"') == langutil.regex_from_str("hi")
     assert read_str_first('#"\s"') == langutil.regex_from_str(r"\s")