py: Implement partial PEP-498 (f-string) support

klardotsh · jepler · commit 3a7a5ba6860c · 2020-03-09T08:16:07.000-05:00
This implements (most of) the PEP-498 spec for f-strings, with two exceptions: - raw f-strings (`fr` or `rf` prefixes) raise `NotImplementedError` - one special corner case does not function as specified in the PEP (more on that in a moment) This is implemented in the core as a syntax translation, brute-forcing all f-strings to run through `str.format`. For example, the statement `x='world'; print(f'hello {x}')` gets translated *at a syntax level* (injected into the lexer) to `x='world'; print('hello {}'.format(x))`. While this may lead to weird column results in tracebacks, it seemed like the fastest, most efficient, and *likely* most RAM-friendly option, despite being implemented under the hood with a completely separate `vstr_t`. Since [string concatenation of adjacent literals is implemented in the lexer](micropython@534b7c3), two side effects emerge: - All strings with at least one f-string portion are concatenated into a single literal which *must* be run through `str.format()` wholesale, and: - Concatenation of a raw string with interpolation characters with an f-string will cause `IndexError`/`KeyError`, which is both different from CPython *and* different from the corner case mentioned in the PEP (which gave an example of the following:) ```python x = 10 y = 'hi' assert ('a' 'b' f'{x}' '{c}' f'str<{y:^4}>' 'd' 'e') == 'ab10{c}str< hi >de' ``` The above-linked commit detailed a pretty solid case for leaving string concatenation in the lexer rather than putting it in the parser, and undoing that decision would likely be disproportionately costly on resources for the sake of a probably-low-impact corner case. An alternative to become compliant with this corner case of the PEP would be to revert to string concatenation in the parser *only when an f-string is part of concatenation*, though I've done no investigation on the difficulty or costs of doing this. A decent set of tests is included. 
I've manually tested this on the `unix` port on Linux and on a Feather M4 Express (`atmel-samd`) and things seem sane.
diff --git a/py/lexer.c b/py/lexer.c
@@ -64,6 +64,10 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
     return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
 }
 
+STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) { // true when the current char (chr0) equals any of c1..c4
+    return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4;
+}
+
 STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
     return lex->chr1 == c;
 }
@@ -107,7 +111,9 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
 
 STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
     return is_char_or(lex, '\'', '\"')
-        || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
+        || (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"')) // single-letter prefix (r, u, b, and now f) followed by a quote
+        || ((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r')) // rf / fr prefix: recognised here, rejected later as MP_TOKEN_FSTRING_RAW
+            && is_char_following_following_or(lex, '\'', '\"'))
         || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
             && is_char_following_following_or(lex, '\'', '\"'));
 }
@@ -121,6 +127,37 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
     return is_head_of_identifier(lex) || is_digit(lex);
 }
 
+STATIC void swap_char_banks(mp_lexer_t *lex) { // switch the chr0..chr2 lookahead between the reader and vstr_postfix; callers set vstr_postfix_processing first
+    if (lex->vstr_postfix_processing) {
+        unichar h0, h1, h2;
+        // entering postfix mode: remember the lookahead already taken from the source
+        h0 = lex->chr0;
+        h1 = lex->chr1;
+        h2 = lex->chr2;
+        // preload the lookahead from the postfix buffer, using 0 for positions past its end
+        lex->chr0 = lex->vstr_postfix.len > 0 ? lex->vstr_postfix.buf[0] : 0;
+        lex->chr1 = lex->vstr_postfix.len > 1 ? lex->vstr_postfix.buf[1] : 0;
+        lex->chr2 = lex->vstr_postfix.len > 2 ? lex->vstr_postfix.buf[2] : 0;
+        lex->chr3 = h0; // chr3..chr5 act as the backup bank for the saved source lookahead
+        lex->chr4 = h1;
+        lex->chr5 = h2;
+        // index of the next postfix char for next_char to fetch: min(len, 3) chars are already loaded above
+        lex->vstr_postfix_idx = lex->vstr_postfix.len > 2 ? 3 : lex->vstr_postfix.len;
+    } else {
+        // blindly reset to the "backup" bank when done postfix processing
+        // this restores control to the mp_reader
+        lex->chr0 = lex->chr3;
+        lex->chr1 = lex->chr4;
+        lex->chr2 = lex->chr5;
+        lex->chr3 = 0;
+        lex->chr4 = 0;
+        lex->chr5 = 0;
+        // reset the postfix buffer and its read index so the next f-string starts from a clean state
+        vstr_reset(&lex->vstr_postfix);
+        lex->vstr_postfix_idx = 0;
+    }
+}
+
 STATIC void next_char(mp_lexer_t *lex) {
     if (lex->chr0 == '\n') {
         // a new line
@@ -136,7 +173,16 @@ STATIC void next_char(mp_lexer_t *lex) {
 
     lex->chr0 = lex->chr1;
     lex->chr1 = lex->chr2;
-    lex->chr2 = lex->reader.readbyte(lex->reader.data);
+
+    if (lex->vstr_postfix_processing) {
+        if (lex->vstr_postfix_idx == lex->vstr_postfix.len) {
+            lex->chr2 = '\0';
+        } else {
+            lex->chr2 = lex->vstr_postfix.buf[lex->vstr_postfix_idx++];
+        }
+    } else {
+        lex->chr2 = lex->reader.readbyte(lex->reader.data);
+    }
 
     if (lex->chr1 == '\r') {
         // CR is a new line, converted to LF
@@ -151,6 +197,11 @@ STATIC void next_char(mp_lexer_t *lex) {
     if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
         lex->chr2 = '\n';
     }
+
+    if (lex->vstr_postfix_processing && lex->chr0 == '\0') {
+        lex->vstr_postfix_processing = false;
+        swap_char_banks(lex);
+    }
 }
 
 STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
@@ -270,7 +321,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
     return true;
 }
 
-STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
+STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) {
     // get first quoting character
     char quote_char = '\'';
     if (is_char(lex, '\"')) {
@@ -291,15 +342,67 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
     }
 
     size_t n_closing = 0;
+    bool in_expression = false;
+    bool expression_eat = true;
+
     while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
         if (is_char(lex, quote_char)) {
             n_closing += 1;
             vstr_add_char(&lex->vstr, CUR_CHAR(lex));
         } else {
             n_closing = 0;
+            if (is_fstring && is_char(lex, '{')) {
+                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+                in_expression = !in_expression;
+                expression_eat = in_expression;
+
+                if (lex->vstr_postfix.len == 0) {
+                    vstr_add_str(&lex->vstr_postfix, ".format(");
+                }
+
+                next_char(lex);
+                continue;
+            }
+
+            if (is_fstring && is_char(lex, '}')) {
+                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+
+                if (in_expression) {
+                    in_expression = false;
+                    vstr_add_char(&lex->vstr_postfix, ',');
+                }
+
+                next_char(lex);
+                continue;
+            }
+
+            if (in_expression) {
+                // throw errors for illegal chars inside f-string expressions
+                if (is_char(lex, '#')) {
+                    lex->tok_kind = MP_TOKEN_FSTRING_COMMENT;
+                    return;
+                } else if (is_char(lex, '\\')) {
+                    lex->tok_kind = MP_TOKEN_FSTRING_BACKSLASH;
+                    return;
+                } else if (is_char(lex, ':')) {
+                    expression_eat = false;
+                }
+
+                unichar c = CUR_CHAR(lex);
+                if (expression_eat) {
+                    vstr_add_char(&lex->vstr_postfix, c);
+                } else {
+                    vstr_add_char(&lex->vstr, c);
+                }
+
+                next_char(lex);
+                continue;
+            }
+
             if (is_char(lex, '\\')) {
                 next_char(lex);
                 unichar c = CUR_CHAR(lex);
+
                 if (is_raw) {
                     // raw strings allow escaping of quotes, but the backslash is also emitted
                     vstr_add_char(&lex->vstr, '\\');
@@ -430,6 +533,13 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
 }
 
 void mp_lexer_to_next(mp_lexer_t *lex) {
+    if (lex->vstr_postfix.len && !lex->vstr_postfix_processing) {
+        // end format call injection
+        vstr_add_char(&lex->vstr_postfix, ')');
+        lex->vstr_postfix_processing = true;
+        swap_char_banks(lex);
+    }
+
     // start new token text
     vstr_reset(&lex->vstr);
 
@@ -485,6 +595,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
         do {
             // parse type codes
             bool is_raw = false;
+            bool is_fstring = false;
             mp_token_kind_t kind = MP_TOKEN_STRING;
             int n_char = 0;
             if (is_char(lex, 'u')) {
@@ -503,6 +614,17 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
                     kind = MP_TOKEN_BYTES;
                     n_char = 2;
                 }
+                if (is_char_following(lex, 'f')) {
+                    lex->tok_kind = MP_TOKEN_FSTRING_RAW;
+                    break;
+                }
+            } else if (is_char(lex, 'f')) {
+                if (is_char_following(lex, 'r')) {
+                    lex->tok_kind = MP_TOKEN_FSTRING_RAW;
+                    break;
+                }
+                n_char = 1;
+                is_fstring = true;
             }
 
             // Set or check token kind
@@ -522,13 +644,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
             }
 
             // Parse the literal
-            parse_string_literal(lex, is_raw);
+            parse_string_literal(lex, is_raw, is_fstring);
 
             // Skip whitespace so we can check if there's another string following
             skip_whitespace(lex, true);
 
         } while (is_string_or_bytes(lex));
-
     } else if (is_head_of_identifier(lex)) {
         lex->tok_kind = MP_TOKEN_NAME;
 
@@ -682,6 +803,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
     lex->num_indent_level = 1;
     lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
     vstr_init(&lex->vstr, 32);
+    vstr_init(&lex->vstr_postfix, 0);
 
     // store sentinel for first indentation level
     lex->indent_level[0] = 0;
diff --git a/py/lexer.h b/py/lexer.h
@@ -44,6 +44,12 @@ typedef enum _mp_token_kind_t {
     MP_TOKEN_INVALID,
     MP_TOKEN_DEDENT_MISMATCH,
     MP_TOKEN_LONELY_STRING_OPEN,
+    MP_TOKEN_FSTRING_BACKSLASH,
+    MP_TOKEN_FSTRING_COMMENT,
+    MP_TOKEN_FSTRING_UNCLOSED,
+    MP_TOKEN_FSTRING_UNOPENED,
+    MP_TOKEN_FSTRING_EMPTY_EXP,
+    MP_TOKEN_FSTRING_RAW,
 
     MP_TOKEN_NEWLINE,
     MP_TOKEN_INDENT,
@@ -150,6 +156,7 @@ typedef struct _mp_lexer_t {
     mp_reader_t reader;         // stream source
 
     unichar chr0, chr1, chr2;   // current cached characters from source
+    unichar chr3, chr4, chr5;   // current cached characters from alt source
 
     size_t line;                // current source line
     size_t column;              // current source column
@@ -165,6 +172,9 @@ typedef struct _mp_lexer_t {
     size_t tok_column;          // token source column
     mp_token_kind_t tok_kind;   // token kind
     vstr_t vstr;                // token data
+    vstr_t vstr_postfix;        // postfix to apply to string
+    bool vstr_postfix_processing;
+    uint16_t vstr_postfix_idx;
 } mp_lexer_t;
 
 mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader);
diff --git a/py/parse.c b/py/parse.c
@@ -924,6 +924,7 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
                     backtrack = false;
                 }
                 for (; i < n; ++i) {
+                    //printf("--> inside for @L924\n");
                     uint16_t kind = rule_arg[i] & RULE_ARG_KIND_MASK;
                     if (kind == RULE_ARG_TOK) {
                         if (lex->tok_kind == (rule_arg[i] & RULE_ARG_ARG_MASK)) {
@@ -1168,15 +1169,43 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
         ) {
     syntax_error:;
         mp_obj_t exc;
-        if (lex->tok_kind == MP_TOKEN_INDENT) {
-            exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
-                translate("unexpected indent"));
-        } else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) {
-            exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
-                translate("unindent does not match any outer indentation level"));
-        } else {
-            exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
-                translate("invalid syntax"));
+        switch(lex->tok_kind) {
+            case MP_TOKEN_INDENT:
+                exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
+                    translate("unexpected indent"));
+                break;
+            case MP_TOKEN_DEDENT_MISMATCH:
+                exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
+                    translate("unindent does not match any outer indentation level"));
+                break;
+            case MP_TOKEN_FSTRING_BACKSLASH:
+                exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
+                    translate("f-string expression part cannot include a backslash"));
+                break;
+            case MP_TOKEN_FSTRING_COMMENT:
+                exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
+                    translate("f-string expression part cannot include a '#'"));
+                break;
+            case MP_TOKEN_FSTRING_UNCLOSED:
+                exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
+                    translate("f-string: expecting '}'"));
+                break;
+            case MP_TOKEN_FSTRING_UNOPENED:
+                exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
+                    translate("f-string: single '}' is not allowed"));
+                break;
+            case MP_TOKEN_FSTRING_EMPTY_EXP:
+                exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
+                    translate("f-string: empty expression not allowed"));
+                break;
+            case MP_TOKEN_FSTRING_RAW:
+                exc = mp_obj_new_exception_msg(&mp_type_NotImplementedError,
+                    translate("raw f-strings are not implemented"));
+                break;
+            default:
+                exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
+                    translate("invalid syntax"));
+                break;
         }
         // add traceback to give info about file name and location
         // we don't have a 'block' name, so just pass the NULL qstr to indicate this
diff --git a/tests/basics/string_pep498_fstring.py b/tests/basics/string_pep498_fstring.py