Skip to content

Commit 3a7a5ba

Browse files
klardotsh authored and jepler committed
py: Implement partial PEP-498 (f-string) support
This implements (most of) the PEP-498 spec for f-strings, with two exceptions:

- raw f-strings (`fr` or `rf` prefixes) raise `NotImplementedError`
- one special corner case does not function as specified in the PEP (more on that in a moment)

This is implemented in the core as a syntax translation, brute-forcing all f-strings to run through `String.format`. For example, the statement `x='world'; print(f'hello {x}')` gets translated *at a syntax level* (injected into the lexer) to `x='world'; print('hello {}'.format(x))`. While this may lead to weird column results in tracebacks, it seemed like the fastest, most efficient, and *likely* most RAM-friendly option, despite being implemented under the hood with a completely separate `vstr_t`.

Since [string concatenation of adjacent literals is implemented in the lexer](micropython@534b7c3), two side effects emerge:

- All strings with at least one f-string portion are concatenated into a single literal which *must* be run through `String.format()` wholesale, and:
- Concatenating a raw string that contains interpolation characters with an f-string will cause `IndexError`/`KeyError`, which is both different from CPython *and* different from the corner case mentioned in the PEP (which gave the following example):

```python
x = 10
y = 'hi'
assert ('a' 'b' f'{x}' '{c}' f'str<{y:^4}>' 'd' 'e') == 'ab10{c}str< hi >de'
```

The above-linked commit detailed a pretty solid case for leaving string concatenation in the lexer rather than putting it in the parser, and undoing that decision would likely be disproportionately costly on resources for the sake of a probably-low-impact corner case. An alternative to become compliant with this corner case of the PEP would be to revert to string concatenation in the parser *only when an f-string is part of concatenation*, though I've done no investigation on the difficulty or costs of doing this.

A decent set of tests is included.
I've manually tested this on the `unix` port on Linux and on a Feather M4 Express (`atmel-samd`) and things seem sane.
1 parent 83d5da9 commit 3a7a5ba

File tree

4 files changed

+301
-14
lines changed

4 files changed

+301
-14
lines changed

py/lexer.c

Lines changed: 127 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
6464
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
6565
}
6666

67+
// Return true when the lexer's current character (lex->chr0) equals any one
// of c1, c2, c3 or c4. Four-way counterpart of is_char_or3, added so that the
// 'f' string prefix can be tested alongside 'r', 'u' and 'b'.
STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) {
68+
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4;
69+
}
70+
6771
// Return true when the first look-ahead character (lex->chr1, i.e. the one
// immediately after the current character) equals c.
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
6872
return lex->chr1 == c;
6973
}
@@ -107,7 +111,9 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
107111

108112
// Decide whether the lexer is positioned at the start of a string or bytes
// literal. Accepted openings are:
//   - a bare quote character (' or ");
//   - a one-letter prefix followed by a quote: r, u, b, and (with this change) f;
//   - a two-letter prefix followed by a quote: rf / fr (new) and rb / br.
// NOTE(review): is_char_and() and is_char_following_following_or() are not
// visible in this hunk; presumably they test chr0+chr1 and chr2 respectively
// -- confirm against the full file.
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
109113
return is_char_or(lex, '\'', '\"')
110-
|| (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
114+
|| (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"'))
115+
|| ((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r'))
116+
&& is_char_following_following_or(lex, '\'', '\"'))
111117
|| ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
112118
&& is_char_following_following_or(lex, '\'', '\"'));
113119
}
@@ -121,6 +127,37 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
121127
return is_head_of_identifier(lex) || is_digit(lex);
122128
}
123129

130+
// Switch the lexer's three-character look-ahead window (chr0..chr2) between
// the real reader stream and the injected f-string postfix held in
// lex->vstr_postfix.
//
// The direction is selected by lex->vstr_postfix_processing, which the caller
// must set BEFORE calling:
//   - true:  park the reader's look-ahead in the backup bank (chr3..chr5) and
//            start feeding characters from vstr_postfix instead;
//   - false: restore the reader's look-ahead from the backup bank and discard
//            the postfix buffer.
STATIC void swap_char_banks(mp_lexer_t *lex) {
131+
if (lex->vstr_postfix_processing) {
132+
unichar h0, h1, h2;
133+
134+
// remember the reader's pending characters before they are overwritten
h0 = lex->chr0;
135+
h1 = lex->chr1;
136+
h2 = lex->chr2;
137+
138+
// prime the look-ahead from the postfix buffer; slots past its end become 0
lex->chr0 = lex->vstr_postfix.len > 0 ? lex->vstr_postfix.buf[0] : 0;
139+
lex->chr1 = lex->vstr_postfix.len > 1 ? lex->vstr_postfix.buf[1] : 0;
140+
lex->chr2 = lex->vstr_postfix.len > 2 ? lex->vstr_postfix.buf[2] : 0;
141+
// park the reader's characters in the backup bank
lex->chr3 = h0;
142+
lex->chr4 = h1;
143+
lex->chr5 = h2;
144+
145+
// index of the next postfix byte to load: the number already consumed above (at most 3)
lex->vstr_postfix_idx = lex->vstr_postfix.len > 2 ? 3 : lex->vstr_postfix.len;
146+
} else {
147+
// blindly reset to the "backup" bank when done postfix processing
148+
// this restores control to the mp_reader
149+
lex->chr0 = lex->chr3;
150+
lex->chr1 = lex->chr4;
151+
lex->chr2 = lex->chr5;
152+
lex->chr3 = 0;
153+
lex->chr4 = 0;
154+
lex->chr5 = 0;
155+
156+
// empty the postfix buffer and rewind its read index
vstr_reset(&lex->vstr_postfix);
157+
lex->vstr_postfix_idx = 0;
158+
}
159+
}
160+
124161
STATIC void next_char(mp_lexer_t *lex) {
125162
if (lex->chr0 == '\n') {
126163
// a new line
@@ -136,7 +173,16 @@ STATIC void next_char(mp_lexer_t *lex) {
136173

137174
lex->chr0 = lex->chr1;
138175
lex->chr1 = lex->chr2;
139-
lex->chr2 = lex->reader.readbyte(lex->reader.data);
176+
177+
if (lex->vstr_postfix_processing) {
178+
if (lex->vstr_postfix_idx == lex->vstr_postfix.len) {
179+
lex->chr2 = '\0';
180+
} else {
181+
lex->chr2 = lex->vstr_postfix.buf[lex->vstr_postfix_idx++];
182+
}
183+
} else {
184+
lex->chr2 = lex->reader.readbyte(lex->reader.data);
185+
}
140186

141187
if (lex->chr1 == '\r') {
142188
// CR is a new line, converted to LF
@@ -151,6 +197,11 @@ STATIC void next_char(mp_lexer_t *lex) {
151197
if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
152198
lex->chr2 = '\n';
153199
}
200+
201+
if (lex->vstr_postfix_processing && lex->chr0 == '\0') {
202+
lex->vstr_postfix_processing = false;
203+
swap_char_banks(lex);
204+
}
154205
}
155206

156207
STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
@@ -270,7 +321,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
270321
return true;
271322
}
272323

273-
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
324+
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) {
274325
// get first quoting character
275326
char quote_char = '\'';
276327
if (is_char(lex, '\"')) {
@@ -291,15 +342,67 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
291342
}
292343

293344
size_t n_closing = 0;
345+
bool in_expression = false;
346+
bool expression_eat = true;
347+
294348
while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
295349
if (is_char(lex, quote_char)) {
296350
n_closing += 1;
297351
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
298352
} else {
299353
n_closing = 0;
354+
if (is_fstring && is_char(lex, '{')) {
355+
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
356+
in_expression = !in_expression;
357+
expression_eat = in_expression;
358+
359+
if (lex->vstr_postfix.len == 0) {
360+
vstr_add_str(&lex->vstr_postfix, ".format(");
361+
}
362+
363+
next_char(lex);
364+
continue;
365+
}
366+
367+
if (is_fstring && is_char(lex, '}')) {
368+
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
369+
370+
if (in_expression) {
371+
in_expression = false;
372+
vstr_add_char(&lex->vstr_postfix, ',');
373+
}
374+
375+
next_char(lex);
376+
continue;
377+
}
378+
379+
if (in_expression) {
380+
// throw errors for illegal chars inside f-string expressions
381+
if (is_char(lex, '#')) {
382+
lex->tok_kind = MP_TOKEN_FSTRING_COMMENT;
383+
return;
384+
} else if (is_char(lex, '\\')) {
385+
lex->tok_kind = MP_TOKEN_FSTRING_BACKSLASH;
386+
return;
387+
} else if (is_char(lex, ':')) {
388+
expression_eat = false;
389+
}
390+
391+
unichar c = CUR_CHAR(lex);
392+
if (expression_eat) {
393+
vstr_add_char(&lex->vstr_postfix, c);
394+
} else {
395+
vstr_add_char(&lex->vstr, c);
396+
}
397+
398+
next_char(lex);
399+
continue;
400+
}
401+
300402
if (is_char(lex, '\\')) {
301403
next_char(lex);
302404
unichar c = CUR_CHAR(lex);
405+
303406
if (is_raw) {
304407
// raw strings allow escaping of quotes, but the backslash is also emitted
305408
vstr_add_char(&lex->vstr, '\\');
@@ -430,6 +533,13 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
430533
}
431534

432535
void mp_lexer_to_next(mp_lexer_t *lex) {
536+
if (lex->vstr_postfix.len && !lex->vstr_postfix_processing) {
537+
// end format call injection
538+
vstr_add_char(&lex->vstr_postfix, ')');
539+
lex->vstr_postfix_processing = true;
540+
swap_char_banks(lex);
541+
}
542+
433543
// start new token text
434544
vstr_reset(&lex->vstr);
435545

@@ -485,6 +595,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
485595
do {
486596
// parse type codes
487597
bool is_raw = false;
598+
bool is_fstring = false;
488599
mp_token_kind_t kind = MP_TOKEN_STRING;
489600
int n_char = 0;
490601
if (is_char(lex, 'u')) {
@@ -503,6 +614,17 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
503614
kind = MP_TOKEN_BYTES;
504615
n_char = 2;
505616
}
617+
if (is_char_following(lex, 'f')) {
618+
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
619+
break;
620+
}
621+
} else if (is_char(lex, 'f')) {
622+
if (is_char_following(lex, 'r')) {
623+
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
624+
break;
625+
}
626+
n_char = 1;
627+
is_fstring = true;
506628
}
507629

508630
// Set or check token kind
@@ -522,13 +644,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
522644
}
523645

524646
// Parse the literal
525-
parse_string_literal(lex, is_raw);
647+
parse_string_literal(lex, is_raw, is_fstring);
526648

527649
// Skip whitespace so we can check if there's another string following
528650
skip_whitespace(lex, true);
529651

530652
} while (is_string_or_bytes(lex));
531-
532653
} else if (is_head_of_identifier(lex)) {
533654
lex->tok_kind = MP_TOKEN_NAME;
534655

@@ -682,6 +803,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
682803
lex->num_indent_level = 1;
683804
lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
684805
vstr_init(&lex->vstr, 32);
806+
vstr_init(&lex->vstr_postfix, 0);
685807

686808
// store sentinel for first indentation level
687809
lex->indent_level[0] = 0;

py/lexer.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@ typedef enum _mp_token_kind_t {
4444
MP_TOKEN_INVALID,
4545
MP_TOKEN_DEDENT_MISMATCH,
4646
MP_TOKEN_LONELY_STRING_OPEN,
47+
MP_TOKEN_FSTRING_BACKSLASH,
48+
MP_TOKEN_FSTRING_COMMENT,
49+
MP_TOKEN_FSTRING_UNCLOSED,
50+
MP_TOKEN_FSTRING_UNOPENED,
51+
MP_TOKEN_FSTRING_EMPTY_EXP,
52+
MP_TOKEN_FSTRING_RAW,
4753

4854
MP_TOKEN_NEWLINE,
4955
MP_TOKEN_INDENT,
@@ -150,6 +156,7 @@ typedef struct _mp_lexer_t {
150156
mp_reader_t reader; // stream source
151157

152158
unichar chr0, chr1, chr2; // current cached characters from source
159+
unichar chr3, chr4, chr5; // current cached characters from alt source
153160

154161
size_t line; // current source line
155162
size_t column; // current source column
@@ -165,6 +172,9 @@ typedef struct _mp_lexer_t {
165172
size_t tok_column; // token source column
166173
mp_token_kind_t tok_kind; // token kind
167174
vstr_t vstr; // token data
175+
vstr_t vstr_postfix; // postfix to apply to string
176+
bool vstr_postfix_processing;
177+
uint16_t vstr_postfix_idx;
168178
} mp_lexer_t;
169179

170180
mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader);

py/parse.c

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -924,6 +924,7 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
924924
backtrack = false;
925925
}
926926
for (; i < n; ++i) {
927+
//printf("--> inside for @L924\n");
927928
uint16_t kind = rule_arg[i] & RULE_ARG_KIND_MASK;
928929
if (kind == RULE_ARG_TOK) {
929930
if (lex->tok_kind == (rule_arg[i] & RULE_ARG_ARG_MASK)) {
@@ -1168,15 +1169,43 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
11681169
) {
11691170
syntax_error:;
11701171
mp_obj_t exc;
1171-
if (lex->tok_kind == MP_TOKEN_INDENT) {
1172-
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
1173-
translate("unexpected indent"));
1174-
} else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) {
1175-
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
1176-
translate("unindent does not match any outer indentation level"));
1177-
} else {
1178-
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1179-
translate("invalid syntax"));
1172+
switch(lex->tok_kind) {
1173+
case MP_TOKEN_INDENT:
1174+
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
1175+
translate("unexpected indent"));
1176+
break;
1177+
case MP_TOKEN_DEDENT_MISMATCH:
1178+
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
1179+
translate("unindent does not match any outer indentation level"));
1180+
break;
1181+
case MP_TOKEN_FSTRING_BACKSLASH:
1182+
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1183+
translate("f-string expression part cannot include a backslash"));
1184+
break;
1185+
case MP_TOKEN_FSTRING_COMMENT:
1186+
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1187+
translate("f-string expression part cannot include a '#'"));
1188+
break;
1189+
case MP_TOKEN_FSTRING_UNCLOSED:
1190+
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1191+
translate("f-string: expecting '}'"));
1192+
break;
1193+
case MP_TOKEN_FSTRING_UNOPENED:
1194+
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1195+
translate("f-string: single '}' is not allowed"));
1196+
break;
1197+
case MP_TOKEN_FSTRING_EMPTY_EXP:
1198+
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1199+
translate("f-string: empty expression not allowed"));
1200+
break;
1201+
case MP_TOKEN_FSTRING_RAW:
1202+
exc = mp_obj_new_exception_msg(&mp_type_NotImplementedError,
1203+
translate("raw f-strings are not implemented"));
1204+
break;
1205+
default:
1206+
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1207+
translate("invalid syntax"));
1208+
break;
11801209
}
11811210
// add traceback to give info about file name and location
11821211
// we don't have a 'block' name, so just pass the NULL qstr to indicate this

0 commit comments

Comments
 (0)