Skip to content

Commit 4d1a440

Browse files
committed
first implementation of d-string
1 parent 61fc72a commit 4d1a440

File tree

5 files changed

+373
-33
lines changed

5 files changed

+373
-33
lines changed

Lib/test/test_dstring.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import unittest
2+
3+
4+
class DStringTestCase(unittest.TestCase):
5+
def assertAllRaise(self, exception_type, regex, error_strings):
6+
for str in error_strings:
7+
with self.subTest(str=str):
8+
with self.assertRaisesRegex(exception_type, regex) as cm:
9+
eval(str)
10+
# print("Testing expression:", repr(str))
11+
# print(repr(cm.exception))
12+
# print(repr(cm.exception.text))
13+
14+
def test_single_quote(self):
15+
exprs = [
16+
"d'hello'",
17+
'D"hello"',
18+
"d'hello\\nworld'",
19+
]
20+
self.assertAllRaise(SyntaxError, "d-string must be triple-quoted", exprs)
21+
22+
def test_empty_dstring(self):
23+
exprs = [
24+
"d''''''",
25+
'D""""""',
26+
]
27+
self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs)
28+
29+
def test_no_last_newline(self):
30+
exprs = [
31+
"d'''\nhello world'''",
32+
'D"""\nhello world"""',
33+
"df'''\nhello {42}'''",
34+
]
35+
self.assertAllRaise(SyntaxError, "d-string must end with an indent line", exprs)
36+
37+
def test_simple_dstring(self):
38+
self.assertEqual(eval('d"""\n hello world\n """'), "hello world\n")
39+
self.assertEqual(eval('d"""\n hello world\n """'), " hello world\n")
40+
self.assertEqual(eval('d"""\n hello world\n"""'), " hello world\n")
41+
self.assertEqual(eval('d"""\n hello world\\\n """'), " hello world")
42+
self.assertEqual(eval('dr"""\n hello world\\\n """'), " hello world\\\n")
43+
44+
45+
46+
if __name__ == '__main__':
47+
unittest.main()

Lib/tokenize.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ def _all_string_prefixes():
8686
# The valid string prefixes. Only contain the lower case versions,
8787
# and don't contain any permutations (include 'fr', but not
8888
# 'rf'). The various permutations will be generated.
89-
_valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'br', 'fr', 'tr']
89+
_valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'd', 'br', 'fr', 'tr',
90+
'df', 'dt', 'dr', 'dfr', 'dtr']
9091
# if we add binary f-strings, add: ['fb', 'fbr']
9192
result = {''}
9293
for prefix in _valid_string_prefixes:

Parser/action_helpers.c

Lines changed: 171 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1292,24 +1292,124 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq
12921292

12931293
// Fstring stuff
12941294

1295+
static int
1296+
unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start, const char *line_end,
1297+
int is_raw, Token* token)
1298+
{
1299+
if (is_raw || memchr(line_start, '\\', line_end - line_start) == NULL) {
1300+
return PyUnicodeWriter_WriteUTF8(w, line_start, line_end - line_start);
1301+
}
1302+
else {
1303+
PyObject *line = _PyPegen_decode_string(p, 1, line_start, line_end - line_start, token);
1304+
if (line == NULL || PyUnicodeWriter_WriteStr(w, line) < 0) {
1305+
Py_XDECREF(line);
1306+
return -1;
1307+
}
1308+
Py_DECREF(line);
1309+
}
1310+
return 0;
1311+
}
1312+
1313+
// Strip the common leading indentation from one constant part of a d-string
// and return the result as a new str object.
//
//   s / len      : UTF-8 bytes of this part (NUL-terminated; len == strlen(s)).
//   indent_char  : the indentation character (' ' or '\t') taken from the
//                  closing-quote line; dedented lines must start with it.
//   dedent_count : number of indent_char's to remove from each line.
//   is_raw       : nonzero for r-prefixed strings; skips escape decoding.
//   is_first     : nonzero for the first part of the d-string, which is
//                  required to begin with the mandatory leading newline.
//   constant     : the Constant node, used for error line numbers.
//   token        : token used for escape-decoding error locations.
//
// Returns a new reference, or NULL with an exception set (SyntaxError when a
// non-empty line lacks the required indentation).
static PyObject*
_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_char, Py_ssize_t dedent_count,
                            int is_raw, int is_first, expr_ty constant, Token* token)
{
    Py_ssize_t lineno = constant->lineno;
    const char *line_start = s;
    const char *s_end = s + len;

    PyUnicodeWriter *w = PyUnicodeWriter_Create(len);
    if (w == NULL) {
        return NULL;
    }
    if (is_first) {
        assert (line_start[0] == '\n');
        line_start++; // skip the first newline
    }
    else {
        // Example: df"""
        //     first part {param} second part
        //     next line
        //     """
        // We don't need to dedent the first line in the non-first parts:
        // it is the continuation of a line whose indent was already removed.
        const char *line_end = memchr(line_start, '\n', s_end - line_start);
        if (line_end) {
            line_end++; // include the newline
        }
        else {
            line_end = s_end;
        }
        if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
            PyUnicodeWriter_Discard(w);
            return NULL;
        }
        line_start = line_end;
    }

    // Walk the remaining lines, stripping up to dedent_count indent chars
    // from the front of each.
    while (line_start < s + len) {
        lineno++;

        // Count how much of the expected indentation this line actually has.
        Py_ssize_t i = 0;
        while (line_start + i < s_end && i < dedent_count && line_start[i] == indent_char) {
            i++;
        }

        if (line_start[i] == '\0') { // found an empty line without newline.
            break;
        }
        if (line_start[i] == '\n') { // found an empty line with newline.
            // Blank lines are kept but never require indentation.
            if (PyUnicodeWriter_WriteChar(w, '\n') < 0) {
                PyUnicodeWriter_Discard(w);
                return NULL;
            }
            line_start += i+1;
            continue;
        }
        if (i < dedent_count) { // found an invalid indent.
            // Non-blank line with content starting before the dedent column.
            assert(line_start[i] != indent_char);
            PyUnicodeWriter_Discard(w);
            RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i+1,
                "d-string line missing valid indentation");
            return NULL;
        }

        // found an indented line. let's dedent it.
        line_start += i;
        const char *line_end = memchr(line_start, '\n', s_end - line_start);
        if (line_end) {
            line_end++; // include the newline
        }
        else {
            line_end = s_end;
        }
        if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
            PyUnicodeWriter_Discard(w);
            return NULL;
        }
        line_start = line_end;
    }
    return PyUnicodeWriter_Finish(w);
}
1393+
12951394
static expr_ty
1296-
_PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* token) {
1395+
_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_char, Py_ssize_t dedent_count, expr_ty constant, Token* token) {
12971396
assert(PyUnicode_CheckExact(constant->v.Constant.value));
12981397

12991398
const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value);
13001399
if (bstr == NULL) {
13011400
return NULL;
13021401
}
1402+
is_raw = is_raw || strchr(bstr, '\\') == NULL;
13031403

1304-
size_t len;
1305-
if (strcmp(bstr, "{{") == 0 || strcmp(bstr, "}}") == 0) {
1306-
len = 1;
1307-
} else {
1308-
len = strlen(bstr);
1404+
PyObject *str = NULL;
1405+
if (dedent_count > 0) {
1406+
str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent_char, dedent_count,
1407+
is_raw, is_first, constant, token);
1408+
}
1409+
else {
1410+
str = _PyPegen_decode_string(p, is_raw, bstr, strlen(bstr), token);
13091411
}
13101412

1311-
is_raw = is_raw || strchr(bstr, '\\') == NULL;
1312-
PyObject *str = _PyPegen_decode_string(p, is_raw, bstr, len, token);
13131413
if (str == NULL) {
13141414
_Pypegen_raise_decode_error(p);
13151415
return NULL;
@@ -1340,12 +1440,74 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
13401440
return NULL;
13411441
}
13421442
int is_raw = strpbrk(quote_str, "rR") != NULL;
1443+
int is_dedent = strpbrk(quote_str, "dD") != NULL;
1444+
int indent_char = 0;
1445+
Py_ssize_t indent_count = 0;
13431446

13441447
asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena);
13451448
if (seq == NULL) {
13461449
return NULL;
13471450
}
13481451

1452+
if (is_dedent) {
1453+
expr_ty first_item = asdl_seq_GET(raw_expressions, 0);
1454+
if (first_item->kind != Constant_kind
1455+
|| PyUnicode_ReadChar(first_item->v.Constant.value, 0) != '\n') {
1456+
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
1457+
first_item,
1458+
"d-string must start with a newline"
1459+
);
1460+
return NULL;
1461+
}
1462+
1463+
expr_ty last_item = asdl_seq_GET(raw_expressions, n_items - 1);
1464+
if (last_item->kind != Constant_kind) {
1465+
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
1466+
last_item,
1467+
"d-string must end with an indent line"
1468+
);
1469+
return NULL;
1470+
}
1471+
1472+
Py_ssize_t blen;
1473+
const char *bstr = PyUnicode_AsUTF8AndSize(last_item->v.Constant.value, &blen);
1474+
if (bstr == NULL) {
1475+
return NULL;
1476+
}
1477+
1478+
// memrchr is GNU extension; use manual loop for portability.
1479+
const char *lastline = bstr + blen;
1480+
while (bstr < lastline) {
1481+
if (lastline[-1] == '\n') {
1482+
break;
1483+
}
1484+
lastline--;
1485+
if (*lastline != ' ' && *lastline != '\t') {
1486+
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
1487+
last_item,
1488+
"d-string must end with an indent line"
1489+
);
1490+
return NULL;
1491+
}
1492+
}
1493+
1494+
// checks indent of the last line.
1495+
indent_count = bstr + blen - lastline;
1496+
if (indent_count > 0) {
1497+
indent_char = lastline[0];
1498+
1499+
for (Py_ssize_t i = 1; i < indent_count; i++) {
1500+
if (lastline[i] != indent_char) {
1501+
RAISE_ERROR_KNOWN_LOCATION(
1502+
p, PyExc_TabError, last_item->end_lineno, i, last_item->end_lineno, i+1,
1503+
"inconsistent use of tabs and spaces in indentation"
1504+
);
1505+
return NULL;
1506+
}
1507+
}
1508+
}
1509+
}
1510+
13491511
Py_ssize_t index = 0;
13501512
for (Py_ssize_t i = 0; i < n_items; i++) {
13511513
expr_ty item = asdl_seq_GET(raw_expressions, i);
@@ -1377,7 +1539,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
13771539
}
13781540

13791541
if (item->kind == Constant_kind) {
1380-
item = _PyPegen_decode_fstring_part(p, is_raw, item, b);
1542+
item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, indent_char, indent_count, item, b);
13811543
if (item == NULL) {
13821544
return NULL;
13831545
}

Parser/lexer/lexer.c

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -455,7 +455,7 @@ tok_continuation_line(struct tok_state *tok) {
455455
static int
456456
maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
457457
int saw_b, int saw_r, int saw_u,
458-
int saw_f, int saw_t) {
458+
int saw_f, int saw_t, int saw_d) {
459459
// Supported: rb, rf, rt (in any order)
460460
// Unsupported: ub, ur, uf, ut, bf, bt, ft (in any order)
461461

@@ -480,13 +480,19 @@ maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
480480
if (saw_u && saw_t) {
481481
RETURN_SYNTAX_ERROR("u", "t");
482482
}
483+
if (saw_u && saw_d) {
484+
RETURN_SYNTAX_ERROR("u", "d");
485+
}
483486

484487
if (saw_b && saw_f) {
485488
RETURN_SYNTAX_ERROR("b", "f");
486489
}
487490
if (saw_b && saw_t) {
488491
RETURN_SYNTAX_ERROR("b", "t");
489492
}
493+
if (saw_b && saw_d) {
494+
RETURN_SYNTAX_ERROR("b", "d");
495+
}
490496

491497
if (saw_f && saw_t) {
492498
RETURN_SYNTAX_ERROR("f", "t");
@@ -741,8 +747,8 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
741747
/* Identifier (most frequent token!) */
742748
nonascii = 0;
743749
if (is_potential_identifier_start(c)) {
744-
/* Process the various legal combinations of b"", r"", u"", and f"". */
745-
int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0;
750+
/* Process the various legal combinations of b"", r"", u"", f"", and d"". */
751+
int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0, saw_d = 0;
746752
while (1) {
747753
if (!saw_b && (c == 'b' || c == 'B')) {
748754
saw_b = 1;
@@ -762,14 +768,17 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
762768
else if (!saw_t && (c == 't' || c == 'T')) {
763769
saw_t = 1;
764770
}
771+
else if (!saw_d && (c == 'd' || c == 'D')) {
772+
saw_d = 1;
773+
}
765774
else {
766775
break;
767776
}
768777
c = tok_nextc(tok);
769778
if (c == '"' || c == '\'') {
770779
// Raise error on incompatible string prefixes:
771780
int status = maybe_raise_syntax_error_for_string_prefixes(
772-
tok, saw_b, saw_r, saw_u, saw_f, saw_t);
781+
tok, saw_b, saw_r, saw_u, saw_f, saw_t, saw_d);
773782
if (status < 0) {
774783
return MAKE_TOKEN(ERRORTOKEN);
775784
}
@@ -1049,7 +1058,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
10491058
}
10501059

10511060
f_string_quote:
1052-
if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't')
1061+
if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't' || Py_TOLOWER(*tok->start) == 'd')
10531062
&& (c == '\'' || c == '"'))) {
10541063

10551064
int quote = c;
@@ -1089,6 +1098,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
10891098
the_current_tok->kind = TOK_FSTRING_MODE;
10901099
the_current_tok->quote = quote;
10911100
the_current_tok->quote_size = quote_size;
1101+
the_current_tok->raw = 0;
10921102
the_current_tok->start = tok->start;
10931103
the_current_tok->multi_line_start = tok->line_start;
10941104
the_current_tok->first_line = tok->lineno;
@@ -1101,25 +1111,28 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
11011111
the_current_tok->in_debug = 0;
11021112

11031113
enum string_kind_t string_kind = FSTRING;
1104-
switch (*tok->start) {
1105-
case 'T':
1106-
case 't':
1107-
the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
1108-
string_kind = TSTRING;
1109-
break;
1110-
case 'F':
1111-
case 'f':
1112-
the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
1113-
break;
1114-
case 'R':
1115-
case 'r':
1116-
the_current_tok->raw = 1;
1117-
if (Py_TOLOWER(*(tok->start + 1)) == 't') {
1114+
for (const char *p = tok->start; *p != c; p++) {
1115+
switch (*p) {
1116+
case 'f':
1117+
case 'F':
1118+
break;
1119+
case 't':
1120+
case 'T':
11181121
string_kind = TSTRING;
1119-
}
1120-
break;
1121-
default:
1122-
Py_UNREACHABLE();
1122+
break;
1123+
case 'r':
1124+
case 'R':
1125+
the_current_tok->raw = 1;
1126+
break;
1127+
case 'd':
1128+
case 'D':
1129+
if (quote_size != 3) {
1130+
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "d-string must be a multiline string"));
1131+
}
1132+
break;
1133+
default:
1134+
Py_UNREACHABLE();
1135+
}
11231136
}
11241137

11251138
the_current_tok->string_kind = string_kind;

0 commit comments

Comments
 (0)