Skip to content

Commit 21cc35a

Browse files
committed
use least indent instead of closing quote indent
1 parent 10a5073 commit 21cc35a

File tree

5 files changed

+120
-124
lines changed

5 files changed

+120
-124
lines changed

Lib/test/test_dstring.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,6 @@ def test_empty_dstring(self):
2626
]
2727
self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs)
2828

29-
def test_no_last_newline(self):
30-
exprs = [
31-
"d'''\nhello world'''",
32-
'D"""\nhello world"""',
33-
"df'''\nhello {42}'''",
34-
]
35-
self.assertAllRaise(SyntaxError, "d-string must end with an indent line", exprs)
36-
3729
def test_simple_dstring(self):
3830
self.assertEqual(eval('d"""\n hello world\n """'), "hello world\n")
3931
self.assertEqual(eval('d"""\n hello world\n """'), " hello world\n")
@@ -42,6 +34,5 @@ def test_simple_dstring(self):
4234
self.assertEqual(eval('dr"""\n hello world\\\n """'), " hello world\\\n")
4335

4436

45-
4637
if __name__ == '__main__':
4738
unittest.main()

Objects/unicodeobject.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13480,8 +13480,8 @@ of all lines in the [src, end).
1348013480
It returns the length of the common leading whitespace and sets `output` to
1348113481
point to the beginning of the common leading whitespace if length > 0.
1348213482
*/
13483-
static Py_ssize_t
13484-
search_longest_common_leading_whitespace(
13483+
Py_ssize_t
13484+
_Py_search_longest_common_leading_whitespace(
1348513485
const char *const src,
1348613486
const char *const end,
1348713487
const char **output)
@@ -13576,7 +13576,7 @@ _PyUnicode_Dedent(PyObject *unicode)
1357613576
// [whitespace_start, whitespace_start + whitespace_len)
1357713577
// describes the current longest common leading whitespace
1357813578
const char *whitespace_start = NULL;
13579-
Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
13579+
Py_ssize_t whitespace_len = _Py_search_longest_common_leading_whitespace(
1358013580
src, end, &whitespace_start);
1358113581

1358213582
if (whitespace_len == 0) {

Parser/action_helpers.c

Lines changed: 61 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1311,8 +1311,8 @@ unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start,
13111311
}
13121312

13131313
static PyObject*
1314-
_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_char, Py_ssize_t dedent_count,
1315-
int is_raw, int is_first, expr_ty constant, Token* token)
1314+
_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, const char *indent, Py_ssize_t indent_len,
1315+
int is_first, int is_raw, expr_ty constant, Token* token)
13161316
{
13171317
Py_ssize_t lineno = constant->lineno;
13181318
const char *line_start = s;
@@ -1350,7 +1350,7 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_cha
13501350
lineno++;
13511351

13521352
Py_ssize_t i = 0;
1353-
while (line_start + i < s_end && i < dedent_count && line_start[i] == indent_char) {
1353+
while (line_start + i < s_end && i < indent_len && line_start[i] == indent[i]) {
13541354
i++;
13551355
}
13561356

@@ -1365,8 +1365,8 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_cha
13651365
line_start += i+1;
13661366
continue;
13671367
}
1368-
if (i < dedent_count) { // found an invalid indent.
1369-
assert(line_start[i] != indent_char);
1368+
if (i < indent_len) { // found an invalid indent.
1369+
assert(line_start[i] != indent[i]);
13701370
PyUnicodeWriter_Discard(w);
13711371
RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i+1,
13721372
"d-string line missing valid indentation");
@@ -1392,7 +1392,10 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_cha
13921392
}
13931393

13941394
static expr_ty
1395-
_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_char, Py_ssize_t dedent_count, expr_ty constant, Token* token) {
1395+
_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw,
1396+
const char *indent, Py_ssize_t indent_len,
1397+
expr_ty constant, Token* token)
1398+
{
13961399
assert(PyUnicode_CheckExact(constant->v.Constant.value));
13971400

13981401
const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value);
@@ -1402,9 +1405,9 @@ _PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_cha
14021405
is_raw = is_raw || strchr(bstr, '\\') == NULL;
14031406

14041407
PyObject *str = NULL;
1405-
if (dedent_count > 0) {
1406-
str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent_char, dedent_count,
1407-
is_raw, is_first, constant, token);
1408+
if (indent_len > 0) {
1409+
str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent, indent_len,
1410+
is_first, is_raw, constant, token);
14081411
}
14091412
else {
14101413
str = _PyPegen_decode_string(p, is_raw, bstr, strlen(bstr), token);
@@ -1423,6 +1426,14 @@ _PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_cha
14231426
p->arena);
14241427
}
14251428

1429+
/* defined in unicodeobject.c */
1430+
extern Py_ssize_t
1431+
_Py_search_longest_common_leading_whitespace(
1432+
const char *const src,
1433+
const char *const end,
1434+
const char **output
1435+
);
1436+
14261437
static asdl_expr_seq *
14271438
_get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b, enum string_kind_t string_kind)
14281439
{
@@ -1441,14 +1452,15 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
14411452
}
14421453
int is_raw = strpbrk(quote_str, "rR") != NULL;
14431454
int is_dedent = strpbrk(quote_str, "dD") != NULL;
1444-
int indent_char = 0;
1445-
Py_ssize_t indent_count = 0;
14461455

14471456
asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena);
14481457
if (seq == NULL) {
14491458
return NULL;
14501459
}
14511460

1461+
const char *common_indent_start = NULL;
1462+
Py_ssize_t common_indent_len = 0;
1463+
14521464
if (is_dedent) {
14531465
expr_ty first_item = asdl_seq_GET(raw_expressions, 0);
14541466
if (first_item->kind != Constant_kind
@@ -1460,52 +1472,52 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
14601472
return NULL;
14611473
}
14621474

1463-
expr_ty last_item = asdl_seq_GET(raw_expressions, n_items - 1);
1464-
if (last_item->kind != Constant_kind) {
1465-
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
1466-
last_item,
1467-
"d-string must end with an indent line"
1468-
);
1475+
// Instead of calculating common indent from all parts,
1476+
// build a temporary string and calculate the common indent from it.
1477+
PyBytesWriter *w = PyBytesWriter_Create(0);
1478+
if (w == NULL) {
14691479
return NULL;
14701480
}
14711481

1472-
Py_ssize_t blen;
1473-
const char *bstr = PyUnicode_AsUTF8AndSize(last_item->v.Constant.value, &blen);
1474-
if (bstr == NULL) {
1475-
return NULL;
1476-
}
1482+
for (Py_ssize_t i = 0; i < n_items; i++) {
1483+
expr_ty item = asdl_seq_GET(raw_expressions, i);
14771484

1478-
// memrchr is GNU extension; use manual loop for portability.
1479-
const char *lastline = bstr + blen;
1480-
while (bstr < lastline) {
1481-
if (lastline[-1] == '\n') {
1482-
break;
1483-
}
1484-
lastline--;
1485-
if (*lastline != ' ' && *lastline != '\t') {
1486-
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
1487-
last_item,
1488-
"d-string must end with an indent line"
1489-
);
1490-
return NULL;
1485+
if (item->kind == JoinedStr_kind) {
1486+
// Write a placeholder.
1487+
if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) {
1488+
PyBytesWriter_Discard(w);
1489+
return NULL;
1490+
}
1491+
continue;
14911492
}
1492-
}
1493-
1494-
// checks indent of the last line.
1495-
indent_count = bstr + blen - lastline;
1496-
if (indent_count > 0) {
1497-
indent_char = lastline[0];
1498-
1499-
for (Py_ssize_t i = 1; i < indent_count; i++) {
1500-
if (lastline[i] != indent_char) {
1501-
RAISE_ERROR_KNOWN_LOCATION(
1502-
p, PyExc_TabError, last_item->end_lineno, i, last_item->end_lineno, i+1,
1503-
"inconsistent use of tabs and spaces in indentation"
1504-
);
1493+
if (item->kind == Constant_kind) {
1494+
Py_ssize_t blen;
1495+
const char *bstr = PyUnicode_AsUTF8AndSize(item->v.Constant.value, &blen);
1496+
if (bstr == NULL || PyBytesWriter_WriteBytes(w, bstr, blen) < 0) {
1497+
PyBytesWriter_Discard(w);
15051498
return NULL;
15061499
}
1500+
continue;
15071501
}
15081502
}
1503+
// Add a terminator to include the last line before the ending quote
1504+
if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) {
1505+
PyBytesWriter_Discard(w);
1506+
return NULL;
1507+
}
1508+
1509+
// TODO: instead of creating temp_bytes, we could search
1510+
// common indent from each part directly. But this needs a reimplementation
1511+
// of _Py_search_longest_common_leading_whitespace.
1512+
PyObject *temp_bytes = PyBytesWriter_Finish(w);
1513+
if (temp_bytes == NULL) {
1514+
return NULL;
1515+
}
1516+
_PyArena_AddPyObject(p->arena, temp_bytes);
1517+
const char *temp_str = PyBytes_AsString(temp_bytes);
1518+
const char *temp_end = temp_str + PyBytes_GET_SIZE(temp_bytes);
1519+
common_indent_len = _Py_search_longest_common_leading_whitespace(
1520+
temp_str, temp_end, &common_indent_start);
15091521
}
15101522

15111523
Py_ssize_t index = 0;
@@ -1539,7 +1551,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
15391551
}
15401552

15411553
if (item->kind == Constant_kind) {
1542-
item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, indent_char, indent_count, item, b);
1554+
item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, common_indent_start, common_indent_len, item, b);
15431555
if (item == NULL) {
15441556
return NULL;
15451557
}

Parser/lexer/lexer.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -490,9 +490,6 @@ maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
490490
if (saw_b && saw_t) {
491491
RETURN_SYNTAX_ERROR("b", "t");
492492
}
493-
if (saw_b && saw_d) {
494-
RETURN_SYNTAX_ERROR("b", "d");
495-
}
496493

497494
if (saw_f && saw_t) {
498495
RETURN_SYNTAX_ERROR("f", "t");

0 commit comments

Comments
 (0)