Skip to content

Commit 21f3f3f

Browse files
committed
Optimize CPython PEG parser: 29% speedup on stdlib parsing
Stack of optimizations to the PEG parser that together yield a ~29% reduction in parse time (4135ms -> 2924ms parsing 1867 stdlib files):

1. Reduce stack overflow check frequency: only call _Py_ReachedRecursionLimitWithMargin every 100 recursion levels instead of on every single rule entry (+16%).
2. Lazy token bytes: defer the PyBytes_FromStringAndSize allocation until a token's bytes are actually accessed, avoiding the allocation for tokens that are only type-checked (+2% incremental).
3. Inline _PyPegen_expect_token: move the hottest function (called for every token match) from pegen.c to a static inline in pegen.h, eliminating function call overhead (+3% incremental).
4. Inline _PyPegen_is_memoized: move the memo lookup (called for every memoized rule) to a static inline in pegen.h for non-debug builds (+3% incremental).
5. Direct identifier creation: create PyUnicode identifiers directly from raw token pointers (start, len) instead of going through a PyBytes intermediary (+5% incremental).

https://claude.ai/code/session_0116H8dSsjY7pmMiZs5WWjF3
1 parent f615556 commit 21f3f3f

File tree

6 files changed

+629
-506
lines changed

6 files changed

+629
-506
lines changed

Parser/action_helpers.c

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -902,7 +902,7 @@ _PyPegen_add_type_comment_to_arg(Parser *p, arg_ty a, Token *tc)
902902
if (tc == NULL) {
903903
return a;
904904
}
905-
const char *bytes = PyBytes_AsString(tc->bytes);
905+
const char *bytes = PyBytes_AsString(_PyPegen_token_bytes(p, tc));
906906
if (bytes == NULL) {
907907
return NULL;
908908
}
@@ -919,10 +919,9 @@ _PyPegen_add_type_comment_to_arg(Parser *p, arg_ty a, Token *tc)
919919
0 indicates success and nonzero indicates failure (an exception may be set) */
920920
int
921921
_PyPegen_check_barry_as_flufl(Parser *p, Token* t) {
922-
assert(t->bytes != NULL);
923922
assert(t->type == NOTEQUAL);
924923

925-
const char* tok_str = PyBytes_AS_STRING(t->bytes);
924+
const char* tok_str = PyBytes_AS_STRING(_PyPegen_token_bytes(p, t));
926925
if (p->flags & PyPARSE_BARRY_AS_BDFL && strcmp(tok_str, "<>") != 0) {
927926
RAISE_SYNTAX_ERROR("with Barry as BDFL, use '<>' instead of '!='");
928927
return -1;
@@ -1306,7 +1305,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
13061305
}
13071306
}
13081307

1309-
const char* quote_str = PyBytes_AsString(a->bytes);
1308+
const char* quote_str = PyBytes_AsString(_PyPegen_token_bytes(p, a));
13101309
if (quote_str == NULL) {
13111310
return NULL;
13121311
}
@@ -1401,7 +1400,7 @@ _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b
14011400
expr_ty _PyPegen_decoded_constant_from_token(Parser* p, Token* tok) {
14021401
Py_ssize_t bsize;
14031402
char* bstr;
1404-
if (PyBytes_AsStringAndSize(tok->bytes, &bstr, &bsize) == -1) {
1403+
if (PyBytes_AsStringAndSize(_PyPegen_token_bytes(p, tok), &bstr, &bsize) == -1) {
14051404
return NULL;
14061405
}
14071406

@@ -1426,7 +1425,7 @@ expr_ty _PyPegen_decoded_constant_from_token(Parser* p, Token* tok) {
14261425
}
14271426

14281427
expr_ty _PyPegen_constant_from_token(Parser* p, Token* tok) {
1429-
char* bstr = PyBytes_AsString(tok->bytes);
1428+
char* bstr = PyBytes_AsString(_PyPegen_token_bytes(p, tok));
14301429
if (bstr == NULL) {
14311430
return NULL;
14321431
}
@@ -1444,7 +1443,7 @@ expr_ty _PyPegen_constant_from_token(Parser* p, Token* tok) {
14441443
}
14451444

14461445
expr_ty _PyPegen_constant_from_string(Parser* p, Token* tok) {
1447-
char* the_str = PyBytes_AsString(tok->bytes);
1446+
char* the_str = PyBytes_AsString(_PyPegen_token_bytes(p, tok));
14481447
if (the_str == NULL) {
14491448
return NULL;
14501449
}

0 commit comments

Comments
 (0)