
Commit 4f9741c

feat: optimize peg parser
1 parent a8728da

3 files changed, +72 -53 lines

experiments.md

Lines changed: 14 additions & 9 deletions
@@ -221,12 +221,17 @@ test_large_file[PegenParser-memoize-all] 2,674.6126 (171.35) 20.5175 (6
 test_large_file[PegenParser] 5,312.4585 (217.00) 44.2033 (113.05) 5,325.4162 (218.56) 80.0707 (304.81) 1;0 0.1882 (0.00) 5 1
 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------ benchmark 'small-string': 4 tests -----------------------------------------------------------------------------------------
-Name (time in us) Mean StdDev Median IQR Outliers OPS Rounds Iterations
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-test_small_string[RuffParser] 6.8298 (1.0) 0.3288 (1.0) 6.7920 (1.0) 0.0840 (1.0) 12;22 146,417.6823 (1.0) 773 1
-test_small_string[TreeSitter] 10.2818 (1.51) 1.0306 (3.13) 10.1670 (1.50) 0.1250 (1.49) 10;27 97,259.6897 (0.66) 786 1
-test_small_string[PlyParser] 268.5103 (39.31) 13.1569 (40.02) 263.9170 (38.86) 6.8540 (81.60) 15;19 3,724.2525 (0.03) 179 1
-test_small_string[PegenParser-memoize-all] 405.4760 (134.34) 41.5348 (180.66) 400.7085 (133.57) 34.6040 (823.40) 13;7 2.4662 (0.01) 440 1
-test_small_string[PegenParser] 1,074.9712 (157.39) 55.4020 (168.50) 1,067.2500 (157.13) 13.5200 (160.95) 8;13 930.2574 (0.01) 171 1
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+# after memoize table optimization
+
+- there was around 20% improvement in the runtime
+
+---------------------------------------------------------------------------------------- benchmark 'large-file': 6 tests -----------------------------------------------------------------------------------------
+Name (time in ms) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+test_large_file[TreeSitter] 10.8569 (1.0) 12.0598 (1.0) 11.0258 (1.0) 0.1734 (1.0) 10.9928 (1.0) 0.1003 (1.0) 6;6 90.6967 (1.0) 80 1
+test_large_file[RuffParser] 16.7757 (1.55) 25.4420 (2.11) 21.9133 (1.99) 2.7005 (15.57) 22.6645 (2.06) 2.1841 (21.77) 8;7 45.6345 (0.50) 35 1
+test_large_file[PlyParser] 295.7462 (27.24) 306.5685 (25.42) 301.7971 (27.37) 3.9262 (22.64) 302.3860 (27.51) 4.0868 (40.73) 2;0 3.3135 (0.04) 5 1
+test_large_file[PegenRustParser] 684.7476 (63.07) 716.6600 (59.43) 699.5411 (63.45) 11.8038 (68.07) 697.3384 (63.44) 14.7329 (146.84) 2;0 1.4295 (0.02) 5 1
+test_large_file[PegenParser] 704.1405 (64.86) 745.2963 (61.80) 719.5224 (65.26) 16.0691 (92.66) 715.5075 (65.09) 20.6549 (205.86) 1;0 1.3898 (0.02) 5 1
+test_large_file[PegenV0Parser] 2,524.6497 (232.54) 2,760.9785 (228.94) 2,604.5888 (236.23) 91.1026 (525.33) 2,584.9718 (235.15) 78.9500 (786.88) 1;1 0.3839 (0.00) 5 1
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
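The tables in this file are pytest-benchmark output. For anyone reproducing them, here is a minimal sketch of a harness that yields the same table format; the stdlib ast.parse stands in for the real parser fixtures, which this diff does not show:

import ast

import pytest

SMALL_STRING = "x = [i**2 for i in range(10)]\n"

# Stand-in parser table; the real suite parametrizes over PegenParser,
# RuffParser, TreeSitter, PlyParser, etc.
PARSERS = {"stdlib-ast": ast.parse}

@pytest.mark.parametrize("name", sorted(PARSERS))
def test_small_string(benchmark, name):
    # pytest-benchmark calls the function in timed rounds and reports the
    # Min/Max/Mean/StdDev/Median/IQR/OPS columns seen above.
    tree = benchmark(PARSERS[name], SMALL_STRING)
    assert isinstance(tree, ast.Module)

Running such a file with pytest (pytest-benchmark installed) prints a table with the same columns as the one above.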

peg_parser/subheader.py

Lines changed: 55 additions & 44 deletions
@@ -22,6 +22,8 @@
 Store = ast.Store()
 Del = ast.Del()

+_MEMOIZE_COUNTER = [0]
+
 # Node = TypeVar("Node", bound=ast.stmt | ast.expr)


@@ -106,34 +108,36 @@ def logger_wrapper(self: P, *args: object) -> Any:
 def memoize(method: F) -> F:
     """Memoize a symbol method."""
     method_name = method.__name__
+    method_id = _MEMOIZE_COUNTER[0]
+    _MEMOIZE_COUNTER[0] += 1

     def memoize_wrapper(self: P) -> Any:
-        mark = self._mark()
-        key = mark, method_name
+        mark = self._tokenizer._index
         # Fast path: cache hit, and not verbose.
-        if key in self._cache and not self._verbose:
-            tree, endmark = self._cache[key]
-            self._reset(endmark)
+        cache = self._caches[method_id]
+        if mark in cache and not self._verbose:
+            tree, endmark = cache[mark]
+            self._tokenizer._index = endmark
             return tree
         # Slow path: no cache hit, or verbose.
         verbose, argsr, fill = self._verbose, "", ""
         if verbose:
             fill = " " * self._level
-        if key not in self._cache:
+        if mark not in cache:
             if verbose:
                 print(f"{fill}{method_name}({argsr}) ... (looking at {self.showpeek()})")
             self._level += 1
             tree = method(self)
             if verbose:
                 self._level -= 1
                 print(f"{fill}... {method_name}({argsr}) -> {tree!s:.200}")
-            endmark = self._mark()
-            self._cache[key] = tree, endmark
+            endmark = self._tokenizer._index
+            cache[mark] = tree, endmark
         else:
-            tree, endmark = self._cache[key]
+            tree, endmark = cache[mark]
             if verbose:
                 print(f"{fill}{method_name}({argsr}) -> {tree!s:.200}")
-        self._reset(endmark)
+        self._tokenizer._index = endmark
         return tree

     memoize_wrapper.__wrapped__ = method  # type: ignore
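What this hunk changes: the old cache was one dict shared by every rule, keyed by (mark, method_name) tuples, so each lookup allocated a tuple and hashed a string. Each rule now receives a fixed integer id at decoration time and its own table keyed by the integer token position alone, which is plausibly where most of the reported ~20% improvement comes from. A standalone sketch of the pattern, with toy names rather than this library's API:

from functools import wraps

_COUNTER = [0]  # one id per decorated rule, assigned at import time

def memoize(method):
    method_id = _COUNTER[0]
    _COUNTER[0] += 1

    @wraps(method)
    def wrapper(self):
        mark = self._index                # plain int key, cheap to hash
        cache = self._caches[method_id]   # per-rule table, no tuple allocation
        if mark in cache:
            tree, endmark = cache[mark]
            self._index = endmark         # replay the recorded advance
            return tree
        tree = method(self)
        cache[mark] = tree, self._index   # store result and end position
        return tree

    return wrapper

class ToyParser:
    def __init__(self, tokens):
        self._tokens, self._index = tokens, 0
        # Decoration ran at class creation, so _COUNTER is final here.
        self._caches = [{} for _ in range(_COUNTER[0])]

    @memoize
    def number(self):
        if self._index < len(self._tokens) and self._tokens[self._index].isdigit():
            tok = self._tokens[self._index]
            self._index += 1
            return int(tok)
        return None

p = ToyParser(["42"])
assert p.number() == 42
p._index = 0
assert p.number() == 42 and p._index == 1  # cache hit replays the advance

(The real _caches list later in this diff is sized _MEMOIZE_COUNTER[0] + 10; the headroom presumably covers rules that get decorated after a parser has already been constructed.)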
@@ -143,20 +147,22 @@ def memoize_wrapper(self: P) -> Any:
 def memoize_left_rec(method: Callable[[P], T | None]) -> Callable[[P], T | None]:
     """Memoize a left-recursive symbol method."""
     method_name = method.__name__
+    method_id = _MEMOIZE_COUNTER[0]
+    _MEMOIZE_COUNTER[0] += 1

     def memoize_left_rec_wrapper(self: P) -> T | Any | None:
-        mark = self._mark()
-        key = mark, method_name
+        mark = self._tokenizer._index
         # Fast path: cache hit, and not verbose.
-        if key in self._cache and not self._verbose:
-            tree, endmark = self._cache[key]
-            self._reset(endmark)
+        cache = self._caches[method_id]
+        if mark in cache and not self._verbose:
+            tree, endmark = cache[mark]
+            self._tokenizer._index = endmark
             return tree
         # Slow path: no cache hit, or verbose.
         verbose, fill = self._verbose, ""
         if verbose:
             fill = " " * self._level
-        if key not in self._cache:
+        if mark not in cache:
             if verbose:
                 print(f"{fill}{method_name} ... (looking at {self.showpeek()})")
             self._level += 1
@@ -170,21 +176,21 @@ def memoize_left_rec_wrapper(self: P) -> T | Any | None:
             # (http://web.cs.ucla.edu/~todd/research/pub.php?id=pepm08).

             # Prime the cache with a failure.
-            self._cache[key] = None, mark
+            cache[mark] = None, mark
             lastresult: Any = None
             lastmark = mark
             depth = 0
             if verbose:
                 print(f"{fill}Recursive {method_name} at {mark} depth {depth}")

             while True:
-                self._reset(mark)
+                self._tokenizer._index = mark
                 self.in_recursive_rule += 1
                 try:
                     result = method(self)
                 finally:
                     self.in_recursive_rule -= 1
-                endmark = self._mark()
+                endmark = self._tokenizer._index
                 depth += 1
                 if verbose:
                     print(
@@ -198,26 +204,26 @@ def memoize_left_rec_wrapper(self: P) -> T | Any | None:
                     if verbose:
                         print(f"{fill}Bailing with {lastresult!s:.200} to {lastmark}")
                     break
-                self._cache[key] = lastresult, lastmark = result, endmark
+                cache[mark] = lastresult, lastmark = result, endmark

-            self._reset(lastmark)
+            self._tokenizer._index = lastmark
             tree = lastresult

             if verbose:
                 self._level -= 1
                 print(f"{fill}{method_name}() -> {tree!s:.200} [cached]")
             if tree:
-                endmark = self._mark()
+                endmark = self._tokenizer._index
             else:
                 endmark = mark
-            self._reset(endmark)
-            self._cache[key] = tree, endmark
+            self._tokenizer._index = endmark
+            cache[mark] = tree, endmark
         else:
-            tree, endmark = self._cache[key]
+            tree, endmark = cache[mark]
             if verbose:
                 print(f"{fill}{method_name}() -> {tree!s:.200} [fresh]")
             if tree:
-                self._reset(endmark)
+                self._tokenizer._index = endmark
         return tree

     memoize_left_rec_wrapper.__wrapped__ = method  # type: ignore
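The left-recursion handling itself is unchanged: seed the cache with a failure, then re-run the rule while the match keeps growing (the Warth et al. technique cited in the hunk above); only the cache it writes to is now the per-rule table. A self-contained toy of that prime-then-grow loop for the left-recursive rule expr := expr '-' NUM | NUM, with illustrative names:

class Toy:
    def __init__(self, tokens):
        self.toks, self.i = tokens, 0
        self.cache = {}  # one rule, so a single position-keyed dict suffices

    def expr(self):
        mark = self.i
        if mark in self.cache:
            tree, end = self.cache[mark]
            self.i = end
            return tree
        self.cache[mark] = (None, mark)   # prime with failure so the
        last, lastend = None, mark        # recursive call below terminates
        while True:
            self.i = mark                 # rewind and re-run the alternatives
            result = self._expr_alts()
            if result is None or self.i <= lastend:
                break                     # stop once the match stops growing
            last, lastend = result, self.i
            self.cache[mark] = (last, lastend)
        self.i = lastend
        return last

    def _expr_alts(self):
        mark = self.i
        left = self.expr()                # left recursion hits the seeded cache
        if left is not None and self.i < len(self.toks) and self.toks[self.i] == "-":
            self.i += 1
            num = self.num()
            if num is not None:
                return ("-", left, num)
        self.i = mark
        return self.num()                 # fallback alternative: NUM

    def num(self):
        if self.i < len(self.toks) and self.toks[self.i].isdigit():
            tok = self.toks[self.i]
            self.i += 1
            return int(tok)
        return None

assert Toy(["1", "-", "2", "-", "3"]).expr() == ("-", ("-", 1, 2), 3)

Each pass through the while loop reparses from mark with a bigger seed in the cache, so the tree grows left-associatively until a pass fails to consume more input.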
@@ -260,7 +266,7 @@ class Parser:
         "_tokenizer",
         "_verbose",
         "_level",
-        "_cache",
+        "_caches",
         "tok_cls",
         "in_recursive_rule",
         "_path_token",
@@ -285,7 +291,9 @@ def __init__(
         self._tokenizer = tokenizer
         self._verbose = verbose
         self._level = 0
-        self._cache: dict[tuple[Mark, str], tuple[Any, Mark]] = {}
+        self._verbose = verbose
+        self._level = 0
+        self._caches: list[dict[int, tuple[Any, Mark]]] = [{} for _ in range(_MEMOIZE_COUNTER[0] + 10)]
         self.tok_cls = tokenizer.tok_cls

         # Integer tracking wether we are in a left recursive rule or not. Can be useful
@@ -295,16 +303,18 @@ def __init__(
         # handle path literal joined-str
         self._path_token: TokenInfo | None = None

-        # Pass through common tokenizer methods.
-        self._mark = self._tokenizer.mark
-        self._reset = self._tokenizer.reset
-
         # Are we looking for syntax error ? When true enable matching on invalid rules
         self.call_invalid_rules = False

         self.filename = filename
         self.py_version = min(py_version, sys.version_info) if py_version else sys.version_info

+    def _mark(self) -> Mark:
+        return self._tokenizer._index
+
+    def _reset(self, index: Mark) -> None:
+        self._tokenizer._index = index
+
     def showpeek(self) -> str:
         tok = self._tokenizer.peek()
         return f"{tok.start[0]}.{tok.start[1]}: {tok.type}:{tok.string!r}"
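_mark and _reset survive as ordinary methods for external callers, but the hot paths below now read and write self._tokenizer._index directly instead of going through them. The saving is CPython's method-call overhead, paid several times per rule invocation; a rough micro-benchmark of the gap (illustrative only, numbers vary by machine and interpreter):

import timeit

class Tok:
    __slots__ = ("_index",)
    def __init__(self):
        self._index = 0
    def mark(self):
        return self._index

t = Tok()
call = timeit.timeit(t.mark, number=1_000_000)            # bound-method call
read = timeit.timeit(lambda: t._index, number=1_000_000)  # direct attribute read
print(f"method call {call:.3f}s vs attribute read {read:.3f}s")

A memoized rule marked and reset the tokenizer on nearly every call, so dropping the call machinery (method binding plus a frame push per call) adds up over millions of invocations.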
@@ -344,12 +354,12 @@ def expect(self, typ: str) -> TokenInfo | None:
         return None

     def repeated(self, func: Callable[..., TR | None], *args: Any) -> list[TR]:
-        mark = self._mark()
+        mark = self._tokenizer._index
         children = []
         while result := func(*args):
             children.append(result)
-            mark = self._mark()
-        self._reset(mark)
+            mark = self._tokenizer._index
+        self._tokenizer._index = mark
         return children

     def sep_repeated(
370380
) -> list[TG] | None:
371381
# gather: ','.e+
372382
seq: list[TG]
373-
mark = self._mark()
383+
mark = self._tokenizer._index
374384
if (elem := self.seq_alts(func)) is not None and (
375385
seq := self.repeated(self.sep_repeated, func, sep, *sep_args)
376386
) is not None:
377387
return [elem, *seq]
378-
self._reset(mark)
388+
self._tokenizer._index = mark
379389
return None
380390

381391
def positive_lookahead(self, func: Callable[..., T], *args: object) -> T:
382-
mark = self._mark()
392+
mark = self._tokenizer._index
383393
ok = func(*args)
384-
self._reset(mark)
394+
self._tokenizer._index = mark
385395
return ok
386396

387397
def negative_lookahead(self, func: Callable[..., object], *args: object) -> bool:
388-
mark = self._mark()
398+
mark = self._tokenizer._index
389399
ok = func(*args)
390-
self._reset(mark)
400+
self._tokenizer._index = mark
391401
return not ok
392402

393403
def span(self, lnum: int, col: int) -> SpanDict:
@@ -396,7 +406,7 @@ def span(self, lnum: int, col: int) -> SpanDict:

     def seq_alts(self, *alt: Callable[..., T] | tuple[Callable[..., T], Any]) -> T | None:
         """Handle sequence of alts that don't have action associated with them."""
-        mark = self._mark()
+        mark = self._tokenizer._index
         for arg in alt:
             if isinstance(arg, tuple):
                 method, *args = arg
@@ -405,7 +415,7 @@ def seq_alts(self, *alt: Callable[..., T] | tuple[Callable[..., T], Any]) -> T | None:
                 res = arg()
             if res:
                 return res
-        self._reset(mark)
+        self._tokenizer._index = mark
         return None

     def parse(self, rule: str, call_invalid_rules: bool = False) -> Node | Any | None:
@@ -423,7 +433,8 @@ def parse(self, rule: str, call_invalid_rules: bool = False) -> Node | Any | None:
         # Reset the parser cache to be able to restart parsing from the
         # beginning.
         self._reset(0)  # type: ignore
-        self._cache.clear()
+        for c in self._caches:
+            c.clear()

         res = getattr(self, rule)()

peg_parser/tokenizer.py

Lines changed: 3 additions & 0 deletions
@@ -112,6 +112,9 @@ def _fetch(self) -> TokenInfo:

     def peek(self) -> TokenInfo:
         """Return the next token *without* updating the index."""
+        if self._index < len(self._tokens):
+            return self._tokens[self._index]
+
         try:
             while self._index == len(self._tokens):
                 # if self._with_macro:
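The added guard is a fast path for peek(): a backtracking parser rewinds _index constantly, so the requested token is usually buffered already and the try/while fetch machinery can be skipped entirely. The same shape in a self-contained lazy tokenizer (a simplified stand-in, not this class):

from collections.abc import Iterator

class LazyTokens:
    def __init__(self, stream: Iterator[str]):
        self._stream = stream
        self._tokens: list[str] = []
        self._index = 0

    def peek(self) -> str:
        # Fast path: after backtracking, the token is already buffered.
        if self._index < len(self._tokens):
            return self._tokens[self._index]
        # Slow path: pull from the underlying stream until buffered.
        while self._index == len(self._tokens):
            self._tokens.append(next(self._stream))
        return self._tokens[self._index]

toks = LazyTokens(iter(["def", "f", "(", ")", ":"]))
assert toks.peek() == "def"  # first call takes the slow path and fetches
assert toks.peek() == "def"  # second call returns straight from the buffer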
