Skip to content

Commit bafbfcf

Browse files
authored
Implement bytecode caching in BasilispImporter (#244)
* Implement bytecode caching in BasilispImporter * Make sure to bootstrap the cached module first * Time import time * Remove unused single quote * Fix MyPy and Lint errors * Factor out into multiple methods to suppress linting errors
1 parent e757b13 commit bafbfcf

File tree

3 files changed

+193
-14
lines changed

3 files changed

+193
-14
lines changed

src/basilisp/compiler.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1830,11 +1830,15 @@ def to_py_str(t: ast.AST) -> str:
18301830
return codegen.to_source(t)
18311831

18321832

1833-
def compile_and_exec_form(form: LispForm,
1833+
BytecodeCollector = Optional[Callable[[types.CodeType], None]]
1834+
1835+
1836+
def compile_and_exec_form(form: LispForm, # pylint: disable= too-many-arguments
18341837
ctx: CompilerContext,
18351838
module: types.ModuleType,
18361839
source_filename: str = '<REPL Input>',
1837-
wrapped_fn_name: str = _DEFAULT_FN) -> Any:
1840+
wrapped_fn_name: str = _DEFAULT_FN,
1841+
collect_bytecode: Optional[BytecodeCollector] = None) -> Any:
18381842
"""Compile and execute the given form. This function will be most useful
18391843
for the REPL and testing purposes. Returns the result of the executed expression.
18401844
@@ -1867,13 +1871,16 @@ def compile_and_exec_form(form: LispForm,
18671871
runtime.add_generated_python(to_py_str(ast_module))
18681872

18691873
bytecode = compile(ast_module, source_filename, 'exec')
1874+
if collect_bytecode:
1875+
collect_bytecode(bytecode)
18701876
exec(bytecode, module.__dict__)
18711877
return getattr(module, final_wrapped_name)()
18721878

18731879

18741880
def _incremental_compile_module(nodes: MixedNodeStream,
18751881
mod: types.ModuleType,
1876-
source_filename: str) -> None:
1882+
source_filename: str,
1883+
collect_bytecode: Optional[BytecodeCollector] = None) -> None:
18771884
"""Incrementally compile a stream of AST nodes in module mod.
18781885
18791886
The source_filename will be passed to Python's native compile.
@@ -1891,24 +1898,31 @@ def _incremental_compile_module(nodes: MixedNodeStream,
18911898
runtime.add_generated_python(to_py_str(module))
18921899

18931900
bytecode = compile(module, source_filename, 'exec')
1901+
if collect_bytecode:
1902+
collect_bytecode(bytecode)
18941903
exec(bytecode, mod.__dict__)
18951904

18961905

1897-
def _bootstrap_module(ctx: CompilerContext,
                      mod: types.ModuleType,
                      source_filename: str,
                      collect_bytecode: Optional[BytecodeCollector] = None) -> None:
    """Compile the standard preamble into mod and mark it as bootstrapped.

    The preamble consists of the nodes produced by _module_imports(ctx),
    _from_module_import() and _ns_var(), in that order. It is compiled with
    _incremental_compile_module, forwarding source_filename and the optional
    collect_bytecode callback. Afterwards mod.__basilisp_bootstrapped__ is
    set to True."""
    preamble: List[ast.AST] = [
        *_module_imports(ctx),
        _from_module_import(),
        _ns_var(),
    ]
    _incremental_compile_module(
        preamble,
        mod,
        source_filename=source_filename,
        collect_bytecode=collect_bytecode,
    )
    setattr(mod, '__basilisp_bootstrapped__', True)
19061919

19071920

19081921
def compile_module(forms: Iterable[LispForm],
19091922
ctx: CompilerContext,
19101923
module: types.ModuleType,
1911-
source_filename: str) -> None:
1924+
source_filename: str,
1925+
collect_bytecode: Optional[BytecodeCollector] = None) -> None:
19121926
"""Compile an entire Basilisp module into Python bytecode which can be
19131927
executed as a Python module.
19141928
@@ -1920,7 +1934,23 @@ def compile_module(forms: Iterable[LispForm],
19201934

19211935
for form in forms:
19221936
nodes = [node for node in _to_ast(ctx, form)]
1923-
_incremental_compile_module(nodes, module, source_filename=source_filename)
1937+
_incremental_compile_module(
1938+
nodes, module, source_filename=source_filename, collect_bytecode=collect_bytecode)
1939+
1940+
1941+
def compile_bytecode(code: List[types.CodeType],
                     ctx: CompilerContext,
                     module: types.ModuleType,
                     source_filename: str) -> None:
    """Execute previously cached bytecode inside the given module.

    The Basilisp import hook caches the code objects produced while compiling
    a namespace. When those code objects are reloaded from disk they must run
    inside a bootstrapped module, so the module is first bootstrapped with
    _bootstrap_module (using ctx and source_filename) and then every code
    object in code is executed, in order, against the module's namespace."""
    _bootstrap_module(ctx, module, source_filename)
    module_globals = module.__dict__
    for cached in code:
        exec(cached, module_globals)
19241954

19251955

19261956
lrepr = basilisp.lang.util.lrepr

src/basilisp/importer.py

Lines changed: 142 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,89 @@
11
import importlib.machinery
22
import importlib.util
33
import logging
4+
import marshal
5+
import os
46
import os.path
57
import sys
68
import types
79
from importlib.abc import MetaPathFinder, SourceLoader
8-
from typing import Optional
10+
from typing import Optional, List, Dict
911

1012
import basilisp.compiler as compiler
1113
import basilisp.lang.runtime as runtime
1214
import basilisp.reader as reader
1315
from basilisp.lang.util import demunge
16+
from basilisp.util import timed
17+
18+
# Four-byte header that starts every Basilisp bytecode cache file: the integer
# 1149 encoded as 2 little-endian bytes, followed by b'\r\n'.
MAGIC_NUMBER = (1149).to_bytes(2, 'little') + b'\r\n'
1419

1520
logger = logging.getLogger(__name__)
1621

1722

23+
def _r_long(int_bytes: bytes) -> int:
24+
"""Convert 4 bytes in little-endian to an integer."""
25+
return int.from_bytes(int_bytes, 'little')
26+
27+
28+
def _w_long(x: int) -> bytes:
29+
"""Convert a 32-bit integer to little-endian."""
30+
return (int(x) & 0xFFFFFFFF).to_bytes(4, 'little')
31+
32+
33+
def _basilisp_bytecode(mtime: int,
                       source_size: int,
                       code: List[types.CodeType]) -> bytes:
    """Return the bytes for a Basilisp bytecode cache file.

    The layout is: MAGIC_NUMBER, then mtime and source_size each encoded by
    _w_long (4 little-endian bytes, low 32 bits only), then the marshalled
    list of code objects.

    The result is an immutable bytes object, matching the declared return
    type (a mutable bytearray was returned previously)."""
    return b''.join((
        MAGIC_NUMBER,
        _w_long(mtime),
        _w_long(source_size),
        marshal.dumps(code),  # type: ignore
    ))
42+
43+
44+
def _get_basilisp_bytecode(fullname: str,
                           mtime: int,
                           source_size: int,
                           cache_data: bytes) -> List[types.CodeType]:
    """Unmarshal the bytes from a Basilisp bytecode cache file, validating the
    file header prior to returning.

    The header is 12 bytes: the magic number, the source mtime and the source
    size. Returns the list of code objects stored after the header.

    Raises ImportError if the magic number, timestamp or size does not match,
    or if the payload cannot be unmarshalled. Raises EOFError if the header
    (or the marshalled payload) is truncated."""
    exc_details = {'name': fullname}
    magic = cache_data[:4]
    raw_timestamp = cache_data[4:8]
    raw_size = cache_data[8:12]
    if magic != MAGIC_NUMBER:
        message = f"Incorrect magic number ({magic}) in {fullname}; expected {MAGIC_NUMBER}"
        logger.debug(message)
        raise ImportError(message, **exc_details)  # type: ignore
    elif len(raw_timestamp) != 4:
        message = f"Reached EOF while reading timestamp in {fullname}"
        logger.debug(message)
        raise EOFError(message)
    # _w_long keeps only the low 32 bits when the header is written, so the
    # expected values must be truncated the same way before comparing.
    elif _r_long(raw_timestamp) != (mtime & 0xFFFFFFFF):
        message = f"Non-matching timestamp ({_r_long(raw_timestamp)}) in {fullname} bytecode cache; expected {mtime}"
        logger.debug(message)
        raise ImportError(message, **exc_details)  # type: ignore
    elif len(raw_size) != 4:
        message = f"Reached EOF while reading size of source in {fullname}"
        logger.debug(message)
        raise EOFError(message)
    elif _r_long(raw_size) != (source_size & 0xFFFFFFFF):
        message = f"Non-matching filesize ({_r_long(raw_size)}) in {fullname} bytecode cache; expected {source_size}"
        logger.debug(message)
        raise ImportError(message, **exc_details)  # type: ignore

    # NOTE(review): marshal is not safe against maliciously constructed data;
    # the cache file is assumed to be as trustworthy as the source tree.
    try:
        return marshal.loads(cache_data[12:])  # type: ignore
    except (ValueError, TypeError) as e:
        # A corrupt payload is reported as ImportError so it is handled the
        # same way as any other invalid cache file.
        message = f"Could not unmarshal bytecode cache for {fullname}: {e}"
        logger.debug(message)
        raise ImportError(message, **exc_details) from e  # type: ignore
77+
78+
79+
def _cache_from_source(path: str) -> str:
80+
"""Return the path to the cached file for the given path. The original path
81+
does not have to exist."""
82+
cache_path, cache_file = os.path.split(importlib.util.cache_from_source(path))
83+
filename, _ = os.path.splitext(cache_file)
84+
return os.path.join(cache_path, filename + '.lpyc')
85+
86+
1887
class BasilispImporter(MetaPathFinder, SourceLoader):
1988
"""Python import hook to allow directly loading Basilisp code within
2089
Python."""
@@ -41,7 +110,13 @@ def find_spec(self,
41110
f"{os.path.join(entry, *module_name)}.lpy"]
42111
for filename in filenames:
43112
if os.path.exists(filename):
44-
state = {'fullname': fullname, "filename": filename, 'path': entry, 'target': target}
113+
state = {
114+
'fullname': fullname,
115+
'filename': filename,
116+
'path': entry,
117+
'target': target,
118+
'cache_filename': _cache_from_source(filename)
119+
}
45120
logger.debug(f"Found potential Basilisp module '{fullname}' in file '{filename}'")
46121
return importlib.machinery.ModuleSpec(fullname, self, origin=filename, loader_state=state)
47122
return None
@@ -50,10 +125,22 @@ def invalidate_caches(self):
50125
super().invalidate_caches()
51126
self._cache = {}
52127

53-
def get_data(self, path) -> bytes:
128+
def _cache_bytecode(self, source_path, cache_path, data): # pylint: disable=unused-argument
129+
self.set_data(cache_path, data)
130+
131+
def path_stats(self, path):
    """Return a dict with the 'mtime' (modification time truncated to an int)
    and 'size' (st_size) of the file at path, as reported by os.stat."""
    info = os.stat(path)
    return dict(mtime=int(info.st_mtime), size=info.st_size)
134+
135+
def get_data(self, path) -> bytes:
    """Return the raw bytes stored in the file at path.

    The file is opened read-only in binary mode, so files the process cannot
    write to can still be loaded. Any OSError raised by open() propagates to
    the caller."""
    with open(path, mode='rb') as f:
        return f.read()
56138

139+
def set_data(self, path, data):
    """Write data (bytes) to the file at path, creating any missing parent
    directories first.

    Writing is best-effort: an OSError (for example a read-only location) is
    logged at debug level and suppressed instead of being propagated."""
    try:
        parent = os.path.dirname(path)
        # os.makedirs('') raises, so only create directories when the path
        # actually has a directory component.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, mode='wb') as f:
            f.write(data)
    except OSError as e:
        logger.debug(f"Could not write '{path}': {e}")
143+
57144
def get_filename(self, fullname: str) -> str:
58145
try:
59146
cached = self._cache[fullname]
@@ -72,6 +159,50 @@ def create_module(self, spec: importlib.machinery.ModuleSpec):
72159
self._cache[spec.name] = {"spec": spec}
73160
return mod
74161

162+
def _exec_cached_module(self,
                        fullname: str,
                        loader_state: Dict[str, str],
                        path_stats: Dict[str, int],
                        module: types.ModuleType):
    """Load and execute a cached Basilisp module.

    Reads the cache file named by loader_state["cache_filename"], validates
    its header against path_stats['mtime'] and path_stats['size'], and
    executes the cached code objects in module via compiler.compile_bytecode.
    The elapsed time is logged at debug level on success.

    Exceptions raised while reading or validating the cache propagate to the
    caller."""
    filename = loader_state["filename"]
    cache_filename = loader_state["cache_filename"]

    with timed(lambda duration: logger.debug(
            f"Loaded cached Basilisp module '{fullname}' in {duration / 1000000}ms")):
        # The stray trailing quote in this message has been removed.
        logger.debug(f"Checking for cached Basilisp module '{fullname}'")
        cache_data = self.get_data(cache_filename)
        cached_code = _get_basilisp_bytecode(fullname, path_stats['mtime'], path_stats['size'], cache_data)
        compiler.compile_bytecode(cached_code, compiler.CompilerContext(), module, filename)
177+
178+
def _exec_module(self,
                 fullname: str,
                 loader_state: Dict[str, str],
                 path_stats: Dict[str, int],
                 module: types.ModuleType):
    """Read, compile and execute a Basilisp module from source, then write
    the collected bytecode to its cache file.

    The source path and cache path come from loader_state; the mtime and size
    recorded in the cache header come from path_stats. The elapsed time is
    logged at debug level on success."""
    source_path = loader_state["filename"]
    cache_path = loader_state["cache_filename"]

    def report(duration):
        logger.debug(f"Loaded Basilisp module '{fullname}' in {duration / 1000000}ms")

    with timed(report):
        # The compiler calls the collector once per compiled code object; the
        # resulting list becomes the payload of the .lpyc cache file.
        collected = []

        logger.debug(f"Reading and compiling Basilisp module '{fullname}'")
        forms = reader.read_file(source_path, resolver=runtime.resolve_alias)
        compiler.compile_module(  # pylint: disable=unexpected-keyword-arg
            forms,
            compiler.CompilerContext(),
            module,
            source_path,
            collect_bytecode=collected.append)

        # Persist everything gathered during this compilation run.
        payload = _basilisp_bytecode(path_stats['mtime'], path_stats['size'], collected)
        self._cache_bytecode(source_path, cache_path, payload)
205+
75206
def exec_module(self, module):
76207
"""Compile the Basilisp module into Python code.
77208
@@ -84,6 +215,7 @@ def exec_module(self, module):
84215
cached["module"] = module
85216
spec = cached["spec"]
86217
filename = spec.loader_state["filename"]
218+
path_stats = self.path_stats(filename)
87219

88220
# During the bootstrapping process, the 'basilisp.core namespace is created with
89221
# a blank module. If we do not replace the module here with the module we are
@@ -93,10 +225,13 @@ def exec_module(self, module):
93225
ns: runtime.Namespace = runtime.set_current_ns(ns_name).value
94226
ns.module = module
95227

96-
logger.debug(f"Reading and compiling Basilisp module '{fullname}''")
97-
forms = reader.read_file(filename, resolver=runtime.resolve_alias)
98-
compiler.compile_module(forms, compiler.CompilerContext(), module, filename)
99-
logger.debug(f"Loaded Basilisp module '{fullname}''")
228+
# Check if a valid, cached version of this Basilisp namespace exists and, if so,
229+
# load it and bypass the expensive compilation process below.
230+
try:
231+
self._exec_cached_module(fullname, spec.loader_state, path_stats, module)
232+
except (EOFError, ImportError, IOError, OSError) as e:
233+
logger.debug(f"Failed to load cached Basilisp module: {e}")
234+
self._exec_module(fullname, spec.loader_state, path_stats, module)
100235

101236
# Because we want to (by default) add 'basilisp.core into every namespace by default,
102237
# we want to make sure we don't try to add 'basilisp.core into itself, causing a

src/basilisp/util.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import contextlib
12
import functools
23
import inspect
34
import os.path
5+
import time
46
from typing import Optional, Callable, TypeVar, Generic
57

68
from functional import seq
@@ -34,6 +36,18 @@ def wrapper(*args, **kwargs):
3436
return wrapper
3537

3638

39+
@contextlib.contextmanager
def timed(f: Optional[Callable[[int], None]] = None):
    """Context manager that measures how long the with-block takes.

    When the block finishes without raising and f is given, f is called once
    with the elapsed time in nanoseconds (as an int). If the block raises,
    f is not called."""
    began = time.perf_counter()
    yield
    elapsed = time.perf_counter() - began
    if not f:
        return
    f(int(elapsed * 1000000000))
49+
50+
3751
T = TypeVar('T')
3852
U = TypeVar('U')
3953

0 commit comments

Comments
 (0)