Skip to content

Commit 657bdd8

Browse files
authored
More efficient (fixed-format) serialization (#19668)
This makes deserialization ~2.5x faster compared to `orjson`. This is fully functional, but still a PoC in terms of distribution logic. Some comments: * If you want to try this in compiled mode, simply install mypy using `MYPY_USE_MYPYC=1`; this will install the extension automatically (for now) * If you want to just play with the extension, or use it in interpreted mode, use `pip install mypyc/lib-rt` * I translated the (de-)serialization logic from the JSON methods almost verbatim (including comments) * This may still not be the most efficient way to do this, but I wanted to write something simple that probably still gets us 90% of the way there in terms of performance. I am still open to suggestions, however * Please forgive me if the PR does not look very polished; I feel tired, but needed some kind of closure on this :-) Some technical notes: * The huge `try/except` import blob in `mypy/cache.py` is temporary; it is needed for now to be able to run tests without installing mypy itself (only with `test-requirements.txt`). * There is a certain asymmetry between read and write for literals; this is intentional because we allow `complex` and/or `None` in some cases, but not in others. * The general convention is that during deserialization the type/symbol marker is consumed by the caller (except for `MypyFile`, which is special). There is no convention for the few classes that are not types/symbols. * I add a new primitive type for `native_internal.Buffer` (and possibly more types from `native` in the future) for better/automatic method call specializations. If this feels wrong/risky, I can convert this to more ad-hoc logic in `transform_call_expr()` Related issue: #3456
1 parent 0d23c61 commit 657bdd8

27 files changed

+2099
-67
lines changed

mypy/build.py

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
from typing_extensions import TypeAlias as _TypeAlias
4141

4242
import mypy.semanal_main
43+
from mypy.cache import Buffer
4344
from mypy.checker import TypeChecker
4445
from mypy.error_formatter import OUTPUT_CHOICES, ErrorFormatter
4546
from mypy.errors import CompileError, ErrorInfo, Errors, report_internal_error
@@ -1143,6 +1144,17 @@ def read_deps_cache(manager: BuildManager, graph: Graph) -> dict[str, FgDepMeta]
11431144
return module_deps_metas
11441145

11451146

1147+
def _load_ff_file(file: str, manager: BuildManager, log_error: str) -> bytes | None:
    """Read a fixed-format cache file from the metastore as raw bytes.

    Args:
        file: Name of the cache file, as understood by ``manager.metastore``.
        manager: Build manager providing the metastore, logging and stats.
        log_error: Message prefix; the file name is appended to it when the
            read fails.

    Returns:
        The bytes returned by the metastore, or None if reading raised OSError.
    """
    started = time.time()
    try:
        raw = manager.metastore.read(file)
    except OSError:
        manager.log(log_error + file)
        return None
    else:
        # The read time is only recorded for successful reads.
        manager.add_stats(metastore_read_time=time.time() - started)
        return raw
1156+
1157+
11461158
def _load_json_file(
11471159
file: str, manager: BuildManager, log_success: str, log_error: str
11481160
) -> dict[str, Any] | None:
@@ -1263,7 +1275,11 @@ def get_cache_names(id: str, path: str, options: Options) -> tuple[str, str, str
12631275
deps_json = None
12641276
if options.cache_fine_grained:
12651277
deps_json = prefix + ".deps.json"
1266-
return (prefix + ".meta.json", prefix + ".data.json", deps_json)
1278+
if options.fixed_format_cache:
1279+
data_suffix = ".data.ff"
1280+
else:
1281+
data_suffix = ".data.json"
1282+
return (prefix + ".meta.json", prefix + data_suffix, deps_json)
12671283

12681284

12691285
def find_cache_meta(id: str, path: str, manager: BuildManager) -> CacheMeta | None:
@@ -1563,8 +1579,13 @@ def write_cache(
15631579
tree.path = path
15641580

15651581
# Serialize data and analyze interface
1566-
data = tree.serialize()
1567-
data_bytes = json_dumps(data, manager.options.debug_cache)
1582+
if manager.options.fixed_format_cache:
1583+
data_io = Buffer()
1584+
tree.write(data_io)
1585+
data_bytes = data_io.getvalue()
1586+
else:
1587+
data = tree.serialize()
1588+
data_bytes = json_dumps(data, manager.options.debug_cache)
15681589
interface_hash = hash_digest(data_bytes)
15691590

15701591
plugin_data = manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=False))
@@ -2089,15 +2110,23 @@ def load_tree(self, temporary: bool = False) -> None:
20892110
self.meta is not None
20902111
), "Internal error: this method must be called only for cached modules"
20912112

2092-
data = _load_json_file(
2093-
self.meta.data_json, self.manager, "Load tree ", "Could not load tree: "
2094-
)
2113+
data: bytes | dict[str, Any] | None
2114+
if self.options.fixed_format_cache:
2115+
data = _load_ff_file(self.meta.data_json, self.manager, "Could not load tree: ")
2116+
else:
2117+
data = _load_json_file(
2118+
self.meta.data_json, self.manager, "Load tree ", "Could not load tree: "
2119+
)
20952120
if data is None:
20962121
return
20972122

20982123
t0 = time.time()
20992124
# TODO: Assert data file wasn't changed.
2100-
self.tree = MypyFile.deserialize(data)
2125+
if isinstance(data, bytes):
2126+
data_io = Buffer(data)
2127+
self.tree = MypyFile.read(data_io)
2128+
else:
2129+
self.tree = MypyFile.deserialize(data)
21012130
t1 = time.time()
21022131
self.manager.add_stats(deserialize_time=t1 - t0)
21032132
if not temporary:
@@ -2485,7 +2514,11 @@ def write_cache(self) -> None:
24852514
):
24862515
if self.options.debug_serialize:
24872516
try:
2488-
self.tree.serialize()
2517+
if self.manager.options.fixed_format_cache:
2518+
data = Buffer()
2519+
self.tree.write(data)
2520+
else:
2521+
self.tree.serialize()
24892522
except Exception:
24902523
print(f"Error serializing {self.id}", file=self.manager.stdout)
24912524
raise # Propagate to display traceback

mypy/cache.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
from __future__ import annotations
2+
3+
from collections.abc import Sequence
4+
from typing import TYPE_CHECKING, Final
5+
6+
try:
    # Explicit "X as X" re-exports make these names part of this module's API.
    from native_internal import (
        Buffer as Buffer,
        read_bool as read_bool,
        read_float as read_float,
        read_int as read_int,
        read_str as read_str,
        write_bool as write_bool,
        write_float as write_float,
        write_int as write_int,
        write_str as write_str,
    )
except ImportError:
    # TODO: temporary, remove this after we publish mypy-native on PyPI.
    # Runtime-only placeholders: they keep this module importable when the
    # native extension is missing, and every one of them raises
    # NotImplementedError as soon as it is actually used.
    if not TYPE_CHECKING:

        class Buffer:
            """Placeholder for ``native_internal.Buffer``; cannot be instantiated."""

            def __init__(self, source: bytes = b"") -> None:
                raise NotImplementedError

            def getvalue(self) -> bytes:
                raise NotImplementedError

        # Reader placeholders.

        def read_int(data: Buffer) -> int:
            raise NotImplementedError

        def read_str(data: Buffer) -> str:
            raise NotImplementedError

        def read_bool(data: Buffer) -> bool:
            raise NotImplementedError

        def read_float(data: Buffer) -> float:
            raise NotImplementedError

        # Writer placeholders.

        def write_int(data: Buffer, value: int) -> None:
            raise NotImplementedError

        def write_str(data: Buffer, value: str) -> None:
            raise NotImplementedError

        def write_bool(data: Buffer, value: bool) -> None:
            raise NotImplementedError

        def write_float(data: Buffer, value: float) -> None:
            raise NotImplementedError
52+
53+
54+
# Integer markers written ahead of a literal value by write_literal() to
# identify the kind of payload that follows.
LITERAL_INT: Final = 1
LITERAL_STR: Final = 2
LITERAL_BOOL: Final = 3
LITERAL_FLOAT: Final = 4
LITERAL_COMPLEX: Final = 5
LITERAL_NONE: Final = 6
60+
61+
62+
def read_literal(data: Buffer, marker: int) -> int | str | bool | float:
    """Read the payload of a literal whose marker was already consumed.

    Only the int, str, bool and float markers are handled here; any other
    marker value trips the assertion at the end.
    """
    if marker == LITERAL_INT:
        return read_int(data)
    if marker == LITERAL_STR:
        return read_str(data)
    if marker == LITERAL_BOOL:
        return read_bool(data)
    if marker == LITERAL_FLOAT:
        return read_float(data)
    assert False, f"Unknown literal marker {marker}"
72+
73+
74+
def write_literal(data: Buffer, value: int | str | bool | float | complex | None) -> None:
    """Write a literal as an integer marker followed by its payload.

    A complex value is stored as two floats (real part, then imaginary part).
    A value matching none of the checked types is stored as the None marker
    with no payload.
    """
    # bool has to be tested ahead of int, because bool is a subclass of int.
    if isinstance(value, bool):
        write_int(data, LITERAL_BOOL)
        write_bool(data, value)
        return
    if isinstance(value, int):
        write_int(data, LITERAL_INT)
        write_int(data, value)
        return
    if isinstance(value, str):
        write_int(data, LITERAL_STR)
        write_str(data, value)
        return
    if isinstance(value, float):
        write_int(data, LITERAL_FLOAT)
        write_float(data, value)
        return
    if isinstance(value, complex):
        write_int(data, LITERAL_COMPLEX)
        write_float(data, value.real)
        write_float(data, value.imag)
        return
    write_int(data, LITERAL_NONE)
93+
94+
95+
def read_int_opt(data: Buffer) -> int | None:
    """Read an optional int: a presence flag, then the value if the flag is set."""
    has_value = read_bool(data)
    if not has_value:
        return None
    return read_int(data)
99+
100+
101+
def write_int_opt(data: Buffer, value: int | None) -> None:
    """Write an optional int: a presence flag, then the value unless it is None."""
    if value is None:
        write_bool(data, False)
        return
    write_bool(data, True)
    write_int(data, value)
107+
108+
109+
def read_str_opt(data: Buffer) -> str | None:
    """Read an optional str: a presence flag, then the value if the flag is set."""
    has_value = read_bool(data)
    if not has_value:
        return None
    return read_str(data)
113+
114+
115+
def write_str_opt(data: Buffer, value: str | None) -> None:
    """Write an optional str: a presence flag, then the value unless it is None."""
    if value is None:
        write_bool(data, False)
        return
    write_bool(data, True)
    write_str(data, value)
121+
122+
123+
def read_int_list(data: Buffer) -> list[int]:
    """Read a length-prefixed list of ints."""
    count = read_int(data)
    items: list[int] = []
    for _ in range(count):
        items.append(read_int(data))
    return items
126+
127+
128+
def write_int_list(data: Buffer, value: list[int]) -> None:
    """Write a list of ints as its length followed by each element in order."""
    count = len(value)
    write_int(data, count)
    for element in value:
        write_int(data, element)
132+
133+
134+
def read_str_list(data: Buffer) -> list[str]:
    """Read a length-prefixed list of strings."""
    count = read_int(data)
    items: list[str] = []
    for _ in range(count):
        items.append(read_str(data))
    return items
137+
138+
139+
def write_str_list(data: Buffer, value: Sequence[str]) -> None:
    """Write a sequence of strings as its length followed by each element in order."""
    count = len(value)
    write_int(data, count)
    for element in value:
        write_str(data, element)
143+
144+
145+
def read_str_opt_list(data: Buffer) -> list[str | None]:
    """Read a length-prefixed list whose elements are optional strings."""
    count = read_int(data)
    items: list[str | None] = []
    for _ in range(count):
        items.append(read_str_opt(data))
    return items
148+
149+
150+
def write_str_opt_list(data: Buffer, value: list[str | None]) -> None:
    """Write a list of optional strings as its length followed by each element."""
    count = len(value)
    write_int(data, count)
    for element in value:
        write_str_opt(data, element)

mypy/fixup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,8 @@ def visit_type_info(self, info: TypeInfo) -> None:
9797
info.declared_metaclass.accept(self.type_fixer)
9898
if info.metaclass_type:
9999
info.metaclass_type.accept(self.type_fixer)
100+
if info.self_type:
101+
info.self_type.accept(self.type_fixer)
100102
if info.alt_promote:
101103
info.alt_promote.accept(self.type_fixer)
102104
instance = Instance(info, [])

mypy/main.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1056,6 +1056,9 @@ def add_invertible_flag(
10561056
action="store_true",
10571057
help="Include fine-grained dependency information in the cache for the mypy daemon",
10581058
)
1059+
incremental_group.add_argument(
1060+
"--fixed-format-cache", action="store_true", help=argparse.SUPPRESS
1061+
)
10591062
incremental_group.add_argument(
10601063
"--skip-version-check",
10611064
action="store_true",

mypy/modulefinder.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -796,6 +796,7 @@ def default_lib_path(
796796
custom_typeshed_dir = os.path.abspath(custom_typeshed_dir)
797797
typeshed_dir = os.path.join(custom_typeshed_dir, "stdlib")
798798
mypy_extensions_dir = os.path.join(custom_typeshed_dir, "stubs", "mypy-extensions")
799+
mypy_native_dir = os.path.join(custom_typeshed_dir, "stubs", "mypy-native")
799800
versions_file = os.path.join(typeshed_dir, "VERSIONS")
800801
if not os.path.isdir(typeshed_dir) or not os.path.isfile(versions_file):
801802
print(
@@ -811,11 +812,13 @@ def default_lib_path(
811812
data_dir = auto
812813
typeshed_dir = os.path.join(data_dir, "typeshed", "stdlib")
813814
mypy_extensions_dir = os.path.join(data_dir, "typeshed", "stubs", "mypy-extensions")
815+
mypy_native_dir = os.path.join(data_dir, "typeshed", "stubs", "mypy-native")
814816
path.append(typeshed_dir)
815817

816-
# Get mypy-extensions stubs from typeshed, since we treat it as an
817-
# "internal" library, similar to typing and typing-extensions.
818+
# Get mypy-extensions and mypy-native stubs from typeshed, since we treat them as
819+
# "internal" libraries, similar to typing and typing-extensions.
818820
path.append(mypy_extensions_dir)
821+
path.append(mypy_native_dir)
819822

820823
# Add fallback path that can be used if we have a broken installation.
821824
if sys.platform != "win32":

0 commit comments

Comments
 (0)