Skip to content

Commit f71d96f

Browse files
committed
Update benchmarks
- Update benchmarks generally
- Benchmark against Pydantic V2 instead of V1
- Adds a few additional JSON and MessagePack libraries
- Documents the versions of libraries used for each run
- Bumps the Python version used from 3.9 to 3.11. This made several of the pure-Python libraries compared measurably faster. Yay for the Faster CPython initiative.
1 parent dceeec3 commit f71d96f

19 files changed

+872
-510
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,15 @@ support for [JSON](https://json.org), [MessagePack](https://msgpack.org),
4141

4242
- 🔍 **Zero-cost schema validation** using familiar Python type annotations. In
4343
[benchmarks](https://jcristharif.com/msgspec/benchmarks.html) `msgspec`
44-
decodes *and* validates JSON ~2x faster than
44+
decodes *and* validates JSON faster than
4545
[orjson](https://github.com/ijl/orjson) can decode it alone.
4646

4747
- **A speedy Struct type** for representing structured data. If you already
4848
use [dataclasses](https://docs.python.org/3/library/dataclasses.html) or
4949
[attrs](https://www.attrs.org),
5050
[structs](https://jcristharif.com/msgspec/structs.html) should feel familiar.
5151
However, they're
52-
[10-100x faster](https://jcristharif.com/msgspec/benchmarks.html#benchmark-structs>)
52+
[5-60x faster](https://jcristharif.com/msgspec/benchmarks.html#benchmark-structs)
5353
for common operations.
5454

5555
All of this is included in a

benchmarks/__init__.py

Whitespace-only changes.

benchmarks/bench_encodings.py

Lines changed: 140 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,149 +1,197 @@
11
from __future__ import annotations
22

3+
import sys
4+
import dataclasses
35
import json
46
import timeit
5-
from typing import List, Union
7+
import importlib.metadata
8+
from typing import Any, Literal, Callable
69

7-
import msgpack
8-
import orjson
9-
import ujson
10-
from generate_data import make_filesystem_data
10+
from .generate_data import make_filesystem_data
1111

1212
import msgspec
1313

1414

15-
class File(msgspec.Struct, tag="file"):
15+
class File(msgspec.Struct, kw_only=True, omit_defaults=True, tag="file"):
1616
name: str
1717
created_by: str
1818
created_at: str
19-
updated_at: str
19+
updated_by: str | None = None
20+
updated_at: str | None = None
2021
nbytes: int
22+
permissions: Literal["READ", "WRITE", "READ_WRITE"]
2123

2224

23-
class Directory(msgspec.Struct, tag="directory"):
25+
class Directory(msgspec.Struct, kw_only=True, omit_defaults=True, tag="directory"):
2426
name: str
2527
created_by: str
2628
created_at: str
27-
updated_at: str
28-
contents: List[Union[File, Directory]]
29+
updated_by: str | None = None
30+
updated_at: str | None = None
31+
contents: list[File | Directory]
2932

3033

31-
def bench(dumps, loads, ndata, schema=None):
32-
data = make_filesystem_data(ndata)
33-
if schema:
34-
data = msgspec.convert(data, schema)
35-
timer = timeit.Timer("func(data)", globals={"func": dumps, "data": data})
36-
n, t = timer.autorange()
37-
dumps_time = t / n
34+
@dataclasses.dataclass
35+
class Benchmark:
36+
label: str
37+
version: str
38+
encode: Callable
39+
decode: Callable
40+
schema: Any = None
3841

39-
data = dumps(data)
42+
def run(self, data: bytes) -> dict:
43+
if self.schema is not None:
44+
data = msgspec.convert(data, self.schema)
45+
timer = timeit.Timer("func(data)", globals={"func": self.encode, "data": data})
46+
n, t = timer.autorange()
47+
encode_time = t / n
4048

41-
timer = timeit.Timer("func(data)", globals={"func": loads, "data": data})
42-
n, t = timer.autorange()
43-
loads_time = t / n
44-
return dumps_time, loads_time
49+
data = self.encode(data)
4550

51+
timer = timeit.Timer("func(data)", globals={"func": self.decode, "data": data})
52+
n, t = timer.autorange()
53+
decode_time = t / n
4654

47-
def bench_msgspec_msgpack(n):
48-
schema = File if n == 1 else Directory
49-
enc = msgspec.msgpack.Encoder()
50-
dec = msgspec.msgpack.Decoder(schema)
51-
return bench(enc.encode, dec.decode, n, schema)
52-
53-
54-
def bench_msgspec_json(n):
55-
schema = File if n == 1 else Directory
56-
enc = msgspec.json.Encoder()
57-
dec = msgspec.json.Decoder(schema)
58-
return bench(enc.encode, dec.decode, n, schema)
59-
60-
61-
def bench_msgpack(n):
62-
packer = msgpack.Packer()
63-
loads = msgpack.loads
64-
return bench(packer.pack, loads, n)
55+
return {
56+
"label": self.label,
57+
"encode": encode_time,
58+
"decode": decode_time,
59+
}
6560

6661

67-
def bench_ujson(n):
68-
return bench(ujson.dumps, ujson.loads, n)
62+
def json_benchmarks():
63+
import orjson
64+
import ujson
65+
import rapidjson
66+
import simdjson
6967

68+
simdjson_ver = importlib.metadata.version("pysimdjson")
7069

71-
def bench_orjson(n):
72-
return bench(orjson.dumps, orjson.loads, n)
70+
rj_dumps = rapidjson.Encoder()
71+
rj_loads = rapidjson.Decoder()
7372

73+
def uj_dumps(obj):
74+
return ujson.dumps(obj)
7475

75-
BENCHMARKS = [
76-
("ujson", bench_ujson),
77-
("orjson", bench_orjson),
78-
("msgpack", bench_msgpack),
79-
("msgspec msgpack", bench_msgspec_msgpack),
80-
("msgspec json", bench_msgspec_json),
81-
]
82-
76+
enc = msgspec.json.Encoder()
77+
dec = msgspec.json.Decoder(Directory)
78+
dec2 = msgspec.json.Decoder()
8379

84-
def run(n, quiet=False):
85-
if quiet:
80+
return [
81+
Benchmark("msgspec structs", None, enc.encode, dec.decode, Directory),
82+
Benchmark("msgspec", msgspec.__version__, enc.encode, dec2.decode),
83+
Benchmark("json", None, json.dumps, json.loads),
84+
Benchmark("orjson", orjson.__version__, orjson.dumps, orjson.loads),
85+
Benchmark("ujson", ujson.__version__, uj_dumps, ujson.loads),
86+
Benchmark("rapidjson", rapidjson.__version__, rj_dumps, rj_loads),
87+
Benchmark("simdjson", simdjson_ver, simdjson.dumps, simdjson.loads),
88+
]
8689

87-
def log(x):
88-
pass
8990

90-
else:
91-
log = print
91+
def msgpack_benchmarks():
92+
import msgpack
93+
import ormsgpack
9294

93-
title = f"Benchmark - {n} object{'s' if n > 1 else ''}"
94-
log(title)
95+
enc = msgspec.msgpack.Encoder()
96+
dec = msgspec.msgpack.Decoder(Directory)
97+
dec2 = msgspec.msgpack.Decoder()
9598

96-
results = []
97-
for name, func in BENCHMARKS:
98-
log(name)
99-
dumps_time, loads_time = func(n)
100-
log(f" dumps: {dumps_time * 1e6:.2f} us")
101-
log(f" loads: {loads_time * 1e6:.2f} us")
102-
log(f" total: {(dumps_time + loads_time) * 1e6:.2f} us")
103-
results.append((name, dumps_time, loads_time))
104-
return results
99+
return [
100+
Benchmark("msgspec structs", None, enc.encode, dec.decode, Directory),
101+
Benchmark("msgspec", msgspec.__version__, enc.encode, dec2.decode),
102+
Benchmark("msgpack", msgpack.__version__, msgpack.dumps, msgpack.loads),
103+
Benchmark(
104+
"ormsgpack", ormsgpack.__version__, ormsgpack.packb, ormsgpack.unpackb
105+
),
106+
]
105107

106108

107109
def main():
108110
import argparse
109111

110-
bench_names = ["1", "1k"]
111-
112112
parser = argparse.ArgumentParser(
113-
description="Benchmark different python serializers"
113+
description="Benchmark different python serialization libraries"
114114
)
115115
parser.add_argument(
116-
"--benchmark",
117-
"-b",
118-
action="append",
119-
choices=["all", *bench_names],
120-
default=[],
121-
help="which benchmark(s) to run, defaults to 'all'",
116+
"--versions",
117+
action="store_true",
118+
help="Output library version info, and exit immediately",
122119
)
123120
parser.add_argument(
124-
"--json",
125-
action="store_true",
126-
help="whether to output the results as json",
121+
"-n",
122+
type=int,
123+
help="The number of objects in the generated data, defaults to 1000",
124+
default=1000,
127125
)
128126
parser.add_argument(
129-
"--no-gc",
127+
"-p",
128+
"--protocol",
129+
choices=["json", "msgpack"],
130+
default="json",
131+
help="The protocol to benchmark, defaults to JSON",
132+
)
133+
parser.add_argument(
134+
"--json",
130135
action="store_true",
131-
help="whether to disable the gc during benchmarking",
136+
help="whether to output the results as json",
132137
)
133138
args = parser.parse_args()
134139

135-
if "all" in args.benchmark or not args.benchmark:
136-
to_run = bench_names
137-
else:
138-
to_run = sorted(set(args.benchmark))
140+
benchmarks = json_benchmarks() if args.protocol == "json" else msgpack_benchmarks()
141+
142+
if args.versions:
143+
for bench in benchmarks:
144+
if bench.version is not None:
145+
print(f"- {bench.label}: {bench.version}")
146+
sys.exit(0)
139147

140-
results = {}
141-
for bench in to_run:
142-
n = 1000 if bench.startswith("1k") else 1
143-
results[bench] = run(n, quiet=args.json)
148+
data = make_filesystem_data(args.n)
149+
150+
results = [benchmark.run(data) for benchmark in benchmarks]
144151

145152
if args.json:
146-
print(json.dumps(results))
153+
for line in results:
154+
print(json.dumps(line))
155+
else:
156+
# Compose the results table
157+
results.sort(key=lambda row: row["encode"] + row["decode"])
158+
best_et = results[0]["encode"]
159+
best_dt = results[0]["decode"]
160+
best_tt = best_et + best_dt
161+
162+
columns = (
163+
"",
164+
"encode (μs)",
165+
"vs.",
166+
"decode (μs)",
167+
"vs.",
168+
"total (μs)",
169+
"vs.",
170+
)
171+
rows = [
172+
(
173+
r["label"],
174+
f"{1_000_000 * r['encode']:.1f}",
175+
f"{r['encode'] / best_et:.1f}",
176+
f"{1_000_000 * r['decode']:.1f}",
177+
f"{r['decode'] / best_dt:.1f}",
178+
f"{1_000_000 * (r['encode'] + r['decode']):.1f}",
179+
f"{(r['encode'] + r['decode']) / best_tt:.1f}",
180+
)
181+
for r in results
182+
]
183+
widths = tuple(
184+
max(max(map(len, x)), len(c)) for x, c in zip(zip(*rows), columns)
185+
)
186+
row_template = ("|" + (" %%-%ds |" * len(columns))) % widths
187+
header = row_template % tuple(columns)
188+
bar_underline = "+%s+" % "+".join("=" * (w + 2) for w in widths)
189+
bar = "+%s+" % "+".join("-" * (w + 2) for w in widths)
190+
parts = [bar, header, bar_underline]
191+
for r in rows:
192+
parts.append(row_template % r)
193+
parts.append(bar)
194+
print("\n".join(parts))
147195

148196

149197
if __name__ == "__main__":
Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@
4242
decode = orjson.loads
4343
"""
4444

45+
RAPIDJSON = """
46+
import rapidjson
47+
decode = rapidjson.loads
48+
"""
49+
4550
SIMDJSON = """
4651
import simdjson
4752
decode = simdjson.loads
@@ -81,15 +86,37 @@ class RepoData(msgspec.Struct, gc=False):
8186

8287

8388
def main():
89+
import argparse
90+
91+
parser = argparse.ArgumentParser(
92+
description="Benchmark decoding a large JSON message using various JSON libraries"
93+
)
94+
parser.add_argument(
95+
"--versions",
96+
action="store_true",
97+
help="Output library version info, and exit immediately",
98+
)
99+
args = parser.parse_args()
100+
84101
benchmarks = [
85-
("json", JSON),
86-
("ujson", UJSON),
87-
("orjson", ORJSON),
88-
("simdjson", SIMDJSON),
89-
("msgspec", MSGSPEC),
90-
("msgspec structs", MSGSPEC_STRUCTS),
102+
("json", None, JSON),
103+
("ujson", "ujson", UJSON),
104+
("orjson", "orjson", ORJSON),
105+
("rapidjson", "python-rapidjson", RAPIDJSON),
106+
("simdjson", "pysimdjson", SIMDJSON),
107+
("msgspec", "msgspec", MSGSPEC),
108+
("msgspec structs", None, MSGSPEC_STRUCTS),
91109
]
92110

111+
if args.versions:
112+
import importlib.metadata
113+
114+
for _, lib, _ in benchmarks:
115+
if lib is not None:
116+
version = importlib.metadata.version(lib)
117+
print(f"- {lib}: {version}")
118+
sys.exit(0)
119+
93120
with tempfile.NamedTemporaryFile() as f:
94121
# Download the repodata.json
95122
resp = requests.get(
@@ -102,7 +129,7 @@ def main():
102129
results = {}
103130
import ast
104131

105-
for lib, setup in benchmarks:
132+
for lib, _, setup in benchmarks:
106133
script = TEMPLATE.format(path=f.name, setup=setup)
107134
# We execute each script in a subprocess to isolate their memory usage
108135
output = subprocess.check_output([sys.executable, "-c", script])

0 commit comments

Comments
 (0)