Skip to content

Commit d1deb0c

Browse files
committed
Adds a benchmark to the documentation
1 parent e2f8c14 commit d1deb0c

File tree

4 files changed

+312
-1
lines changed

4 files changed

+312
-1
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dist/*
88
build/*
99
.eggs/*
1010
*egg-info/*
11+
_doc/examples/_cache/*
1112
_doc/auto_examples/*
1213
_doc/examples/plot_*.png
1314
_doc/_static/require.js

_doc/examples/plot_benchmark_rf.py

Lines changed: 298 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,298 @@
1+
"""
2+
.. _l-example-benchmark-tree-implementation:
3+
4+
Benchmark of TreeEnsemble implementation
5+
========================================
6+
7+
The following example compares the inference time between
8+
:epkg:`onnxruntime` and :class:`sklearn.ensemble.RandomForestRegressor`,
9+
for different numbers of estimators, max depth, and parallelization.
10+
It does it for a fixed number of rows and features.
11+
12+
import and registration of necessary converters
13+
++++++++++++++++++++++++++++++++++++++++++++++++
14+
"""
15+
import pickle
16+
import os
17+
import time
18+
from itertools import product
19+
20+
import matplotlib.pyplot as plt
21+
import numpy
22+
import pandas
23+
from lightgbm import LGBMRegressor
24+
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm
25+
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
26+
from onnxruntime import InferenceSession, SessionOptions
27+
from psutil import cpu_count
28+
from pyquickhelper.loghelper import run_cmd
29+
from skl2onnx import to_onnx, update_registered_converter
30+
from skl2onnx.common.shape_calculator import calculate_linear_regressor_output_shapes
31+
from sklearn import set_config
32+
from sklearn.ensemble import RandomForestRegressor
33+
from tqdm import tqdm
34+
from xgboost import XGBRegressor
35+
36+
37+
def skl2onnx_convert_lightgbm(scope, operator, container):
    """
    Converter wrapper around :func:`convert_lightgbm`.

    It reads the options attached to the model being converted,
    stores the value of option ``split`` (``None`` if the option is
    missing) into ``operator.split`` and then delegates the conversion
    to :func:`convert_lightgbm`.

    :param scope: object exposing ``get_options``
    :param operator: operator to convert, its attribute ``raw_operator``
        is used to retrieve the options
    :param container: forwarded unchanged to :func:`convert_lightgbm`
    """
    opts = scope.get_options(operator.raw_operator)
    operator.split = opts["split"] if "split" in opts else None
    convert_lightgbm(scope, operator, container)
44+
45+
46+
update_registered_converter(
47+
LGBMRegressor,
48+
"LightGbmLGBMRegressor",
49+
calculate_linear_regressor_output_shapes,
50+
skl2onnx_convert_lightgbm,
51+
options={"split": None},
52+
)
53+
update_registered_converter(
54+
XGBRegressor,
55+
"XGBoostXGBRegressor",
56+
calculate_linear_regressor_output_shapes,
57+
convert_xgboost,
58+
)
59+
60+
# The following instruction reduces the time spent by scikit-learn
61+
# to validate the data.
62+
set_config(assume_finite=True)
63+
64+
##########################################
65+
# Machine details
66+
# +++++++++++++++
67+
68+
69+
print(f"Number of cores: {cpu_count()}")
70+
71+
###############################################
72+
# But this information is not usually enough.
73+
# Let's extract the cache information.
74+
75+
out, err = run_cmd("lscpu")
76+
print(out)
77+
78+
###############################################
79+
# Or with the following command.
80+
out, err = run_cmd("cat /proc/cpuinfo")
81+
82+
###############################################
83+
# Function to measure inference time
84+
# ++++++++++++++++++++++++++++++++++
85+
86+
87+
def measure_inference(fct, X, repeat, max_time=5, quantile=1):
    """
    Run *repeat* times the same function on data *X*.

    :param fct: function to run, it is called as ``fct(X)``
    :param X: data
    :param repeat: maximum number of times to run, at least 1
    :param max_time: maximum time (in seconds) to use to measure
        the inference, the limit is only checked once three runs
        are done, None disables it
    :param quantile: number of fastest and number of slowest runs
        excluded from the average, nothing is excluded if fewer
        than three runs would remain
    :return: number of runs, sum of the time, average, median
    :raises ValueError: if *repeat* is lower than 1
    """
    if repeat < 1:
        # Without any run, the average and the median are undefined.
        raise ValueError(f"repeat must be at least 1, not {repeat!r}.")

    times = []
    elapsed = 0.0
    for _ in range(repeat):
        begin = time.perf_counter()
        fct(X)
        delta = time.perf_counter() - begin
        times.append(delta)
        elapsed += delta
        # At least three runs are always done before looking at the budget.
        if len(times) < 3:
            continue
        if max_time is not None and elapsed >= max_time:
            break

    times.sort()
    # Extreme values are only dropped if three measures remain after that.
    trim = quantile if (len(times) - quantile * 2) >= 3 else 0
    kept = times[trim:-trim] if trim != 0 else times
    # The median is the upper one when the number of runs is even.
    return (len(times), sum(times), sum(kept) / len(kept), times[len(times) // 2])
114+
115+
116+
###############################################
117+
# Benchmark
118+
# +++++++++
119+
#
120+
# The following script benchmarks the inference for the same
121+
# model for a random forest and onnxruntime after it was converted
122+
# into ONNX and for the following configurations.
123+
124+
legend = "parallel-batch-4096-block"
125+
126+
small = cpu_count() < 12
127+
if small:
128+
N = 1000
129+
n_features = 10
130+
n_jobs = [1, cpu_count() // 2, cpu_count()]
131+
n_ests = [10, 20, 30]
132+
depth = [4, 6, 8, 10]
133+
Regressor = RandomForestRegressor
134+
else:
135+
N = 100000
136+
n_features = 50
137+
n_jobs = [cpu_count(), cpu_count() // 2, 1]
138+
n_ests = [100, 200, 400]
139+
depth = [6, 8, 10, 12, 14]
140+
Regressor = RandomForestRegressor
141+
142+
143+
##############################################
144+
# Benchmark parameters
145+
repeat = 7 # repeat n times the same inference
146+
quantile = 1 # exclude extreme times
147+
max_time = 5 # maximum number of seconds to spend on one configuration
148+
149+
##############################################
150+
# Data
151+
152+
153+
X = numpy.random.randn(N, n_features).astype(numpy.float32)
154+
noise = (numpy.random.randn(X.shape[0]) / (n_features // 5)).astype(numpy.float32)
155+
y = X.mean(axis=1) + noise
156+
n_train = min(N, N // 3)
157+
158+
159+
data = []
160+
couples = list(product(n_jobs, depth, n_ests))
161+
bar = tqdm(couples)
162+
cache_dir = "_cache"
163+
if not os.path.exists(cache_dir):
164+
os.mkdir(cache_dir)
165+
166+
for n_j, max_depth, n_estimators in bar:
167+
if n_j == 1 and n_estimators > n_ests[0]:
168+
# skipping
169+
continue
170+
171+
# parallelization
172+
cache_name = os.path.join(
173+
cache_dir, f"rf-J-{n_j}-E-{n_estimators}-D-{max_depth}.pkl"
174+
)
175+
if os.path.exists(cache_name):
176+
with open(cache_name, "rb") as f:
177+
rf = pickle.load(f)
178+
else:
179+
bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} train rf")
180+
if n_j == 1 and issubclass(Regressor, RandomForestRegressor):
181+
rf = Regressor(max_depth=max_depth, n_estimators=n_estimators, n_jobs=-1)
182+
rf.fit(X[:n_train], y[:n_train])
183+
rf.n_jobs = 1
184+
else:
185+
rf = Regressor(max_depth=max_depth, n_estimators=n_estimators, n_jobs=n_j)
186+
rf.fit(X[:n_train], y[:n_train])
187+
with open(cache_name, "wb") as f:
188+
pickle.dump(rf, f)
189+
190+
bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} ISession")
191+
so = SessionOptions()
192+
so.intra_op_num_threads = n_j
193+
cache_name = os.path.join(
194+
cache_dir, f"rf-J-{n_j}-E-{n_estimators}-D-{max_depth}.onnx"
195+
)
196+
if os.path.exists(cache_name):
197+
sess = InferenceSession(cache_name, so)
198+
else:
199+
bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} cvt onnx")
200+
onx = to_onnx(rf, X[:1])
201+
with open(cache_name, "wb") as f:
202+
f.write(onx.SerializeToString())
203+
sess = InferenceSession(cache_name, so)
204+
205+
# run once to avoid counting the first run
206+
bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} predict1")
207+
rf.predict(X)
208+
sess.run(None, {"X": X})
209+
210+
# fixed data
211+
obs = dict(
212+
n_jobs=n_j,
213+
max_depth=max_depth,
214+
n_estimators=n_estimators,
215+
repeat=repeat,
216+
max_time=max_time,
217+
name=rf.__class__.__name__,
218+
n_rows=X.shape[0],
219+
n_features=X.shape[1],
220+
)
221+
222+
# baseline
223+
bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} predictB")
224+
r, t, mean, med = measure_inference(rf.predict, X, repeat=repeat, max_time=max_time)
225+
o1 = obs.copy()
226+
o1.update(dict(avg=mean, med=med, n_runs=r, ttime=t, name="base"))
227+
data.append(o1)
228+
229+
# onnxruntime
230+
bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} predictO")
231+
r, t, mean, med = measure_inference(
232+
lambda x: sess.run(None, {"X": x}), X, repeat=repeat, max_time=max_time
233+
)
234+
o2 = obs.copy()
235+
o2.update(dict(avg=mean, med=med, n_runs=r, ttime=t, name="ort_"))
236+
data.append(o2)
237+
238+
239+
###################################################
240+
# Saving data
241+
# +++++++++++
242+
243+
# Base name shared by the CSV file written below and by the figure
# saved at the end of the script, it matches the name of this example.
name = "plot_benchmark_rf"
print(f"Saving data into {name!r}")

# One row per measure stored in *data*.
df = pandas.DataFrame(data)
# The legend is added to a copy so that *df* only holds the measures.
df2 = df.copy()
df2["legend"] = legend
df2.to_csv(f"{name}-{legend}.csv", index=False)
250+
251+
#######################################################
252+
# Printing the data
253+
print(df)
254+
255+
#####################################################
256+
# Plot
257+
# ++++
258+
259+
n_rows = len(n_jobs)
260+
n_cols = len(n_ests)
261+
262+
263+
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows))
264+
fig.suptitle(f"{rf.__class__.__name__}")
265+
266+
for n_j, n_estimators in tqdm(product(n_jobs, n_ests)):
267+
i = n_jobs.index(n_j)
268+
j = n_ests.index(n_estimators)
269+
ax = axes[i, j]
270+
271+
subdf = df[(df.n_estimators == n_estimators) & (df.n_jobs == n_j)]
272+
if subdf.shape[0] == 0:
273+
continue
274+
piv = subdf.pivot(index="max_depth", columns=["name"], values=["avg", "med"])
275+
piv.plot(ax=ax, title=f"jobs={n_j}, trees={n_estimators}")
276+
ax.set_ylabel(f"n_jobs={n_j}", fontsize="small")
277+
ax.set_xlabel("max_depth", fontsize="small")
278+
279+
# ratio
280+
print(subdf)
281+
ax2 = ax.twinx()
282+
piv1 = subdf.pivot(index="max_depth", columns="name", values="avg")
283+
piv1["speedup"] = piv1.base / piv1.ort_
284+
ax2.plot(piv1.index, piv1.speedup, "b--", label="speedup avg")
285+
286+
piv1 = subdf.pivot(index="max_depth", columns="name", values="med")
287+
piv1["speedup"] = piv1.base / piv1.ort_
288+
print(piv1)
289+
ax2.plot(piv1.index, piv1.speedup, "y--", label="speedup med")
290+
ax2.legend(fontsize="x-small")
291+
292+
for i in range(axes.shape[0]):
293+
for j in range(axes.shape[1]):
294+
axes[i, j].legend(fontsize="small")
295+
296+
fig.tight_layout()
297+
fig.savefig(f"{name}-{legend}.png")
298+
# plt.show()

_doc/tutorial/benchmarks.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
.. _l-benchmarks:
2+
3+
==========
4+
Benchmarks
5+
==========
6+
7+
A list of benchmarks used to improve the performance of
8+
ONNX components (onnx, onnxruntime, onnx-array-api, ...).
9+
10+
.. toctree::
11+
12+
../auto_examples/plot_benchmark_rf

_doc/tutorial/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@ Tutorial
77
:maxdepth: 1
88

99
overview
10-
10+
benchmarks

0 commit comments

Comments
 (0)