Skip to content

Commit 3e1c623

Browse files
jiceathome and juagargi
authored and committed
router: make the benchmark and the router more adaptive to core count (scionproto#4456)
In the router. Compute a better default for the number of processors based on multiple observations: given enough streams to occupy all processors, performance is degraded if there are as many processors as cores. Best performance is obtained by setting aside two cores for other tasks. Making this the default simplifies the benchmarking process by not having to configure an override for each type of target machine being tested. In the benchmark. Run the test with many streams. It reflects reality and allows a very parallel router to perform at its best without distorting the performance of a less-parallel one. Make sure that the number of streams is likely to be a multiple of the number of cores so the cores are loaded evenly in spite of there being typically fewer streams than in real life. (TBD, learn the router's internal processors count and just use that). As a side effect: * Adjust CI's expectations for the benchmark * Add automatic retries (2) for failed CI tests. * Allow retrying successful CI tests (helps investigating performance variations). * Add cpu_info to the router_benchmark output (same reason).
1 parent bb483d9 commit 3e1c623

File tree

3 files changed

+44
-16
lines changed

3 files changed

+44
-16
lines changed

.buildkite/pipeline_lib.sh

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,8 +53,12 @@ gen_bazel_test_steps() {
5353
echo " - \"bazel-testlogs.tar.gz\""
5454
echo " timeout_in_minutes: 20"
5555
echo " retry:"
56+
echo " manual:"
57+
echo " permit_on_passed: true"
5658
echo " automatic:"
5759
echo " - exit_status: -1 # Agent was lost"
5860
echo " - exit_status: 255 # Forced agent shutdown"
61+
echo " - exit_status: 3 # Test may be flaky or it just didn't pass"
62+
echo " limit: 2"
5963
done
6064
}

acceptance/router_benchmark/test.py

Lines changed: 23 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020

2121
from collections import namedtuple
2222
from plumbum import cli
23-
from plumbum.cmd import docker, whoami
23+
from plumbum.cmd import docker, whoami, cat
2424
from plumbum import cmd
2525

2626
from acceptance.common import base
@@ -34,11 +34,11 @@
3434

3535
# Those values are valid expectations only when running in the CI environment.
3636
TEST_CASES = {
37-
'in': 270000,
38-
'out': 240000,
39-
'in_transit': 270000,
40-
'out_transit': 240000,
41-
'br_transit': 260000,
37+
'in': 290000,
38+
'out': 290000,
39+
'in_transit': 250000,
40+
'out_transit': 250000,
41+
'br_transit': 290000,
4242
}
4343

4444

@@ -60,6 +60,12 @@ def mac_for_ip(ip: str) -> str:
6060
return 'f0:0d:ca:fe:{:02x}:{:02x}'.format(int(ipBytes[2]), int(ipBytes[3]))
6161

6262

63+
# Dump the cpu info into the log just to inform our thoughts on performance variability.
64+
def log_cpu_info():
65+
cpu_info = cat('/proc/cpuinfo')
66+
logger.info(f"CPU INFO BEGINS\n{cpu_info}\nCPU_INFO ENDS")
67+
68+
6369
class RouterBMTest(base.TestBase):
6470
"""
6571
Tests that the implementation of a router has sufficient performance in terms of packets
@@ -148,6 +154,9 @@ def create_interface(self, req: IntfReq, ns: str):
148154
def setup_prepare(self):
149155
super().setup_prepare()
150156

157+
# As the name inplies.
158+
log_cpu_info()
159+
151160
# get the config where the router can find it.
152161
shutil.copytree("acceptance/router_benchmark/conf/", self.artifacts / "conf")
153162

@@ -221,13 +230,16 @@ def teardown(self):
221230

222231
def execBrLoad(self, case: str, mapArgs: list[str], count: int) -> str:
223232
brload = self.get_executable("brload")
233+
# For num-streams, attempt to distribute uniformly on many possible number of cores.
234+
# 840 is a multiple of 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 15, 20, 21, 24, 28, ...
235+
# We can't go too far down that road; the test has to build one packet for each stream.
224236
return sudo(brload.executable,
225237
"run",
226238
"--artifacts", self.artifacts,
227239
*mapArgs,
228240
"--case", case,
229241
"--num-packets", str(count),
230-
"--num-streams", "2")
242+
"--num-streams", "840")
231243

232244
def runTestCase(self, case: str, mapArgs: list[str]):
233245
logger.info(f"==> Starting load {case}")
@@ -242,9 +254,8 @@ def runTestCase(self, case: str, mapArgs: list[str]):
242254

243255
# The raw metrics are expressed in terms of core*seconds. We convert to machine*seconds
244256
# which allows us to provide a projected packet/s; ...more intuitive than packets/core*s.
245-
# We're interested only in br_transit traffic. We measure the rate over 10s. For best
246-
# results we sample the end of the middle 10s of the run. "beg" is the start time of the
247-
# real action and "end" is the end time.
257+
# We measure the rate over 10s. For best results we sample the end of the middle 10s of the
258+
# run. "beg" is the start time of the real action and "end" is the end time.
248259
sampleTime = (int(beg) + int(end) + 10) / 2
249260
promQuery = urlencode({
250261
'time': f'{sampleTime}',
@@ -300,9 +311,9 @@ def _run(self):
300311
for label, intf in self.intfMap.items():
301312
mapArgs.extend(["--interface", f"{label}={intf.name},{intf.mac},{intf.peerMac}"])
302313

303-
# Run one (10% size) test as warm-up to trigger the frequency scaling, else the first test
314+
# Run one (30% size) test as warm-up to trigger the frequency scaling, else the first test
304315
# gets much lower performance.
305-
self.execBrLoad(list(TEST_CASES)[0], mapArgs, 1000000)
316+
self.execBrLoad(list(TEST_CASES)[0], mapArgs, 3000000)
306317

307318
# At long last, run the tests
308319
rateMap = {}

router/config/config.go

Lines changed: 17 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -74,11 +74,24 @@ func (cfg *RouterConfig) Validate() error {
7474
func (cfg *RouterConfig) InitDefaults() {
7575

7676
// NumProcessors is the number of goroutines used to handle the processing queue.
77-
// By default, there are as many as cores allowed by Go and other goroutines displace
78-
// the packet processors sporadically. It may be either good or bad to create more
79-
// processors (plus the other goroutines) than there are cores... experience will tell.
77+
// It has been observed that allowing the packet processors starve the other tasks was
78+
// counterproductive. We get much better results by setting two cores aside for other go
79+
// routines (such as for input and output). It remains to be seen if even more cores need to be
80+
// set aside on large core-count systems.
81+
8082
if cfg.NumProcessors == 0 {
81-
cfg.NumProcessors = runtime.GOMAXPROCS(0)
83+
// Do what we think is best, so in most cases there's no need for an explicit config.
84+
maxProcs := runtime.GOMAXPROCS(0)
85+
if maxProcs > 3 {
86+
// Two for I/Os, two or more for processing.
87+
cfg.NumProcessors = maxProcs - 2
88+
} else if maxProcs > 1 {
89+
// I/Os <= processing.
90+
cfg.NumProcessors = maxProcs - 1
91+
} else {
92+
// No choice.
93+
cfg.NumProcessors = maxProcs
94+
}
8295
}
8396

8497
if cfg.NumSlowPathProcessors == 0 {

0 commit comments

Comments
 (0)