Skip to content

Commit 3e1c623

Browse files
jiceathome and juagargi
authored and committed
router: make the benchmark and the router more adaptive to core count (scionproto#4456)
In the router. Compute a better default for the number of processors based on multiple observations: given enough streams to occupy all processors, performance is degraded if there are as many processors as cores. Best performance is obtained by setting aside two cores for other tasks. Making this the default simplifies the benchmarking process by not having to configure an override for each type of target machine being tested. In the benchmark. Run the test with many streams. It reflects reality and allows a very parallel router to perform at its best without distorting the performance of a less-parallel one. Make sure that the number of streams is likely to be a multiple of the number of cores so the cores are loaded evenly in spite of there being typically fewer streams than in real life. (TBD, learn the router's internal processors count and just use that). As a side effect: * Adjust CI's expectations for the benchmark * Add automatic retries (2) for failed CI tests. * Allow retrying successful CI tests (helps investigating performance variations). * Add cpu_info to the router_benchmark output (same reason).
1 parent bb483d9 commit 3e1c623

File tree

3 files changed

+44
-16
lines changed

3 files changed

+44
-16
lines changed

.buildkite/pipeline_lib.sh

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,8 +53,12 @@ gen_bazel_test_steps() {
5353
echo " - \"bazel-testlogs.tar.gz\""
5454
echo " timeout_in_minutes: 20"
5555
echo " retry:"
56+
echo " manual:"
57+
echo " permit_on_passed: true"
5658
echo " automatic:"
5759
echo " - exit_status: -1 # Agent was lost"
5860
echo " - exit_status: 255 # Forced agent shutdown"
61+
echo " - exit_status: 3 # Test may be flaky or it just didn't pass"
62+
echo " limit: 2"
5963
done
6064
}

acceptance/router_benchmark/test.py

Lines changed: 23 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020

2121
from collections import namedtuple
2222
from plumbum import cli
23-
from plumbum.cmd import docker, whoami
23+
from plumbum.cmd import docker, whoami, cat
2424
from plumbum import cmd
2525

2626
from acceptance.common import base
@@ -34,11 +34,11 @@
3434

3535
# Those values are valid expectations only when running in the CI environment.
3636
TEST_CASES = {
37-
'in': 270000,
38-
'out': 240000,
39-
'in_transit': 270000,
40-
'out_transit': 240000,
41-
'br_transit': 260000,
37+
'in': 290000,
38+
'out': 290000,
39+
'in_transit': 250000,
40+
'out_transit': 250000,
41+
'br_transit': 290000,
4242
}
4343

4444

@@ -60,6 +60,12 @@ def mac_for_ip(ip: str) -> str:
6060
return 'f0:0d:ca:fe:{:02x}:{:02x}'.format(int(ipBytes[2]), int(ipBytes[3]))
6161

6262

63+
# Dump the cpu info into the log just to inform our thoughts on performance variability.
64+
def log_cpu_info():
65+
cpu_info = cat('/proc/cpuinfo')
66+
logger.info(f"CPU INFO BEGINS\n{cpu_info}\nCPU_INFO ENDS")
67+
68+
6369
class RouterBMTest(base.TestBase):
6470
"""
6571
Tests that the implementation of a router has sufficient performance in terms of packets
@@ -148,6 +154,9 @@ def create_interface(self, req: IntfReq, ns: str):
148154
def setup_prepare(self):
149155
super().setup_prepare()
150156

157+
# As the name inplies.
158+
log_cpu_info()
159+
151160
# get the config where the router can find it.
152161
shutil.copytree("acceptance/router_benchmark/conf/", self.artifacts / "conf")
153162

@@ -221,13 +230,16 @@ def teardown(self):
221230

222231
def execBrLoad(self, case: str, mapArgs: list[str], count: int) -> str:
223232
brload = self.get_executable("brload")
233+
# For num-streams, attempt to distribute uniformly on many possible number of cores.
234+
# 840 is a multiple of 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 15, 20, 21, 24, 28, ...
235+
# We can't go too far down that road; the test has to build one packet for each stream.
224236
return sudo(brload.executable,
225237
"run",
226238
"--artifacts", self.artifacts,
227239
*mapArgs,
228240
"--case", case,
229241
"--num-packets", str(count),
230-
"--num-streams", "2")
242+
"--num-streams", "840")
231243

232244
def runTestCase(self, case: str, mapArgs: list[str]):
233245
logger.info(f"==> Starting load {case}")
@@ -242,9 +254,8 @@ def runTestCase(self, case: str, mapArgs: list[str]):
242254

243255
# The raw metrics are expressed in terms of core*seconds. We convert to machine*seconds
244256
# which allows us to provide a projected packet/s; ...more intuitive than packets/core*s.
245-
# We're interested only in br_transit traffic. We measure the rate over 10s. For best
246-
# results we sample the end of the middle 10s of the run. "beg" is the start time of the
247-
# real action and "end" is the end time.
257+
# We measure the rate over 10s. For best results we sample the end of the middle 10s of the
258+
# run. "beg" is the start time of the real action and "end" is the end time.
248259
sampleTime = (int(beg) + int(end) + 10) / 2
249260
promQuery = urlencode({
250261
'time': f'{sampleTime}',
@@ -300,9 +311,9 @@ def _run(self):
300311
for label, intf in self.intfMap.items():
301312
mapArgs.extend(["--interface", f"{label}={intf.name},{intf.mac},{intf.peerMac}"])
302313

303-
# Run one (10% size) test as warm-up to trigger the frequency scaling, else the first test
314+
# Run one (30% size) test as warm-up to trigger the frequency scaling, else the first test
304315
# gets much lower performance.
305-
self.execBrLoad(list(TEST_CASES)[0], mapArgs, 1000000)
316+
self.execBrLoad(list(TEST_CASES)[0], mapArgs, 3000000)
306317

307318
# At long last, run the tests
308319
rateMap = {}

router/config/config.go

Lines changed: 17 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -74,11 +74,24 @@ func (cfg *RouterConfig) Validate() error {
7474
func (cfg *RouterConfig) InitDefaults() {
7575

7676
// NumProcessors is the number of goroutines used to handle the processing queue.
77-
// By default, there are as many as cores allowed by Go and other goroutines displace
78-
// the packet processors sporadically. It may be either good or bad to create more
79-
// processors (plus the other goroutines) than there are cores... experience will tell.
77+
// It has been observed that allowing the packet processors starve the other tasks was
78+
// counterproductive. We get much better results by setting two cores aside for other go
79+
// routines (such as for input and output). It remains to be seen if even more cores need to be
80+
// set aside on large core-count systems.
81+
8082
if cfg.NumProcessors == 0 {
81-
cfg.NumProcessors = runtime.GOMAXPROCS(0)
83+
// Do what we think is best, so in most cases there's no need for an explicit config.
84+
maxProcs := runtime.GOMAXPROCS(0)
85+
if maxProcs > 3 {
86+
// Two for I/Os, two or more for processing.
87+
cfg.NumProcessors = maxProcs - 2
88+
} else if maxProcs > 1 {
89+
// I/Os <= processing.
90+
cfg.NumProcessors = maxProcs - 1
91+
} else {
92+
// No choice.
93+
cfg.NumProcessors = maxProcs
94+
}
8295
}
8396

8497
if cfg.NumSlowPathProcessors == 0 {

0 commit comments

Comments
 (0)