test(perf): Aggregate snapshot latencies to average

roypat · roypat · commit 473161c24d9b · 2023-07-13T12:23:20.000Z
The statistical test used by the Firecracker performance tests is a
simplified t-test. It relies on the central limit theorem, and is
thus only valid for testing _averages_. Nevertheless, the snapshot tests
used it to detect deviations in P90/P50 values. This is unsound, and
the results are almost certainly not what was intended. This commit
changes the aggregate collected from the snapshot tests to an average
instead, which makes the test sound.

Note that since we also emit the individual data points, we can always
reconstruct percentiles.

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
diff --git a/tests/integration_tests/performance/configs/test_snapshot_restore_performance_config_4.14.json b/tests/integration_tests/performance/configs/test_snapshot_restore_performance_config_4.14.json
@@ -4475,14 +4475,7 @@
         "latency": {
             "statistics": [
                 {
-                    "criteria": "EqualWith",
-                    "function": "Percentile50",
-                    "name": "P50"
-                },
-                {
-                    "criteria": "EqualWith",
-                    "function": "Percentile90",
-                    "name": "P90"
+                    "function": "Avg"
                 }
             ],
             "unit": "ms"
diff --git a/tests/integration_tests/performance/configs/test_snapshot_restore_performance_config_5.10.json b/tests/integration_tests/performance/configs/test_snapshot_restore_performance_config_5.10.json
@@ -4149,14 +4149,7 @@
         "latency": {
             "statistics": [
                 {
-                    "criteria": "EqualWith",
-                    "function": "Percentile50",
-                    "name": "P50"
-                },
-                {
-                    "criteria": "EqualWith",
-                    "function": "Percentile90",
-                    "name": "P90"
+                    "function": "Avg"
                 }
             ],
             "unit": "ms"
diff --git a/tests/integration_tests/performance/configs/test_snapshot_restore_performance_config_6.1.json b/tests/integration_tests/performance/configs/test_snapshot_restore_performance_config_6.1.json
@@ -4149,12 +4149,7 @@
         "latency": {
             "statistics": [
                 {
-                    "function": "Percentile50",
-                    "name": "P50"
-                },
-                {
-                    "function": "Percentile90",
-                    "name": "P90"
+                    "function": "Avg"
                 }
             ],
             "unit": "ms"
diff --git a/tools/parse_baselines/providers/snapshot_restore.py b/tools/parse_baselines/providers/snapshot_restore.py
@@ -26,22 +26,15 @@ def __init__(self, data_provider: Iterator):
         super().__init__(
             data_provider,
             [
-                "latency/P50",
-                "latency/P90",
+                "latency/Avg",
             ],
         )
 
     def calculate_baseline(self, data: List[float]) -> dict:
         """Return the target and delta values, given a list of data points."""
         avg = statistics.mean(data)
-        min_ = min(data)
-        max_ = max(data)
-
-        min_delta = 100 * abs(avg - min_) / avg
-        max_delta = 100 * abs(avg - max_) / avg
-        delta = max(max_delta, min_delta)
-
+        stddev = statistics.stdev(data)
         return {
-            "target": round(avg, 3),
-            "delta_percentage": math.ceil(delta) + DELTA_EXTRA_MARGIN,
+            "target": math.ceil(round(avg, 2)),
+            "delta_percentage": math.ceil(3 * stddev / avg * 100) + DELTA_EXTRA_MARGIN,
         }

Original file line number	Diff line number	Diff line change
`@@ -4149,12 +4149,7 @@`
`4149`	`4149`	`"latency": {`
`4150`	`4150`	`"statistics": [`
`4151`	`4151`	`{`
`4152`		`- "function": "Percentile50",`
`4153`		`- "name": "P50"`
`4154`		`- },`
`4155`		`- {`
`4156`		`- "function": "Percentile90",`
`4157`		`- "name": "P90"`
	`4152`	`+ "function": "Avg"`
`4158`	`4153`	`}`
`4159`	`4154`	`],`
`4160`	`4155`	`"unit": "ms"`