Skip to content

Commit 7b9b47f

Browse files
authored
Add chaos experiment scenario (#648)
* Prefactor: add optional delay to `EmitN`
* Prefactor: `Every` supports rate < 1/s
* Add `chaos` scenario
* Document queries for controller load
* Increase limits for sharder

  The memory usage is much higher during sharder resyncs now that they run with parallelism.

* Add sliding SLO queries to experiment dashboard
* Configurable SLO percentile in experiments dashboard
1 parent b475eb8 commit 7b9b47f

File tree

13 files changed

+278
-24
lines changed

13 files changed

+278
-24
lines changed

.run/experiment (kind).run.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<configuration default="false" name="experiment (kind)" type="GoApplicationRunConfiguration" factoryName="Go Application">
33
<module name="kubernetes-controller-sharding" />
44
<working_directory value="$PROJECT_DIR$/webhosting-operator" />
5-
<parameters value="reconcile" />
5+
<parameters value="basic" />
66
<envs>
77
<env name="KUBECONFIG" value="$PROJECT_DIR$/hack/kind_kubeconfig.yaml" />
88
</envs>
@@ -12,4 +12,4 @@
1212
<filePath value="$PROJECT_DIR$/webhosting-operator/cmd/experiment/main.go" />
1313
<method v="2" />
1414
</configuration>
15-
</component>
15+
</component>

config/sharder/deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ spec:
5454
resources:
5555
limits:
5656
cpu: 200m
57-
memory: 128Mi
57+
memory: 512Mi
5858
requests:
5959
cpu: 100m
60-
memory: 64Mi
60+
memory: 256Mi
6161
volumes:
6262
- name: config
6363
configMap:

docs/evaluation.md

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,10 @@ Usage:
7575
experiment [command]
7676
7777
Available Scenarios
78-
basic Basic load test scenario (15m) that creates roughly 9k websites
79-
scale-out Scenario for testing scale-out with high churn rate
78+
Available Scenarios
79+
basic Basic load test, create 9k websites in 15 minutes
80+
chaos Create 4.5k websites over 15 minutes and terminate a random shard every 5 minutes
81+
scale-out Measure scale-out properties with a high churn rate
8082
...
8183
```
8284

@@ -139,6 +141,21 @@ The scale of the controller setup is measured in two dimensions:
139141
1. The number of API objects that the controller watches and reconciles.
140142
2. The churn rate of API objects, i.e., the rate of object creations, updates, and deletions.
141143

144+
```yaml
145+
queries:
146+
- name: website-count # dimension 1
147+
query: |
148+
sum(kube_website_info)
149+
- name: website-churn # dimension 2
150+
query: |
151+
sum(rate(
152+
controller_runtime_reconcile_total{
153+
job="experiment", result!="error",
154+
controller=~"website-(generator|deleter|mutator)"
155+
}[1m]
156+
)) by (controller)
157+
```
158+
142159
## SLIs / SLOs
143160
144161
To consider a controller setup as performing adequately, the following SLOs

pkg/controller/sharder/reconciler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
8686

8787
log.Info("Starting resync of object assignments for ControllerRing")
8888
defer func(start time.Time) {
89-
log.V(1).Info("Finished resync of object assignments for ControllerRing", "duration", r.Clock.Since(start))
89+
log.Info("Finished resync of object assignments for ControllerRing", "duration", r.Clock.Since(start))
9090
}(r.Clock.Now())
9191

9292
if err := o.ResyncControllerRing(ctx, log); err != nil {

webhosting-operator/config/experiment/base/rbac.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,15 @@ rules:
6161
- list
6262
- watch
6363
- deletecollection
64+
- apiGroups:
65+
- ""
66+
resources:
67+
- pods
68+
verbs:
69+
- get
70+
- list
71+
- watch
72+
- delete
6473
---
6574
apiVersion: rbac.authorization.k8s.io/v1
6675
kind: ClusterRoleBinding
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
4+
resources:
5+
- ../base
6+
7+
patches:
8+
- target:
9+
kind: Job
10+
name: experiment
11+
patch: |
12+
- op: add
13+
path: /spec/template/spec/containers/0/args/-
14+
value: chaos

webhosting-operator/config/monitoring/default/dashboards/experiments.json

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
"editable": true,
1919
"fiscalYearStartMonth": 0,
2020
"graphTooltip": 1,
21-
"id": 5,
21+
"id": 35,
2222
"links": [],
2323
"panels": [
2424
{
@@ -340,13 +340,25 @@
340340
"uid": "P1809F7CD0C75ACF3"
341341
},
342342
"editorMode": "code",
343-
"expr": "histogram_quantile(0.99,\n sum by (run_id, le) (rate(\n workqueue_queue_duration_seconds_bucket{\n job=\"webhosting-operator\", name=\"website\", run_id=~\"$run_id\"\n }[$__rate_interval]\n ))\n)",
344-
"legendFormat": "__auto",
343+
"expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n workqueue_queue_duration_seconds_bucket{\n job=\"webhosting-operator\", name=\"website\", run_id=~\"$run_id\"\n }[1m]\n ))\n)",
344+
"legendFormat": "{{run_id}}-1m",
345345
"range": true,
346346
"refId": "A"
347+
},
348+
{
349+
"datasource": {
350+
"type": "prometheus",
351+
"uid": "P1809F7CD0C75ACF3"
352+
},
353+
"editorMode": "code",
354+
"expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n workqueue_queue_duration_seconds_bucket{\n job=\"webhosting-operator\", name=\"website\", run_id=~\"$run_id\"\n }[15m]\n ))\n)",
355+
"hide": false,
356+
"legendFormat": "{{run_id}}-15m",
357+
"range": true,
358+
"refId": "B"
347359
}
348360
],
349-
"title": "Queue Latency (P99)",
361+
"title": "Queue Latency (P$percentile)",
350362
"type": "timeseries"
351363
},
352364
{
@@ -443,13 +455,25 @@
443455
"uid": "P1809F7CD0C75ACF3"
444456
},
445457
"editorMode": "code",
446-
"expr": "histogram_quantile(0.99,\n sum by (run_id, le) (rate(\n experiment_website_reconciliation_duration_seconds_bucket{\n job=\"experiment\", run_id=~\"$run_id\"\n }[$__rate_interval]\n ))\n)",
447-
"legendFormat": "__auto",
458+
"expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n experiment_website_reconciliation_duration_seconds_bucket{\n job=\"experiment\", run_id=~\"$run_id\"\n }[1m]\n ))\n)",
459+
"legendFormat": "{{run_id}}-1m",
448460
"range": true,
449461
"refId": "A"
462+
},
463+
{
464+
"datasource": {
465+
"type": "prometheus",
466+
"uid": "P1809F7CD0C75ACF3"
467+
},
468+
"editorMode": "code",
469+
"expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n experiment_website_reconciliation_duration_seconds_bucket{\n job=\"experiment\", run_id=~\"$run_id\"\n }[15m]\n ))\n)",
470+
"hide": false,
471+
"legendFormat": "{{run_id}}-15m",
472+
"range": true,
473+
"refId": "B"
450474
}
451475
],
452-
"title": "Reconciliation Latency (P99)",
476+
"title": "Reconciliation Latency (P$percentile)",
453477
"type": "timeseries"
454478
},
455479
{
@@ -898,6 +922,33 @@
898922
"regex": "",
899923
"sort": 1,
900924
"type": "query"
925+
},
926+
{
927+
"current": {
928+
"text": "99",
929+
"value": "99"
930+
},
931+
"label": "SLO Percentile",
932+
"name": "percentile",
933+
"options": [
934+
{
935+
"selected": false,
936+
"text": "90",
937+
"value": "90"
938+
},
939+
{
940+
"selected": false,
941+
"text": "95",
942+
"value": "95"
943+
},
944+
{
945+
"selected": true,
946+
"text": "99",
947+
"value": "99"
948+
}
949+
],
950+
"query": "90,95,99",
951+
"type": "custom"
901952
}
902953
]
903954
},

webhosting-operator/pkg/experiment/generator/reconciler.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,33 @@ func (r *Every) AddToManager(mgr manager.Manager) error {
5454
r.Client = mgr.GetClient()
5555
}
5656

57+
initialDelay := time.Duration(0)
5758
workers := defaultReconcileWorkers
5859
if r.Workers > 0 {
5960
workers = r.Workers
6061
}
6162

63+
var rateLimiter workqueue.TypedRateLimiter[reconcile.Request] = &workqueue.TypedBucketRateLimiter[reconcile.Request]{
64+
Limiter: rate.NewLimiter(r.Rate, int(r.Rate)),
65+
}
66+
if r.Rate < 1 {
67+
// Special case for controllers running less frequent than every second:
68+
// The token bucket rate limiter would not allow any events as burst is less than 1, so replace it with a custom
69+
// rate limiter that always returns a constant delay.
70+
// Also, delay the first request when starting the scenario.
71+
every := time.Duration(1 / float64(r.Rate) * float64(time.Second))
72+
rateLimiter = constantDelayRateLimiter(every)
73+
initialDelay = every
74+
workers = 1
75+
}
76+
6277
return builder.ControllerManagedBy(mgr).
6378
Named(r.Name).
6479
WithOptions(controller.Options{
6580
MaxConcurrentReconciles: workers,
66-
RateLimiter: &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(r.Rate, int(r.Rate))},
81+
RateLimiter: rateLimiter,
6782
}).
68-
WatchesRawSource(EmitN(workers)).
83+
WatchesRawSource(EmitN(workers, initialDelay)).
6984
Complete(StopOnContextCanceled(r))
7085
}
7186

webhosting-operator/pkg/experiment/generator/utils.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,21 +37,21 @@ import (
3737

3838
var log = logf.Log
3939

40-
// EmitN returns a source that emits exactly n events (reconcile.Request). The source ignores predicates.
40+
// EmitN returns a source that emits exactly n reconcile requests with the given delay.
4141
// Use it with the controller builder:
4242
//
43-
// WatchesRawSource(EmitN(n), &handler.EnqueueRequestForObject{})
43+
// WatchesRawSource(EmitN(n, time.Second))
4444
//
4545
// Or a plain controller:
4646
//
47-
// Watch(EmitN(n), &handler.EnqueueRequestForObject{})
48-
func EmitN(n int) source.Source {
47+
// Watch(EmitN(n, time.Second))
48+
func EmitN(n int, delay time.Duration) source.Source {
4949
return source.TypedFunc[reconcile.Request](func(ctx context.Context, queue workqueue.TypedRateLimitingInterface[reconcile.Request]) error {
5050
for i := 0; i < n; i++ {
51-
queue.Add(reconcile.Request{NamespacedName: types.NamespacedName{
51+
queue.AddAfter(reconcile.Request{NamespacedName: types.NamespacedName{
5252
// use different object names, otherwise queue will merge the requests
5353
Name: fmt.Sprintf("request-%d", n),
54-
}})
54+
}}, delay)
5555
}
5656

5757
return nil
@@ -166,3 +166,12 @@ func CreateClusterScopedOwnerObject(ctx context.Context, c client.Client, opts .
166166

167167
return ownerObject, metav1.NewControllerRef(ownerObject, rbacv1.SchemeGroupVersion.WithKind("ClusterRole")), nil
168168
}
169+
170+
var _ workqueue.TypedRateLimiter[reconcile.Request] = constantDelayRateLimiter(0)
171+
172+
// constantDelayRateLimiter delays all requests with a constant duration.
173+
type constantDelayRateLimiter time.Duration
174+
175+
func (d constantDelayRateLimiter) When(reconcile.Request) time.Duration { return time.Duration(d) }
176+
func (d constantDelayRateLimiter) Forget(reconcile.Request) {}
177+
func (d constantDelayRateLimiter) NumRequeues(reconcile.Request) int { return 0 }

webhosting-operator/pkg/experiment/scenario/all/all.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,6 @@ package all
1919

2020
import (
2121
_ "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/basic"
22+
_ "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/chaos"
2223
_ "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/scale-out"
2324
)

0 commit comments

Comments (0)