Skip to content

Commit 7b9b47f

Browse files
authored
Add chaos experiment scenario (#648)
* Prefactor: add optional delay to `EmitN`
* Prefactor: `Every` supports rate < 1/s
* Add `chaos` scenario
* Document queries for controller load
* Increase limits for sharder

  The memory usage is much higher during sharder resyncs now that they run with parallelism.

* Add sliding SLO queries to experiment dashboard
* Configurable SLO percentile in experiments dashboard
1 parent b475eb8 commit 7b9b47f

File tree

13 files changed

+278
-24
lines changed

13 files changed

+278
-24
lines changed

.run/experiment (kind).run.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<configuration default="false" name="experiment (kind)" type="GoApplicationRunConfiguration" factoryName="Go Application">
33
<module name="kubernetes-controller-sharding" />
44
<working_directory value="$PROJECT_DIR$/webhosting-operator" />
5-
<parameters value="reconcile" />
5+
<parameters value="basic" />
66
<envs>
77
<env name="KUBECONFIG" value="$PROJECT_DIR$/hack/kind_kubeconfig.yaml" />
88
</envs>
@@ -12,4 +12,4 @@
1212
<filePath value="$PROJECT_DIR$/webhosting-operator/cmd/experiment/main.go" />
1313
<method v="2" />
1414
</configuration>
15-
</component>
15+
</component>

config/sharder/deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ spec:
5454
resources:
5555
limits:
5656
cpu: 200m
57-
memory: 128Mi
57+
memory: 512Mi
5858
requests:
5959
cpu: 100m
60-
memory: 64Mi
60+
memory: 256Mi
6161
volumes:
6262
- name: config
6363
configMap:

docs/evaluation.md

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,10 @@ Usage:
7575
experiment [command]
7676
7777
Available Scenarios
78-
basic Basic load test scenario (15m) that creates roughly 9k websites
79-
scale-out Scenario for testing scale-out with high churn rate
78+
Available Scenarios
79+
basic Basic load test, create 9k websites in 15 minutes
80+
chaos Create 4.5k websites over 15 minutes and terminate a random shard every 5 minutes
81+
scale-out Measure scale-out properties with a high churn rate
8082
...
8183
```
8284

@@ -139,6 +141,21 @@ The scale of the controller setup is measured in two dimensions:
139141
1. The number of API objects that the controller watches and reconciles.
140142
2. The churn rate of API objects, i.e., the rate of object creations, updates, and deletions.
141143

144+
```yaml
145+
queries:
146+
- name: website-count # dimension 1
147+
query: |
148+
sum(kube_website_info)
149+
- name: website-churn # dimension 2
150+
query: |
151+
sum(rate(
152+
controller_runtime_reconcile_total{
153+
job="experiment", result!="error",
154+
controller=~"website-(generator|deleter|mutator)"
155+
}[1m]
156+
)) by (controller)
157+
```
158+
142159
## SLIs / SLOs
143160
144161
To consider a controller setup as performing adequately, the following SLOs

pkg/controller/sharder/reconciler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
8686

8787
log.Info("Starting resync of object assignments for ControllerRing")
8888
defer func(start time.Time) {
89-
log.V(1).Info("Finished resync of object assignments for ControllerRing", "duration", r.Clock.Since(start))
89+
log.Info("Finished resync of object assignments for ControllerRing", "duration", r.Clock.Since(start))
9090
}(r.Clock.Now())
9191

9292
if err := o.ResyncControllerRing(ctx, log); err != nil {

webhosting-operator/config/experiment/base/rbac.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,15 @@ rules:
6161
- list
6262
- watch
6363
- deletecollection
64+
- apiGroups:
65+
- ""
66+
resources:
67+
- pods
68+
verbs:
69+
- get
70+
- list
71+
- watch
72+
- delete
6473
---
6574
apiVersion: rbac.authorization.k8s.io/v1
6675
kind: ClusterRoleBinding
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
4+
resources:
5+
- ../base
6+
7+
patches:
8+
- target:
9+
kind: Job
10+
name: experiment
11+
patch: |
12+
- op: add
13+
path: /spec/template/spec/containers/0/args/-
14+
value: chaos

webhosting-operator/config/monitoring/default/dashboards/experiments.json

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
"editable": true,
1919
"fiscalYearStartMonth": 0,
2020
"graphTooltip": 1,
21-
"id": 5,
21+
"id": 35,
2222
"links": [],
2323
"panels": [
2424
{
@@ -340,13 +340,25 @@
340340
"uid": "P1809F7CD0C75ACF3"
341341
},
342342
"editorMode": "code",
343-
"expr": "histogram_quantile(0.99,\n sum by (run_id, le) (rate(\n workqueue_queue_duration_seconds_bucket{\n job=\"webhosting-operator\", name=\"website\", run_id=~\"$run_id\"\n }[$__rate_interval]\n ))\n)",
344-
"legendFormat": "__auto",
343+
"expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n workqueue_queue_duration_seconds_bucket{\n job=\"webhosting-operator\", name=\"website\", run_id=~\"$run_id\"\n }[1m]\n ))\n)",
344+
"legendFormat": "{{run_id}}-1m",
345345
"range": true,
346346
"refId": "A"
347+
},
348+
{
349+
"datasource": {
350+
"type": "prometheus",
351+
"uid": "P1809F7CD0C75ACF3"
352+
},
353+
"editorMode": "code",
354+
"expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n workqueue_queue_duration_seconds_bucket{\n job=\"webhosting-operator\", name=\"website\", run_id=~\"$run_id\"\n }[15m]\n ))\n)",
355+
"hide": false,
356+
"legendFormat": "{{run_id}}-15m",
357+
"range": true,
358+
"refId": "B"
347359
}
348360
],
349-
"title": "Queue Latency (P99)",
361+
"title": "Queue Latency (P$percentile)",
350362
"type": "timeseries"
351363
},
352364
{
@@ -443,13 +455,25 @@
443455
"uid": "P1809F7CD0C75ACF3"
444456
},
445457
"editorMode": "code",
446-
"expr": "histogram_quantile(0.99,\n sum by (run_id, le) (rate(\n experiment_website_reconciliation_duration_seconds_bucket{\n job=\"experiment\", run_id=~\"$run_id\"\n }[$__rate_interval]\n ))\n)",
447-
"legendFormat": "__auto",
458+
"expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n experiment_website_reconciliation_duration_seconds_bucket{\n job=\"experiment\", run_id=~\"$run_id\"\n }[1m]\n ))\n)",
459+
"legendFormat": "{{run_id}}-1m",
448460
"range": true,
449461
"refId": "A"
462+
},
463+
{
464+
"datasource": {
465+
"type": "prometheus",
466+
"uid": "P1809F7CD0C75ACF3"
467+
},
468+
"editorMode": "code",
469+
"expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n experiment_website_reconciliation_duration_seconds_bucket{\n job=\"experiment\", run_id=~\"$run_id\"\n }[15m]\n ))\n)",
470+
"hide": false,
471+
"legendFormat": "{{run_id}}-15m",
472+
"range": true,
473+
"refId": "B"
450474
}
451475
],
452-
"title": "Reconciliation Latency (P99)",
476+
"title": "Reconciliation Latency (P$percentile)",
453477
"type": "timeseries"
454478
},
455479
{
@@ -898,6 +922,33 @@
898922
"regex": "",
899923
"sort": 1,
900924
"type": "query"
925+
},
926+
{
927+
"current": {
928+
"text": "99",
929+
"value": "99"
930+
},
931+
"label": "SLO Percentile",
932+
"name": "percentile",
933+
"options": [
934+
{
935+
"selected": false,
936+
"text": "90",
937+
"value": "90"
938+
},
939+
{
940+
"selected": false,
941+
"text": "95",
942+
"value": "95"
943+
},
944+
{
945+
"selected": true,
946+
"text": "99",
947+
"value": "99"
948+
}
949+
],
950+
"query": "90,95,99",
951+
"type": "custom"
901952
}
902953
]
903954
},

webhosting-operator/pkg/experiment/generator/reconciler.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,33 @@ func (r *Every) AddToManager(mgr manager.Manager) error {
5454
r.Client = mgr.GetClient()
5555
}
5656

57+
initialDelay := time.Duration(0)
5758
workers := defaultReconcileWorkers
5859
if r.Workers > 0 {
5960
workers = r.Workers
6061
}
6162

63+
var rateLimiter workqueue.TypedRateLimiter[reconcile.Request] = &workqueue.TypedBucketRateLimiter[reconcile.Request]{
64+
Limiter: rate.NewLimiter(r.Rate, int(r.Rate)),
65+
}
66+
if r.Rate < 1 {
67+
// Special case for controllers running less frequent than every second:
68+
// The token bucket rate limiter would not allow any events as burst is less than 1, so replace it with a custom
69+
// rate limiter that always returns a constant delay.
70+
// Also, delay the first request when starting the scenario.
71+
every := time.Duration(1 / float64(r.Rate) * float64(time.Second))
72+
rateLimiter = constantDelayRateLimiter(every)
73+
initialDelay = every
74+
workers = 1
75+
}
76+
6277
return builder.ControllerManagedBy(mgr).
6378
Named(r.Name).
6479
WithOptions(controller.Options{
6580
MaxConcurrentReconciles: workers,
66-
RateLimiter: &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(r.Rate, int(r.Rate))},
81+
RateLimiter: rateLimiter,
6782
}).
68-
WatchesRawSource(EmitN(workers)).
83+
WatchesRawSource(EmitN(workers, initialDelay)).
6984
Complete(StopOnContextCanceled(r))
7085
}
7186

webhosting-operator/pkg/experiment/generator/utils.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,21 +37,21 @@ import (
3737

3838
var log = logf.Log
3939

40-
// EmitN returns a source that emits exactly n events (reconcile.Request). The source ignores predicates.
40+
// EmitN returns a source that emits exactly n reconcile requests with the given delay.
4141
// Use it with the controller builder:
4242
//
43-
// WatchesRawSource(EmitN(n), &handler.EnqueueRequestForObject{})
43+
// WatchesRawSource(EmitN(n, time.Second))
4444
//
4545
// Or a plain controller:
4646
//
47-
// Watch(EmitN(n), &handler.EnqueueRequestForObject{})
48-
func EmitN(n int) source.Source {
47+
// Watch(EmitN(n, time.Second))
48+
func EmitN(n int, delay time.Duration) source.Source {
4949
return source.TypedFunc[reconcile.Request](func(ctx context.Context, queue workqueue.TypedRateLimitingInterface[reconcile.Request]) error {
5050
for i := 0; i < n; i++ {
51-
queue.Add(reconcile.Request{NamespacedName: types.NamespacedName{
51+
queue.AddAfter(reconcile.Request{NamespacedName: types.NamespacedName{
5252
// use different object names, otherwise queue will merge the requests
5353
Name: fmt.Sprintf("request-%d", n),
54-
}})
54+
}}, delay)
5555
}
5656

5757
return nil
@@ -166,3 +166,12 @@ func CreateClusterScopedOwnerObject(ctx context.Context, c client.Client, opts .
166166

167167
return ownerObject, metav1.NewControllerRef(ownerObject, rbacv1.SchemeGroupVersion.WithKind("ClusterRole")), nil
168168
}
169+
170+
var _ workqueue.TypedRateLimiter[reconcile.Request] = constantDelayRateLimiter(0)
171+
172+
// constantDelayRateLimiter delays all requests with a constant duration.
173+
type constantDelayRateLimiter time.Duration
174+
175+
func (d constantDelayRateLimiter) When(reconcile.Request) time.Duration { return time.Duration(d) }
176+
func (d constantDelayRateLimiter) Forget(reconcile.Request) {}
177+
func (d constantDelayRateLimiter) NumRequeues(reconcile.Request) int { return 0 }

webhosting-operator/pkg/experiment/scenario/all/all.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,6 @@ package all
1919

2020
import (
2121
_ "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/basic"
22+
_ "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/chaos"
2223
_ "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/scale-out"
2324
)

0 commit comments

Comments (0)