Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .run/experiment (kind).run.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<configuration default="false" name="experiment (kind)" type="GoApplicationRunConfiguration" factoryName="Go Application">
<module name="kubernetes-controller-sharding" />
<working_directory value="$PROJECT_DIR$/webhosting-operator" />
<parameters value="reconcile" />
<parameters value="basic" />
<envs>
<env name="KUBECONFIG" value="$PROJECT_DIR$/hack/kind_kubeconfig.yaml" />
</envs>
Expand All @@ -12,4 +12,4 @@
<filePath value="$PROJECT_DIR$/webhosting-operator/cmd/experiment/main.go" />
<method v="2" />
</configuration>
</component>
</component>
4 changes: 2 additions & 2 deletions config/sharder/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ spec:
resources:
limits:
cpu: 200m
memory: 128Mi
memory: 512Mi
requests:
cpu: 100m
memory: 64Mi
memory: 256Mi
volumes:
- name: config
configMap:
Expand Down
21 changes: 19 additions & 2 deletions docs/evaluation.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,10 @@ Usage:
experiment [command]

Available Scenarios
basic Basic load test scenario (15m) that creates roughly 9k websites
scale-out Scenario for testing scale-out with high churn rate
Available Scenarios
basic Basic load test, create 9k websites in 15 minutes
chaos Create 4.5k websites over 15 minutes and terminate a random shard every 5 minutes
scale-out Measure scale-out properties with a high churn rate
...
```

Expand Down Expand Up @@ -139,6 +141,21 @@ The scale of the controller setup is measured in two dimensions:
1. The number of API objects that the controller watches and reconciles.
2. The churn rate of API objects, i.e., the rate of object creations, updates, and deletions.

```yaml
queries:
- name: website-count # dimension 1
query: |
sum(kube_website_info)
- name: website-churn # dimension 2
query: |
sum(rate(
controller_runtime_reconcile_total{
job="experiment", result!="error",
controller=~"website-(generator|deleter|mutator)"
}[1m]
)) by (controller)
```

## SLIs / SLOs

To consider a controller setup as performing adequately, the following SLOs
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/sharder/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco

log.Info("Starting resync of object assignments for ControllerRing")
defer func(start time.Time) {
log.V(1).Info("Finished resync of object assignments for ControllerRing", "duration", r.Clock.Since(start))
log.Info("Finished resync of object assignments for ControllerRing", "duration", r.Clock.Since(start))
}(r.Clock.Now())

if err := o.ResyncControllerRing(ctx, log); err != nil {
Expand Down
9 changes: 9 additions & 0 deletions webhosting-operator/config/experiment/base/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,15 @@ rules:
- list
- watch
- deletecollection
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- list
- watch
- delete
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
Expand Down
14 changes: 14 additions & 0 deletions webhosting-operator/config/experiment/chaos/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
- ../base

patches:
- target:
kind: Job
name: experiment
patch: |
- op: add
path: /spec/template/spec/containers/0/args/-
value: chaos
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 5,
"id": 35,
"links": [],
"panels": [
{
Expand Down Expand Up @@ -340,13 +340,25 @@
"uid": "P1809F7CD0C75ACF3"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99,\n sum by (run_id, le) (rate(\n workqueue_queue_duration_seconds_bucket{\n job=\"webhosting-operator\", name=\"website\", run_id=~\"$run_id\"\n }[$__rate_interval]\n ))\n)",
"legendFormat": "__auto",
"expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n workqueue_queue_duration_seconds_bucket{\n job=\"webhosting-operator\", name=\"website\", run_id=~\"$run_id\"\n }[1m]\n ))\n)",
"legendFormat": "{{run_id}}-1m",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"editorMode": "code",
"expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n workqueue_queue_duration_seconds_bucket{\n job=\"webhosting-operator\", name=\"website\", run_id=~\"$run_id\"\n }[15m]\n ))\n)",
"hide": false,
"legendFormat": "{{run_id}}-15m",
"range": true,
"refId": "B"
}
],
"title": "Queue Latency (P99)",
"title": "Queue Latency (P$percentile)",
"type": "timeseries"
},
{
Expand Down Expand Up @@ -443,13 +455,25 @@
"uid": "P1809F7CD0C75ACF3"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99,\n sum by (run_id, le) (rate(\n experiment_website_reconciliation_duration_seconds_bucket{\n job=\"experiment\", run_id=~\"$run_id\"\n }[$__rate_interval]\n ))\n)",
"legendFormat": "__auto",
"expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n experiment_website_reconciliation_duration_seconds_bucket{\n job=\"experiment\", run_id=~\"$run_id\"\n }[1m]\n ))\n)",
"legendFormat": "{{run_id}}-1m",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"editorMode": "code",
"expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n experiment_website_reconciliation_duration_seconds_bucket{\n job=\"experiment\", run_id=~\"$run_id\"\n }[15m]\n ))\n)",
"hide": false,
"legendFormat": "{{run_id}}-15m",
"range": true,
"refId": "B"
}
],
"title": "Reconciliation Latency (P99)",
"title": "Reconciliation Latency (P$percentile)",
"type": "timeseries"
},
{
Expand Down Expand Up @@ -898,6 +922,33 @@
"regex": "",
"sort": 1,
"type": "query"
},
{
"current": {
"text": "99",
"value": "99"
},
"label": "SLO Percentile",
"name": "percentile",
"options": [
{
"selected": false,
"text": "90",
"value": "90"
},
{
"selected": false,
"text": "95",
"value": "95"
},
{
"selected": true,
"text": "99",
"value": "99"
}
],
"query": "90,95,99",
"type": "custom"
}
]
},
Expand Down
19 changes: 17 additions & 2 deletions webhosting-operator/pkg/experiment/generator/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,33 @@ func (r *Every) AddToManager(mgr manager.Manager) error {
r.Client = mgr.GetClient()
}

initialDelay := time.Duration(0)
workers := defaultReconcileWorkers
if r.Workers > 0 {
workers = r.Workers
}

var rateLimiter workqueue.TypedRateLimiter[reconcile.Request] = &workqueue.TypedBucketRateLimiter[reconcile.Request]{
Limiter: rate.NewLimiter(r.Rate, int(r.Rate)),
}
if r.Rate < 1 {
// Special case for controllers running less frequently than once per second:
// The token bucket rate limiter would not allow any events because its burst, int(r.Rate), would be 0. Replace it
// with a custom rate limiter that always returns a constant delay.
// Also, delay the first request when starting the scenario.
every := time.Duration(1 / float64(r.Rate) * float64(time.Second))
rateLimiter = constantDelayRateLimiter(every)
initialDelay = every
workers = 1
}

return builder.ControllerManagedBy(mgr).
Named(r.Name).
WithOptions(controller.Options{
MaxConcurrentReconciles: workers,
RateLimiter: &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(r.Rate, int(r.Rate))},
RateLimiter: rateLimiter,
}).
WatchesRawSource(EmitN(workers)).
WatchesRawSource(EmitN(workers, initialDelay)).
Complete(StopOnContextCanceled(r))
}

Expand Down
21 changes: 15 additions & 6 deletions webhosting-operator/pkg/experiment/generator/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,21 +37,21 @@ import (

var log = logf.Log

// EmitN returns a source that emits exactly n events (reconcile.Request). The source ignores predicates.
// EmitN returns a source that emits exactly n reconcile requests with the given delay.
// Use it with the controller builder:
//
// WatchesRawSource(EmitN(n), &handler.EnqueueRequestForObject{})
// WatchesRawSource(EmitN(n, time.Second))
//
// Or a plain controller:
//
// Watch(EmitN(n), &handler.EnqueueRequestForObject{})
func EmitN(n int) source.Source {
// Watch(EmitN(n, time.Second))
func EmitN(n int, delay time.Duration) source.Source {
return source.TypedFunc[reconcile.Request](func(ctx context.Context, queue workqueue.TypedRateLimitingInterface[reconcile.Request]) error {
for i := 0; i < n; i++ {
queue.Add(reconcile.Request{NamespacedName: types.NamespacedName{
queue.AddAfter(reconcile.Request{NamespacedName: types.NamespacedName{
// use different object names, otherwise queue will merge the requests
Name: fmt.Sprintf("request-%d", n),
}})
}}, delay)
}

return nil
Expand Down Expand Up @@ -166,3 +166,12 @@ func CreateClusterScopedOwnerObject(ctx context.Context, c client.Client, opts .

return ownerObject, metav1.NewControllerRef(ownerObject, rbacv1.SchemeGroupVersion.WithKind("ClusterRole")), nil
}

// constantDelayRateLimiter is a rate limiter that answers every request with the same fixed delay.
// The value of the type itself is the delay returned by When.
type constantDelayRateLimiter time.Duration

// Compile-time check that constantDelayRateLimiter implements the typed rate limiter interface
// for reconcile requests.
var _ workqueue.TypedRateLimiter[reconcile.Request] = constantDelayRateLimiter(0)

// When returns the configured delay, independent of the given request.
func (c constantDelayRateLimiter) When(reconcile.Request) time.Duration {
	return time.Duration(c)
}

// Forget does nothing.
func (c constantDelayRateLimiter) Forget(reconcile.Request) {}

// NumRequeues always returns 0.
func (c constantDelayRateLimiter) NumRequeues(reconcile.Request) int {
	return 0
}
1 change: 1 addition & 0 deletions webhosting-operator/pkg/experiment/scenario/all/all.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,6 @@ package all

import (
_ "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/basic"
_ "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/chaos"
_ "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/scale-out"
)
2 changes: 1 addition & 1 deletion webhosting-operator/pkg/experiment/scenario/basic/basic.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ type scenario struct {
}

func (s *scenario) Description() string {
return "Basic load test scenario (15m) that creates roughly 9k websites"
return "Basic load test, create 9k websites in 15 minutes"
}

func (s *scenario) LongDescription() string {
Expand Down
Loading
Loading