diff --git a/.run/experiment (kind).run.xml b/.run/experiment (kind).run.xml index 7288f8bf..1619cd32 100644 --- a/.run/experiment (kind).run.xml +++ b/.run/experiment (kind).run.xml @@ -2,7 +2,7 @@ - + @@ -12,4 +12,4 @@ - + \ No newline at end of file diff --git a/config/sharder/deployment.yaml b/config/sharder/deployment.yaml index 757bd345..8a474fa8 100644 --- a/config/sharder/deployment.yaml +++ b/config/sharder/deployment.yaml @@ -54,10 +54,10 @@ spec: resources: limits: cpu: 200m - memory: 128Mi + memory: 512Mi requests: cpu: 100m - memory: 64Mi + memory: 256Mi volumes: - name: config configMap: diff --git a/docs/evaluation.md b/docs/evaluation.md index 5d5a0738..ba0ba925 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -75,8 +75,9 @@ Usage: experiment [command] Available Scenarios - basic Basic load test scenario (15m) that creates roughly 9k websites - scale-out Scenario for testing scale-out with high churn rate + basic Basic load test, create 9k websites in 15 minutes + chaos Create 4.5k websites over 15 minutes and terminate a random shard every 5 minutes + scale-out Measure scale-out properties with a high churn rate ... ``` @@ -139,6 +140,21 @@ The scale of the controller setup is measured in two dimensions: 1. The number of API objects that the controller watches and reconciles. 2. The churn rate of API objects, i.e., the rate of object creations, updates, and deletions. 
+```yaml +queries: +- name: website-count # dimension 1 + query: | + sum(kube_website_info) +- name: website-churn # dimension 2 + query: | + sum(rate( + controller_runtime_reconcile_total{ + job="experiment", result!="error", + controller=~"website-(generator|deleter|mutator)" + }[1m] + )) by (controller) +``` + ## SLIs / SLOs To consider a controller setup as performing adequately, the following SLOs diff --git a/pkg/controller/sharder/reconciler.go b/pkg/controller/sharder/reconciler.go index 6511442e..e6703334 100644 --- a/pkg/controller/sharder/reconciler.go +++ b/pkg/controller/sharder/reconciler.go @@ -86,7 +86,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco log.Info("Starting resync of object assignments for ControllerRing") defer func(start time.Time) { - log.V(1).Info("Finished resync of object assignments for ControllerRing", "duration", r.Clock.Since(start)) + log.Info("Finished resync of object assignments for ControllerRing", "duration", r.Clock.Since(start)) }(r.Clock.Now()) if err := o.ResyncControllerRing(ctx, log); err != nil { diff --git a/webhosting-operator/config/experiment/base/rbac.yaml b/webhosting-operator/config/experiment/base/rbac.yaml index 0c8c6454..6c24f695 100644 --- a/webhosting-operator/config/experiment/base/rbac.yaml +++ b/webhosting-operator/config/experiment/base/rbac.yaml @@ -61,6 +61,15 @@ rules: - list - watch - deletecollection +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch + - delete --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/webhosting-operator/config/experiment/chaos/kustomization.yaml b/webhosting-operator/config/experiment/chaos/kustomization.yaml new file mode 100644 index 00000000..d82bf094 --- /dev/null +++ b/webhosting-operator/config/experiment/chaos/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../base + +patches: +- target: + kind: 
Job + name: experiment + patch: | + - op: add + path: /spec/template/spec/containers/0/args/- + value: chaos diff --git a/webhosting-operator/config/monitoring/default/dashboards/experiments.json b/webhosting-operator/config/monitoring/default/dashboards/experiments.json index 17bfb048..29d99eaf 100644 --- a/webhosting-operator/config/monitoring/default/dashboards/experiments.json +++ b/webhosting-operator/config/monitoring/default/dashboards/experiments.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "id": 5, + "id": 35, "links": [], "panels": [ { @@ -340,13 +340,25 @@ "uid": "P1809F7CD0C75ACF3" }, "editorMode": "code", - "expr": "histogram_quantile(0.99,\n sum by (run_id, le) (rate(\n workqueue_queue_duration_seconds_bucket{\n job=\"webhosting-operator\", name=\"website\", run_id=~\"$run_id\"\n }[$__rate_interval]\n ))\n)", - "legendFormat": "__auto", + "expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n workqueue_queue_duration_seconds_bucket{\n job=\"webhosting-operator\", name=\"website\", run_id=~\"$run_id\"\n }[1m]\n ))\n)", + "legendFormat": "{{run_id}}-1m", "range": true, "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n workqueue_queue_duration_seconds_bucket{\n job=\"webhosting-operator\", name=\"website\", run_id=~\"$run_id\"\n }[15m]\n ))\n)", + "hide": false, + "legendFormat": "{{run_id}}-15m", + "range": true, + "refId": "B" } ], - "title": "Queue Latency (P99)", + "title": "Queue Latency (P$percentile)", "type": "timeseries" }, { @@ -443,13 +455,25 @@ "uid": "P1809F7CD0C75ACF3" }, "editorMode": "code", - "expr": "histogram_quantile(0.99,\n sum by (run_id, le) (rate(\n experiment_website_reconciliation_duration_seconds_bucket{\n job=\"experiment\", run_id=~\"$run_id\"\n }[$__rate_interval]\n ))\n)", - "legendFormat": "__auto", + "expr": 
"histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n experiment_website_reconciliation_duration_seconds_bucket{\n job=\"experiment\", run_id=~\"$run_id\"\n }[1m]\n ))\n)", + "legendFormat": "{{run_id}}-1m", "range": true, "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": "histogram_quantile($percentile/100,\n sum by (run_id, le) (rate(\n experiment_website_reconciliation_duration_seconds_bucket{\n job=\"experiment\", run_id=~\"$run_id\"\n }[15m]\n ))\n)", + "hide": false, + "legendFormat": "{{run_id}}-15m", + "range": true, + "refId": "B" } ], - "title": "Reconciliation Latency (P99)", + "title": "Reconciliation Latency (P$percentile)", "type": "timeseries" }, { @@ -898,6 +922,33 @@ "regex": "", "sort": 1, "type": "query" + }, + { + "current": { + "text": "99", + "value": "99" + }, + "label": "SLO Percentile", + "name": "percentile", + "options": [ + { + "selected": false, + "text": "90", + "value": "90" + }, + { + "selected": false, + "text": "95", + "value": "95" + }, + { + "selected": true, + "text": "99", + "value": "99" + } + ], + "query": "90,95,99", + "type": "custom" } ] }, diff --git a/webhosting-operator/pkg/experiment/generator/reconciler.go b/webhosting-operator/pkg/experiment/generator/reconciler.go index 6c394f91..d43b0b7a 100644 --- a/webhosting-operator/pkg/experiment/generator/reconciler.go +++ b/webhosting-operator/pkg/experiment/generator/reconciler.go @@ -54,18 +54,33 @@ func (r *Every) AddToManager(mgr manager.Manager) error { r.Client = mgr.GetClient() } + initialDelay := time.Duration(0) workers := defaultReconcileWorkers if r.Workers > 0 { workers = r.Workers } + var rateLimiter workqueue.TypedRateLimiter[reconcile.Request] = &workqueue.TypedBucketRateLimiter[reconcile.Request]{ + Limiter: rate.NewLimiter(r.Rate, int(r.Rate)), + } + if r.Rate < 1 { + // Special case for controllers running less frequent than every second: + // The token bucket 
rate limiter would not allow any events as burst is less than 1, so replace it with a custom + // rate limiter that always returns a constant delay. + // Also, delay the first request when starting the scenario. + every := time.Duration(1 / float64(r.Rate) * float64(time.Second)) + rateLimiter = constantDelayRateLimiter(every) + initialDelay = every + workers = 1 + } + return builder.ControllerManagedBy(mgr). Named(r.Name). WithOptions(controller.Options{ MaxConcurrentReconciles: workers, - RateLimiter: &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(r.Rate, int(r.Rate))}, + RateLimiter: rateLimiter, }). - WatchesRawSource(EmitN(workers)). + WatchesRawSource(EmitN(workers, initialDelay)). Complete(StopOnContextCanceled(r)) } diff --git a/webhosting-operator/pkg/experiment/generator/utils.go b/webhosting-operator/pkg/experiment/generator/utils.go index 4e2e68fc..0e563378 100644 --- a/webhosting-operator/pkg/experiment/generator/utils.go +++ b/webhosting-operator/pkg/experiment/generator/utils.go @@ -37,21 +37,21 @@ import ( var log = logf.Log -// EmitN returns a source that emits exactly n events (reconcile.Request). The source ignores predicates. +// EmitN returns a source that emits exactly n reconcile requests with the given delay. 
// Use it with the controller builder: // -// WatchesRawSource(EmitN(n), &handler.EnqueueRequestForObject{}) +// WatchesRawSource(EmitN(n, time.Second)) // // Or a plain controller: // -// Watch(EmitN(n), &handler.EnqueueRequestForObject{}) -func EmitN(n int) source.Source { +// Watch(EmitN(n, time.Second)) +func EmitN(n int, delay time.Duration) source.Source { return source.TypedFunc[reconcile.Request](func(ctx context.Context, queue workqueue.TypedRateLimitingInterface[reconcile.Request]) error { for i := 0; i < n; i++ { - queue.Add(reconcile.Request{NamespacedName: types.NamespacedName{ + queue.AddAfter(reconcile.Request{NamespacedName: types.NamespacedName{ // use different object names, otherwise queue will merge the requests - Name: fmt.Sprintf("request-%d", n), - }}) + Name: fmt.Sprintf("request-%d", i), + }}, delay) } return nil @@ -166,3 +166,12 @@ func CreateClusterScopedOwnerObject(ctx context.Context, c client.Client, opts . return ownerObject, metav1.NewControllerRef(ownerObject, rbacv1.SchemeGroupVersion.WithKind("ClusterRole")), nil } + +var _ workqueue.TypedRateLimiter[reconcile.Request] = constantDelayRateLimiter(0) + +// constantDelayRateLimiter delays all requests with a constant duration. 
+type constantDelayRateLimiter time.Duration + +func (d constantDelayRateLimiter) When(reconcile.Request) time.Duration { return time.Duration(d) } +func (d constantDelayRateLimiter) Forget(reconcile.Request) {} +func (d constantDelayRateLimiter) NumRequeues(reconcile.Request) int { return 0 } diff --git a/webhosting-operator/pkg/experiment/scenario/all/all.go b/webhosting-operator/pkg/experiment/scenario/all/all.go index 31fbe040..8aa3c86a 100644 --- a/webhosting-operator/pkg/experiment/scenario/all/all.go +++ b/webhosting-operator/pkg/experiment/scenario/all/all.go @@ -19,5 +19,6 @@ package all import ( _ "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/basic" + _ "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/chaos" _ "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/scale-out" ) diff --git a/webhosting-operator/pkg/experiment/scenario/basic/basic.go b/webhosting-operator/pkg/experiment/scenario/basic/basic.go index 61f9fb48..b92ab625 100644 --- a/webhosting-operator/pkg/experiment/scenario/basic/basic.go +++ b/webhosting-operator/pkg/experiment/scenario/basic/basic.go @@ -47,7 +47,7 @@ type scenario struct { } func (s *scenario) Description() string { - return "Basic load test scenario (15m) that creates roughly 9k websites" + return "Basic load test, create 9k websites in 15 minutes" } func (s *scenario) LongDescription() string { diff --git a/webhosting-operator/pkg/experiment/scenario/chaos/chaos.go b/webhosting-operator/pkg/experiment/scenario/chaos/chaos.go new file mode 100644 index 00000000..f60bf837 --- /dev/null +++ b/webhosting-operator/pkg/experiment/scenario/chaos/chaos.go @@ -0,0 +1,145 @@ +/* +Copyright 2025 Tim Ebert. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package chaos + +import ( + "context" + "fmt" + "time" + + "golang.org/x/time/rate" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + logf "sigs.k8s.io/controller-runtime/pkg/log" + + webhostingv1alpha1 "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/apis/webhosting/v1alpha1" + "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment" + "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/generator" + "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/experiment/scenario/base" + "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/utils" +) + +// ScenarioName is the name under which this scenario is registered. +const ScenarioName = "chaos" + +func init() { + s := &scenario{} + s.Scenario = &base.Scenario{ + ScenarioName: ScenarioName, + Delegate: s, + } + + experiment.RegisterScenario(s) +} + +// scenario is the chaos scenario; it embeds base.Scenario and acts as its Delegate. +type scenario struct { + *base.Scenario +} + +// Description returns a one-line summary of the scenario. +func (s *scenario) Description() string { + return "Create 4.5k websites over 15 minutes and terminate a random shard every 5 minutes" +} + +// LongDescription returns a multi-line summary of the load and chaos generated by the scenario. +func (s *scenario) LongDescription() string { + return `The ` + ScenarioName + ` scenario generates load and chaos for the webhosting-operator: +- website creation: 4500 over 15m +- website spec changes: 0.5/m per object, max 37.5/s +- shard termination (pod deletion): 1 every 5m +` +} + +// Prepare creates 50 themes and 20 projects carrying the scenario's labels and owner reference. +func (s *scenario) Prepare(ctx context.Context) error { + s.Log.Info("Preparing themes") + if err := generator.CreateThemes(ctx, s.Client, 50, generator.WithLabels(s.Labels), 
generator.WithOwnerReference(s.OwnerRef)); err != nil { + return err + } + + s.Log.Info("Preparing projects") + if err := generator.CreateProjects(ctx, s.Client, 20, generator.WithLabels(s.Labels), generator.WithOwnerReference(s.OwnerRef)); err != nil { + return err + } + + return nil +} + +// Run adds the website-generator, website-mutator, and shard-terminator to the manager and then returns the result of s.Wait with a duration of 15 minutes. +func (s *scenario) Run(ctx context.Context) error { + // website-generator: creates about 4500 websites over 15 minutes + if err := (&generator.Every{ + Name: "website-generator", + Do: func(ctx context.Context, c client.Client) error { + return generator.CreateWebsite(ctx, c, generator.WithLabels(s.Labels)) + }, + Rate: rate.Limit(5), + }).AddToManager(s.Manager); err != nil { + return fmt.Errorf("error adding website-generator: %w", err) + } + + // trigger individual spec changes for website every other minute + // => peaks at about 37.5 spec changes per second at the end of the experiment + // (triggers roughly double the reconciliation rate in website controller because of deployment watches) + if err := (&generator.ForEach[*webhostingv1alpha1.Website]{ + Name: "website-mutator", + Do: func(ctx context.Context, c client.Client, obj *webhostingv1alpha1.Website) error { + return client.IgnoreNotFound(generator.MutateWebsite(ctx, c, obj, s.Labels)) + }, + Every: 2 * time.Minute, + }).AddToManager(s.Manager); err != nil { + return fmt.Errorf("error adding website-mutator: %w", err) + } + + // Terminate a random shard every 5 minutes + if err := (&generator.Every{ + Name: "shard-terminator", + Do: terminateRandomShard, + Rate: rate.Every(5 * time.Minute), + }).AddToManager(s.Manager); err != nil { + return fmt.Errorf("error adding shard-terminator: %w", err) + } + + return s.Wait(ctx, 15*time.Minute) +} + +// terminateRandomShard lists the pods in webhostingv1alpha1.NamespaceSystem labeled app.kubernetes.io/name=webhostingv1alpha1.WebhostingOperatorName and deletes one picked by utils.PickRandom; if the list is empty, it only logs a message. +func terminateRandomShard(ctx context.Context, c client.Client) error { + log := logf.FromContext(ctx) + + podList := &corev1.PodList{} + if err := c.List(ctx, podList, + client.InNamespace(webhostingv1alpha1.NamespaceSystem), + client.MatchingLabels{"app.kubernetes.io/name": 
webhostingv1alpha1.WebhostingOperatorName}, + ); err != nil { + return err + } + + if len(podList.Items) == 0 { + log.Info("No shards found, skipping termination") + return nil + } + + pod := utils.PickRandom(podList.Items) + if err := c.Delete(ctx, &pod); err != nil { + return err + } + + log.Info("Terminated shard", "pod", client.ObjectKeyFromObject(&pod)) + return nil +} diff --git a/webhosting-operator/pkg/experiment/scenario/scale-out/scale_out.go b/webhosting-operator/pkg/experiment/scenario/scale-out/scale_out.go index 9977251d..b9c73ff6 100644 --- a/webhosting-operator/pkg/experiment/scenario/scale-out/scale_out.go +++ b/webhosting-operator/pkg/experiment/scenario/scale-out/scale_out.go @@ -47,7 +47,7 @@ type scenario struct { } func (s *scenario) Description() string { - return "Scenario for testing scale-out with high churn rate" + return "Measure scale-out properties with a high churn rate" } func (s *scenario) LongDescription() string {