From 3bd91de72b49a01621aa28a1bf4746d25adc9a80 Mon Sep 17 00:00:00 2001 From: Thomas Jungblut Date: Thu, 14 Aug 2025 17:27:56 +0200 Subject: [PATCH 1/5] initial commit for quota testing --- cmd/cluster-etcd-operator/main.go | 2 + docs/loadtesting.md | 200 ++++++++++++++++++++++++ hack/featuregate_quota.yaml | 14 ++ pkg/cmd/load/load.go | 243 ++++++++++++++++++++++++++++++ 4 files changed, 459 insertions(+) create mode 100644 docs/loadtesting.md create mode 100644 hack/featuregate_quota.yaml create mode 100644 pkg/cmd/load/load.go diff --git a/cmd/cluster-etcd-operator/main.go b/cmd/cluster-etcd-operator/main.go index 6517dcef06..4780ab721b 100644 --- a/cmd/cluster-etcd-operator/main.go +++ b/cmd/cluster-etcd-operator/main.go @@ -4,6 +4,7 @@ import ( "context" goflag "flag" "fmt" + "github.com/openshift/cluster-etcd-operator/pkg/cmd/load" "io/ioutil" "math/rand" "os" @@ -78,6 +79,7 @@ func NewSSCSCommand(ctx context.Context) *cobra.Command { cmd.AddCommand(requestbackup.NewRequestBackupCommand(ctx)) cmd.AddCommand(rev.NewRevCommand(ctx)) cmd.AddCommand(backuprestore.NewBackupServer(ctx)) + cmd.AddCommand(load.NewLoadCommand(ctx)) return cmd } diff --git a/docs/loadtesting.md b/docs/loadtesting.md new file mode 100644 index 0000000000..f04cb7ac1e --- /dev/null +++ b/docs/loadtesting.md @@ -0,0 +1,200 @@ +# OpenShift etcd load testing + +While enabling the higher backend quota sizes, we needed to fill the etcd cluster with several gigabytes worth of data. +This sounds trivial, but has some surprising gotchas when using OpenShift. + +This doc gives a viable path to test large quotas, but without crashing your cluster. + +## What doesn't work and why + +The main limitation to just create a bunch of objects in etcd is the 2GiB limit on sending and receiving data with +GRPC [1]. +Imagine a secret listing across all namespaces call to the kube-apiserver, which is a very common theme many operators +will do in OpenShift (e.g. Prometheus or the Ingress Router to access your metrics and routes with certificates). + +etcd can only ever send back 2GiB worth of key/values from such resource key prefix and this limit is reached very +quickly. +Assuming we can create a secret with the maximum request size of 1.5MiB, we can put about 1366 secrets before we +hit the GRPC listing limitation. + +Another limitation you'll run into when doing this sort of creation/listing is the watch cache memory usage on +apiserver. +You can read more about this in [2], but the TL;DR is that the apiserver will allocate about O(watchers*page-size* +object-size*5) +bytes of memory. When there are ten watchers listing 2GiB of secrets, it would consume about 100 GiB of memory on the +apiserver. + +The above reasons drive the need to shard across different resources, and we're glad to have CRDs (custom resource +definitions). +Yet, when creating many thousand CRDs, you may run into another memory limitation: OpenAPISpec conversion. +About ten thousand CRDs will require 32GiB of RAM just to store and convert the OpenAPI definitions. Lack of RAM means +your control plane will thrash into a long winding and cascading death loop trying to swap in and out enough memory. 
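
As a back-of-envelope restatement of the estimates above (these are the rough figures already quoted in this section, not hard limits):

```
2 GiB gRPC response cap / ~1.5 MiB per secret       ≈ 1366 secrets before a full listing breaks
32 GiB of OpenAPI conversion memory / ~10,000 CRDs  ≈ 3 MiB of schema overhead per CRD
```
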
+ +## Enabling the quota feature + +Currently, the quota is in tech preview mode, so you have to manually enable it via: + +``` +$ oc apply -f hack/featuregate_quota.yaml +``` + +The quota can then be set up to 32GiBs using: + +``` +$ oc patch etcd/cluster --type=merge -p '{"spec": {"backendQuotaGiB": 32}}' +``` + +## KubeBurner + +Given we have to be much smarter when it comes to load testing, we can devise a two-step kube burner benchmark. +The first kube-burner run will set up a few hundred CRDs to not overwhelm apiserver, the second will fill each with +objects that contain up to 1.5MiB of string data. + +Save the upstream CRD example template below as: + +``` +cat <> example-crd.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: kubeburners{{.Iteration}}.cloudbulldozer{{.Iteration}}.example.com +spec: + group: cloudbulldozer{{.Iteration}}.example.com + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + spec: + type: object + properties: + workload: + type: string + iterations: + type: integer + scope: Cluster + names: + plural: kubeburners{{.Iteration}} + singular: kubeburner{{.Iteration}} + kind: KubeBurner{{.Iteration}} + shortNames: + - kb{{.Iteration}} +EOF +``` + +The following benchmark is our starter: + +``` +cat <> crd-scale.yaml +global: + gc: false +jobs: + - name: crd-scale + jobIterations: 100 + qps: 10 + burst: 10 + namespacedIterations: false + namespace: crd-scale + waitWhenFinished: false + objects: + - objectTemplate: example-crd.yaml + replicas: 1 +EOF +``` + +and you can run it with: + +``` +$ kube-burner init -c crd-scale.yaml +``` + +After a quick run, this should give you one hundred CRDs: + +``` +$ kubectl get crds -oname | grep burner +customresourcedefinition.apiextensions.k8s.io/kubeburners0.cloudbulldozer0.example.com +... +customresourcedefinition.apiextensions.k8s.io/kubeburners99.cloudbulldozer99.example.com +``` + +which we need to fill using another two sets of yaml files: + +``` +cat <> example-crd-instance.yaml +apiVersion: cloudbulldozer${i}.example.com/v1 +kind: KubeBurner${i} +metadata: + name: kb-{{.Iteration}}-{{.Replica}}-x +spec: + workload: "{{ randAlphaNum 10000 }}" + iterations: {{.Iteration}} +EOF +``` + +which is our actual resource that's taking the space. Note that we need to template the apiversion and kind with +a shell substitution due to a limitation in kube-burner [3]. + +Along with a new benchmark: + +``` +cat <> crd-fill.yaml +global: + gc: false +jobs: + - name: crd-fill + jobIterations: 100 + qps: 20 + burst: 20 + namespacedIterations: false + namespace: crd-fill + waitWhenFinished: false + objects: + {{- range $i , $val := until 100 }} + - objectTemplate: example-crd-instance-{{ $val }}.yaml + replicas: 100 + {{- end }} +EOF +``` + +Then generate all templates and run it with: + +``` +for i in {0..100}; do + cat example-crd-instance.yaml | i=$i envsubst > "example-crd-instance-${i}.yaml" +done + +$ kube-burner init -c crd-fill.yaml +``` + +An important place where etcd / CEO breaks, is the defrag routine. The defrag takes linear time w.r.t. the size of the +database and during the defragmentation the etcd member will not respond to any RPC calls. That may cause liveness +probes +to fail and crash loop the etcd instance. + +To also test defragmentation, we can run a third step to delete the CRD objects. 
CEO will then attempt to defrag +automatically: + +``` +$ kubectl get crd -oname | grep kubeburner | xargs kubectl delete +``` + +## CEO + +The above section on KubeBurner is also built into the CEO commandline utility. +You can run it from the CEO container directly, or alternatively, passing a kubeconfig on your local machine: + +``` +# uses the in-cluster config +$ cluster-etcd-operator load --g 20 +$ cluster-etcd-operator load --g 20 --kubeconfig $KUBECONFIG +``` + +which will load 20 gigabytes worth of namespaced CRDs and their content into the cluster via the apiserver. +Note, unlike kube-burner, this will not clean up anything, you will have to manually delete the namespaces. + +[1] https://github.com/grpc/grpc-go/issues/6623 +[2] https://github.com/kubernetes/enhancements/tree/master/keps/sig-api-machinery/3157-watch-list +[3] https://github.com/kube-burner/kube-burner/issues/862#issuecomment-2887285321 \ No newline at end of file diff --git a/hack/featuregate_quota.yaml b/hack/featuregate_quota.yaml new file mode 100644 index 0000000000..e983f006b2 --- /dev/null +++ b/hack/featuregate_quota.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: config.openshift.io/v1 +kind: FeatureGate +metadata: + annotations: + include.release.openshift.io/self-managed-high-availability: "true" + include.release.openshift.io/single-node-developer: "true" + release.openshift.io/create-only: "true" + name: cluster +spec: + customNoUpgrade: + enabled: + - EtcdBackendQuota + featureSet: CustomNoUpgrade diff --git a/pkg/cmd/load/load.go b/pkg/cmd/load/load.go new file mode 100644 index 0000000000..831fdcb832 --- /dev/null +++ b/pkg/cmd/load/load.go @@ -0,0 +1,243 @@ +package load + +import ( + "context" + goflag "flag" + "fmt" + "github.com/spf13/cobra" + corev1 "k8s.io/api/core/v1" + v1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + crdclientv1 "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/util/retry" + "math/rand" + "strings" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/klog/v2" +) + +type loadOpts struct { + loadGigabytes int + kubeconfigPath string +} + +func newLoadOpts() *loadOpts { + return &loadOpts{} +} + +func NewLoadCommand(ctx context.Context) *cobra.Command { + opts := newLoadOpts() + cmd := &cobra.Command{ + Use: "load", + Short: "Loads several gigabytes of CRDs into the cluster via the apiserver.", + Run: func(cmd *cobra.Command, args []string) { + defer klog.Flush() + + if err := opts.Run(ctx); err != nil { + klog.Fatal(err) + } + }, + } + + opts.AddFlags(cmd) + return cmd +} + +func (r *loadOpts) AddFlags(cmd *cobra.Command) { + fs := cmd.Flags() + fs.IntVar(&r.loadGigabytes, "g", 8, "How many gigabytes to load, defaults to 8") + fs.StringVar(&r.kubeconfigPath, "kubeconfig", "", "Path to the kubeconfig file to use for CLI requests, uses in-cluster config otherwise") + + // adding klog flags to tune verbosity better + gfs := goflag.NewFlagSet("", goflag.ExitOnError) + klog.InitFlags(gfs) + cmd.Flags().AddGoFlagSet(gfs) +} + +func (r *loadOpts) Run(ctx context.Context) error { + var clientConfig *rest.Config + if r.kubeconfigPath != "" { + cc, err := clientcmd.BuildConfigFromFlags("", r.kubeconfigPath) + if err != nil { + return err 
+ } + clientConfig = cc + } else { + klog.Infof("assuming in-cluster config") + cc, err := rest.InClusterConfig() + if err != nil { + return err + } + clientConfig = cc + } + + protoConfig := rest.CopyConfig(clientConfig) + protoConfig.AcceptContentTypes = "application/vnd.kubernetes.protobuf,application/json" + protoConfig.ContentType = "application/vnd.kubernetes.protobuf" + kubeClient, err := kubernetes.NewForConfig(protoConfig) + if err != nil { + return err + } + dynClient, err := dynamic.NewForConfig(protoConfig) + if err != nil { + return err + } + crdClient, err := crdclientv1.NewForConfig(protoConfig) + if err != nil { + return err + } + + randSuffix := randSeq(5) + klog.Info("generated random suffix: " + randSuffix) + randomContent := randSeq(1024 * 1024 * 1.25) + + for i := 0; i < r.loadGigabytes; i++ { + klog.Infof("loading gigabyte #%d", i+1) + ns, err := r.createNamespace(ctx, kubeClient, randSuffix, i) + if err != nil { + return err + } + klog.Infof("created namespace %s", ns.Name) + + createdCrd, err := r.createCustomResource(ctx, crdClient, randSuffix, i) + if err != nil { + return err + } + + klog.Infof("created CRD %s", createdCrd.Name) + + gvr := schema.GroupVersionResource{Group: createdCrd.Spec.Group, Version: "v1", Resource: createdCrd.Spec.Names.Kind} + klog.Infof("waiting for CRD %s to become ready...", createdCrd.Name) + err = r.waitForCRD(ctx, createdCrd, randSuffix, ns, dynClient, gvr) + if err != nil { + return err + } + + klog.Infof("CRD %s is ready, loading a gigabyte of data now...", createdCrd.Name) + // one CRD will create about 1.3mb worth of data, we want to create about a gigabyte worth of it + numIterations := 800 + for j := 0; j < numIterations; j++ { + err := retry.OnError(wait.Backoff{Steps: 10, Duration: 1 * time.Second, Factor: 2, Jitter: 0.1}, func(err error) bool { + return errors.IsServerTimeout(err) || errors.IsConflict(err) || + errors.IsTooManyRequests(err) || errors.IsServiceUnavailable(err) || + errors.IsTimeout(err) || strings.Contains(err.Error(), "etcdserver: request timed out") || + strings.Contains(err.Error(), "unexpected EOF") + }, func() error { + k := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": gvr.GroupVersion().String(), + "kind": createdCrd.Spec.Names.Kind, + "metadata": map[string]interface{}{ + "name": fmt.Sprintf("ceo-load-inst-%s-%d", randSuffix, j), + "namespace": ns.Name, + }, + "spec": map[string]interface{}{ + "content": randomContent, + }, + }} + + _, err := dynClient.Resource(gvr).Namespace(ns.Name).Create(ctx, k, metav1.CreateOptions{}) + if err != nil { + klog.Errorf("failed to create instance of CRD %s/%s in namespace %s: %v", createdCrd.Name, k.GetName(), ns.Name, err) + return err + } + + return nil + }) + if err != nil { + return err + } + + if j%10 == 0 { + klog.Infof("created CRD instance %d/%d of CRD %s in namespace %s", j, numIterations, createdCrd.Name, ns.Name) + } + } + } + + klog.Info("done") + + return nil +} + +func (r *loadOpts) waitForCRD(ctx context.Context, createdCrd *v1.CustomResourceDefinition, randSuffix string, ns *corev1.Namespace, dynClient *dynamic.DynamicClient, gvr schema.GroupVersionResource) error { + // the discovery of the CRD may take some time and thus the api would return 404 errors in the meantime + return retry.OnError(wait.Backoff{Steps: 10, Duration: 5 * time.Second, Factor: 2, Jitter: 0.1}, errors.IsNotFound, func() error { + k := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": gvr.GroupVersion().String(), + "kind": 
createdCrd.Spec.Names.Kind, + "metadata": map[string]interface{}{ + "name": fmt.Sprintf("ceo-load-inst-%s-test", randSuffix), + "namespace": ns.Name, + }, + "spec": map[string]interface{}{ + "content": "test", + }, + }} + _, err := dynClient.Resource(gvr).Namespace(ns.Name).Create(ctx, k, metav1.CreateOptions{}) + if err != nil { + klog.Errorf("failed to create test instance of CRD %s/%s in namespace %s: %v", createdCrd.Name, k.GetName(), ns.Name, err) + return err + } + return nil + }) +} + +func (r *loadOpts) createNamespace(ctx context.Context, kubeClient *kubernetes.Clientset, randSuffix string, i int) (*corev1.Namespace, error) { + return kubeClient.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("ceo-load-%s-%d", randSuffix, i)}}, metav1.CreateOptions{}) +} + +func (r *loadOpts) createCustomResource(ctx context.Context, crdClient *crdclientv1.Clientset, randSuffix string, i int) (*v1.CustomResourceDefinition, error) { + crd := &v1.CustomResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("load-crd-%s-%d.g%d.openshift.com", randSuffix, i, i), + }, + Spec: v1.CustomResourceDefinitionSpec{ + Group: fmt.Sprintf("g%d.openshift.com", i), + Versions: []v1.CustomResourceDefinitionVersion{ + {Name: "v1", + Served: true, + Storage: true, + Schema: &v1.CustomResourceValidation{ + OpenAPIV3Schema: &v1.JSONSchemaProps{Type: "object", + Properties: map[string]v1.JSONSchemaProps{ + "spec": {Type: "object", Properties: map[string]v1.JSONSchemaProps{ + "content": {Type: "string"}, + }}, + }}, + }}, + }, + Scope: v1.NamespaceScoped, + Names: v1.CustomResourceDefinitionNames{ + Plural: fmt.Sprintf("load-crd-%s-%d", randSuffix, i), + Singular: fmt.Sprintf("load-crd-%s-%d", randSuffix, i), + Kind: fmt.Sprintf("load-crd-%s-%d", randSuffix, i), + ShortNames: []string{ + fmt.Sprintf("lcrd-%s-%d", randSuffix, i), + }, + }, + }, + } + createdCrd, err := crdClient.ApiextensionsV1().CustomResourceDefinitions().Create(ctx, crd, metav1.CreateOptions{}) + if err != nil { + return nil, err + } + return createdCrd, nil +} + +func randSeq(n int) string { + l := []rune("abcdefghijklmnopqrstuvwxyz") + b := make([]rune, n) + for i := range b { + b[i] = l[rand.Intn(len(l))] + } + return string(b) +} From 8f290d800a59cf90242c19b413da51585a5dc714 Mon Sep 17 00:00:00 2001 From: Thomas Jungblut Date: Mon, 18 Aug 2025 17:11:29 +0200 Subject: [PATCH 2/5] more errors --- docs/loadtesting.md | 5 ++--- pkg/cmd/load/load.go | 7 +++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/loadtesting.md b/docs/loadtesting.md index f04cb7ac1e..b66be87dc4 100644 --- a/docs/loadtesting.md +++ b/docs/loadtesting.md @@ -171,8 +171,7 @@ $ kube-burner init -c crd-fill.yaml An important place where etcd / CEO breaks, is the defrag routine. The defrag takes linear time w.r.t. the size of the database and during the defragmentation the etcd member will not respond to any RPC calls. That may cause liveness -probes -to fail and crash loop the etcd instance. +probes to fail and crash loop the etcd instance. To also test defragmentation, we can run a third step to delete the CRD objects. 
CEO will then attempt to defrag automatically: @@ -197,4 +196,4 @@ Note, unlike kube-burner, this will not clean up anything, you will have to manu [1] https://github.com/grpc/grpc-go/issues/6623 [2] https://github.com/kubernetes/enhancements/tree/master/keps/sig-api-machinery/3157-watch-list -[3] https://github.com/kube-burner/kube-burner/issues/862#issuecomment-2887285321 \ No newline at end of file +[3] https://github.com/kube-burner/kube-burner/issues/862#issuecomment-2887285321 diff --git a/pkg/cmd/load/load.go b/pkg/cmd/load/load.go index 831fdcb832..4166bd59c7 100644 --- a/pkg/cmd/load/load.go +++ b/pkg/cmd/load/load.go @@ -127,10 +127,13 @@ func (r *loadOpts) Run(ctx context.Context) error { numIterations := 800 for j := 0; j < numIterations; j++ { err := retry.OnError(wait.Backoff{Steps: 10, Duration: 1 * time.Second, Factor: 2, Jitter: 0.1}, func(err error) bool { + sErr := err.Error() return errors.IsServerTimeout(err) || errors.IsConflict(err) || errors.IsTooManyRequests(err) || errors.IsServiceUnavailable(err) || - errors.IsTimeout(err) || strings.Contains(err.Error(), "etcdserver: request timed out") || - strings.Contains(err.Error(), "unexpected EOF") + errors.IsTimeout(err) || strings.Contains(sErr, "etcdserver: request timed out") || + strings.Contains(sErr, "unexpected EOF") || + strings.Contains(sErr, "context deadline exceeded") || + strings.Contains(sErr, "rpc error: code = Unavailable") }, func() error { k := &unstructured.Unstructured{Object: map[string]interface{}{ "apiVersion": gvr.GroupVersion().String(), From 4ee69f4d8e2a970ed0eb4e89dbabbd36c2ed5a1e Mon Sep 17 00:00:00 2001 From: Thomas Jungblut Date: Wed, 27 Aug 2025 08:48:57 +0200 Subject: [PATCH 3/5] add more --- docs/load-testing.md | 279 +++++++++++++++++++++++++++++++++++++++++++ docs/loadtesting.md | 199 ------------------------------ 2 files changed, 279 insertions(+), 199 deletions(-) create mode 100644 docs/load-testing.md delete mode 100644 docs/loadtesting.md diff --git a/docs/load-testing.md b/docs/load-testing.md new file mode 100644 index 0000000000..519f715c8f --- /dev/null +++ b/docs/load-testing.md @@ -0,0 +1,279 @@ +# OpenShift etcd load testing + +While enabling the higher backend quota sizes, we needed to fill the etcd cluster with several gigabytes worth of data. +This sounds trivial, but has some surprising gotchas when using OpenShift. + +This doc gives a viable path to test large quotas, but without crashing your cluster. + +## What doesn't work and why + +The main limitation to just create a bunch of objects in etcd is the 2GiB limit on sending and receiving data with +GRPC [1]. +Imagine a secret listing across all namespaces call to the kube-apiserver, which is a very common theme many operators +will do in OpenShift (e.g. Prometheus or the Ingress Router to access your metrics and routes with certificates). + +etcd can only ever send back 2GiB worth of key/values from such resource key prefix and this limit is reached very +quickly. Assuming we can create a secret with the maximum request size of 1.5MiB, we can put about 1366 secrets before we +hit the GRPC listing limitation. + +Another limitation you'll run into when doing this sort of creation/listing is the watch cache memory usage on +apiserver. You can read more about this in [2], but the TL;DR is that the apiserver will allocate about O(watchers*page-size* +object-size*5) bytes of memory. When there are ten watchers listing 2GiB of secrets, it would consume about 100 GiB of memory on the +apiserver. 
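
Plugging the quoted numbers into that formula makes the estimate concrete (treating page-size * object-size as roughly the size of one full list response, as the example in the text does):

```
watchers * page-size * object-size * 5
  ≈ 10 watchers * 2 GiB per full list * 5
  ≈ 100 GiB of apiserver memory
```
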
+ +You can alleviate some of it by enabling the `ResilientWatchCacheInitialization` feature gate on kube-apiserver, which should become the default starting with OCP 4.21. + +The above reasons drive the need to shard across different resources, for maximum flexibility we can use CRDs. +Yet, when creating many thousand CRDs, you may run into another memory limitation: OpenAPISpec conversion. +About ten thousand CRDs will require 32GiB of RAM just to store and convert the OpenAPI definitions. + +Closely related to high watch cache memory consumption is also the kube-controller-manager, which lists and watches all resources in the entire cluster [3]. +This is primarily driven by two controllers, the garbage-collector- and the resourcequota-controller. +Both may need to refresh their caches (e.g. due to restarts on certificate rotation), which causes them to "watch storm" the entire control plane. +If you do not have enough memory to absorb this, you may end up in a cascading thrash fest of the control plane. + +### Getting out of the thrash loop + +This manifests usually by having two control plane nodes, out of three, having high load, CPU and disk IO due to swapping. +After the first node is down with OOM, it will load over to the second and cause a cascading failure. +The third will not be impacted anymore, because after the second node the etcd quorum is broken and nothing can be read from the +last remaining etcd member anymore. + +Because these two machines are usually inaccessible at this point, you may need to reset/restart them physically or via KVM. +When you're in a cloud setting, you may also consider to scale-up the machine type to a higher memory SKU at this point to absorb +the additional memory required by the watch caches. + +After a restart, you have a brief period before the kube-apiserver and KCM static pods come up again, so the first step is to move their +static pod manifest on all control plane nodes: + +``` +$ sudo mv /etc/kubernetes/manifests/kube-apiserver-pod.yaml . +$ sudo mv /etc/kubernetes/manifests/kube-controller-manager-pod.yaml . +``` + +This ensures that etcd can come up on its own, we can verify this by running crictl: + +``` +$ sudo crictl exec $(sudo crictl ps --name etcdctl -q) etcdctl endpoint status -wtable ++-----------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ +| ENDPOINT | ID | VERSION | DB SIZE | IS LEADER | IS LEARNER | RAFT TERM | RAFT INDEX | RAFT APPLIED INDEX | ERRORS | ++-----------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ +| https://10.0.0.3:2379 | e881076f3e58fb35 | 3.5.21 | 23 GB | true | false | 16 | 422782 | 422782 | | +| https://10.0.0.5:2379 | 1eb1009e96533fa7 | 3.5.21 | 23 GB | false | false | 16 | 422782 | 422782 | | +| https://10.0.0.6:2379 | 51ff8a825f19c7d2 | 3.5.21 | 23 GB | false | false | 16 | 422782 | 422782 | | ++-----------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ +``` + +Now, pick a node where you can safely bring the apiserver back again using: + +``` +$ sudo mv kube-apiserver-pod.yaml /etc/kubernetes/manifests/ +``` + +Wait and observe the memory usage using `free` or `top`. After starting up successfully, the apiserver should respond again from `oc`. 
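
For example (any equivalent memory and readiness check will do):

```
# watch memory pressure on the node while the static pod starts
$ watch -n 5 free -h

# once the kube-apiserver is serving again, its readiness endpoint answers
$ oc get --raw='/readyz?verbose'
```
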
+If the memory usage is still not stable for continuous usage, and you're not able to add more RAM, you have to resort to deleting data directly from etcd. + +Ensure that the apiserver is shut down again by moving the static pod out as described earlier, then ensure etcd is stable. +In the load testing case, we can very easily delete the CRD prefixes we have created: + +``` +# contains keys like: +# /kubernetes.io/g3.openshift.com/load-crd-apwzf-8/ceo-load-apwzf-8/ceo-load-inst-apwzf-641 +# /kubernetes.io/g3.openshift.com/load-crd-apwzf-8/ceo-load-apwzf-8/ceo-load-inst-apwzf-642 + +$ DANGER $ sudo crictl exec $(sudo crictl ps --name etcdctl -q) etcdctl del --prefix /kubernetes.io/g3.openshift.com/ +``` + +If you have deleted enough CRDs, you may be able to bring apiserver back up again to start with stable memory usage. + +Now, bringing back KCM is the last step. +You may also try to disable the respective controllers on KCM to avoid memory spikes and spurious watch storms for starters. + +This can be done by injecting an `unsupportedConfigOverrides`: +``` +$ oc edit kubecontrollermanager.operator.openshift.io/cluster +spec: + unsupportedConfigOverrides: + extendedArguments: + controllers: ["*","-ttl","-bootstrapsigner","-tokencleaner","selinux-warning-controller","-garbage-collector-controller","-resourcequota-controller"] +``` + +The minus in front of the controller name will disable them. + +The operator (if still running) will then proceed to roll out a new static pod. In the likely case that nothing will happen, +because no pods can be scheduled without KCM, you can also resort to editing the latest revision config directly on the node +and then moving the static pod back again: + +``` +$ sudo vi /etc/kubernetes/static-pod-resources/kube-controller-manager-pod-16/configmaps/config/config.yaml +$ sudo mv kube-controller-manager-pod.yaml /etc/kubernetes/manifests/ +``` + +## Enabling the quota feature + +Currently, the quota is in tech preview mode, so you have to manually enable it via: + +``` +$ oc apply -f hack/featuregate_quota.yaml +``` + +The quota can then be set up to 32GiBs using: + +``` +$ oc patch etcd/cluster --type=merge -p '{"spec": {"backendQuotaGiB": 32}}' +``` + +## KubeBurner + +Given we have to be much smarter when it comes to load testing, we can devise a two-step kube burner benchmark. +The first kube-burner run will set up a few hundred CRDs to not overwhelm apiserver, the second will fill each with +objects that contain up to 1.5MiB of string data. 
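
While the fill below is running, it helps to keep an eye on the etcd database size relative to the configured quota, for example by re-running the `etcdctl endpoint status` invocation from the thrash loop section above:

```
$ sudo crictl exec $(sudo crictl ps --name etcdctl -q) etcdctl endpoint status -wtable
```
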
+ +Save the upstream CRD example template below as: + +``` +cat <> example-crd.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: kubeburners{{.Iteration}}.cloudbulldozer{{.Iteration}}.example.com +spec: + group: cloudbulldozer{{.Iteration}}.example.com + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + spec: + type: object + properties: + workload: + type: string + iterations: + type: integer + scope: Cluster + names: + plural: kubeburners{{.Iteration}} + singular: kubeburner{{.Iteration}} + kind: KubeBurner{{.Iteration}} + shortNames: + - kb{{.Iteration}} +EOF +``` + +The following benchmark is our starter: + +``` +cat <> crd-scale.yaml +global: + gc: false +jobs: + - name: crd-scale + jobIterations: 100 + qps: 10 + burst: 10 + namespacedIterations: false + namespace: crd-scale + waitWhenFinished: false + objects: + - objectTemplate: example-crd.yaml + replicas: 1 +EOF +``` + +and you can run it with: + +``` +$ kube-burner init -c crd-scale.yaml +``` + +After a quick run, this should give you one hundred CRDs: + +``` +$ kubectl get crds -oname | grep burner +customresourcedefinition.apiextensions.k8s.io/kubeburners0.cloudbulldozer0.example.com +... +customresourcedefinition.apiextensions.k8s.io/kubeburners99.cloudbulldozer99.example.com +``` + +which we need to fill using another two sets of yaml files: + +``` +cat <> example-crd-instance.yaml +apiVersion: cloudbulldozer${i}.example.com/v1 +kind: KubeBurner${i} +metadata: + name: kb-{{.Iteration}}-{{.Replica}}-x +spec: + workload: "{{ randAlphaNum 10000 }}" + iterations: {{.Iteration}} +EOF +``` + +which is our actual resource that's taking the space. Note that we need to template the apiversion and kind with +a shell substitution due to a limitation in kube-burner [4]. + +Along with a new benchmark: + +``` +cat <> crd-fill.yaml +global: + gc: false +jobs: + - name: crd-fill + jobIterations: 100 + qps: 20 + burst: 20 + namespacedIterations: false + namespace: crd-fill + waitWhenFinished: false + objects: + {{- range $i , $val := until 100 }} + - objectTemplate: example-crd-instance-{{ $val }}.yaml + replicas: 100 + {{- end }} +EOF +``` + +Then generate all templates and run it with: + +``` +for i in {0..100}; do + cat example-crd-instance.yaml | i=$i envsubst > "example-crd-instance-${i}.yaml" +done + +$ kube-burner init -c crd-fill.yaml +``` + +An important place where etcd / CEO breaks, is the defrag routine. The defrag takes linear time w.r.t. the size of the +database and during the defragmentation the etcd member will not respond to any RPC calls. That may cause liveness +probes to fail and crash loop the etcd instance. + +To also test defragmentation, we can run a third step to delete the CRD objects. CEO will then attempt to defrag +automatically: + +``` +$ kubectl get crd -oname | grep kubeburner | xargs kubectl delete +``` + +## CEO + +The above section on KubeBurner is also built into the CEO commandline utility. +You can run it from the CEO container directly, or alternatively, passing a kubeconfig on your local machine: + +``` +# uses the in-cluster config +$ cluster-etcd-operator load --g 20 +$ cluster-etcd-operator load --g 20 --kubeconfig $KUBECONFIG +``` + +which will load 20 gigabytes worth of namespaced CRDs and their content into the cluster via the apiserver. +Note, unlike kube-burner, this will not clean up anything, you will have to manually delete the namespaces. 
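
A possible cleanup, assuming the `ceo-load-*` namespace and `load-crd-*` CRD names generated by `pkg/cmd/load/load.go` shown earlier (deleting the CRDs also removes their stored instances and, as described above, lets CEO trigger defragmentation):

```
# delete the namespaces created by the load command
$ kubectl get ns -oname | grep ceo-load | xargs kubectl delete

# delete the generated CRDs themselves
$ kubectl get crd -oname | grep load-crd | xargs kubectl delete
```
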
+ +[1] https://github.com/grpc/grpc-go/issues/6623 +[2] https://github.com/kubernetes/enhancements/tree/master/keps/sig-api-machinery/3157-watch-list +[3] https://github.com/kubernetes/kubernetes/issues/124680 +[4] https://github.com/kube-burner/kube-burner/issues/862#issuecomment-2887285321 diff --git a/docs/loadtesting.md b/docs/loadtesting.md deleted file mode 100644 index b66be87dc4..0000000000 --- a/docs/loadtesting.md +++ /dev/null @@ -1,199 +0,0 @@ -# OpenShift etcd load testing - -While enabling the higher backend quota sizes, we needed to fill the etcd cluster with several gigabytes worth of data. -This sounds trivial, but has some surprising gotchas when using OpenShift. - -This doc gives a viable path to test large quotas, but without crashing your cluster. - -## What doesn't work and why - -The main limitation to just create a bunch of objects in etcd is the 2GiB limit on sending and receiving data with -GRPC [1]. -Imagine a secret listing across all namespaces call to the kube-apiserver, which is a very common theme many operators -will do in OpenShift (e.g. Prometheus or the Ingress Router to access your metrics and routes with certificates). - -etcd can only ever send back 2GiB worth of key/values from such resource key prefix and this limit is reached very -quickly. -Assuming we can create a secret with the maximum request size of 1.5MiB, we can put about 1366 secrets before we -hit the GRPC listing limitation. - -Another limitation you'll run into when doing this sort of creation/listing is the watch cache memory usage on -apiserver. -You can read more about this in [2], but the TL;DR is that the apiserver will allocate about O(watchers*page-size* -object-size*5) -bytes of memory. When there are ten watchers listing 2GiB of secrets, it would consume about 100 GiB of memory on the -apiserver. - -The above reasons drive the need to shard across different resources, and we're glad to have CRDs (custom resource -definitions). -Yet, when creating many thousand CRDs, you may run into another memory limitation: OpenAPISpec conversion. -About ten thousand CRDs will require 32GiB of RAM just to store and convert the OpenAPI definitions. Lack of RAM means -your control plane will thrash into a long winding and cascading death loop trying to swap in and out enough memory. - -## Enabling the quota feature - -Currently, the quota is in tech preview mode, so you have to manually enable it via: - -``` -$ oc apply -f hack/featuregate_quota.yaml -``` - -The quota can then be set up to 32GiBs using: - -``` -$ oc patch etcd/cluster --type=merge -p '{"spec": {"backendQuotaGiB": 32}}' -``` - -## KubeBurner - -Given we have to be much smarter when it comes to load testing, we can devise a two-step kube burner benchmark. -The first kube-burner run will set up a few hundred CRDs to not overwhelm apiserver, the second will fill each with -objects that contain up to 1.5MiB of string data. 
- -Save the upstream CRD example template below as: - -``` -cat <> example-crd.yaml -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: kubeburners{{.Iteration}}.cloudbulldozer{{.Iteration}}.example.com -spec: - group: cloudbulldozer{{.Iteration}}.example.com - versions: - - name: v1 - served: true - storage: true - schema: - openAPIV3Schema: - type: object - properties: - spec: - type: object - properties: - workload: - type: string - iterations: - type: integer - scope: Cluster - names: - plural: kubeburners{{.Iteration}} - singular: kubeburner{{.Iteration}} - kind: KubeBurner{{.Iteration}} - shortNames: - - kb{{.Iteration}} -EOF -``` - -The following benchmark is our starter: - -``` -cat <> crd-scale.yaml -global: - gc: false -jobs: - - name: crd-scale - jobIterations: 100 - qps: 10 - burst: 10 - namespacedIterations: false - namespace: crd-scale - waitWhenFinished: false - objects: - - objectTemplate: example-crd.yaml - replicas: 1 -EOF -``` - -and you can run it with: - -``` -$ kube-burner init -c crd-scale.yaml -``` - -After a quick run, this should give you one hundred CRDs: - -``` -$ kubectl get crds -oname | grep burner -customresourcedefinition.apiextensions.k8s.io/kubeburners0.cloudbulldozer0.example.com -... -customresourcedefinition.apiextensions.k8s.io/kubeburners99.cloudbulldozer99.example.com -``` - -which we need to fill using another two sets of yaml files: - -``` -cat <> example-crd-instance.yaml -apiVersion: cloudbulldozer${i}.example.com/v1 -kind: KubeBurner${i} -metadata: - name: kb-{{.Iteration}}-{{.Replica}}-x -spec: - workload: "{{ randAlphaNum 10000 }}" - iterations: {{.Iteration}} -EOF -``` - -which is our actual resource that's taking the space. Note that we need to template the apiversion and kind with -a shell substitution due to a limitation in kube-burner [3]. - -Along with a new benchmark: - -``` -cat <> crd-fill.yaml -global: - gc: false -jobs: - - name: crd-fill - jobIterations: 100 - qps: 20 - burst: 20 - namespacedIterations: false - namespace: crd-fill - waitWhenFinished: false - objects: - {{- range $i , $val := until 100 }} - - objectTemplate: example-crd-instance-{{ $val }}.yaml - replicas: 100 - {{- end }} -EOF -``` - -Then generate all templates and run it with: - -``` -for i in {0..100}; do - cat example-crd-instance.yaml | i=$i envsubst > "example-crd-instance-${i}.yaml" -done - -$ kube-burner init -c crd-fill.yaml -``` - -An important place where etcd / CEO breaks, is the defrag routine. The defrag takes linear time w.r.t. the size of the -database and during the defragmentation the etcd member will not respond to any RPC calls. That may cause liveness -probes to fail and crash loop the etcd instance. - -To also test defragmentation, we can run a third step to delete the CRD objects. CEO will then attempt to defrag -automatically: - -``` -$ kubectl get crd -oname | grep kubeburner | xargs kubectl delete -``` - -## CEO - -The above section on KubeBurner is also built into the CEO commandline utility. -You can run it from the CEO container directly, or alternatively, passing a kubeconfig on your local machine: - -``` -# uses the in-cluster config -$ cluster-etcd-operator load --g 20 -$ cluster-etcd-operator load --g 20 --kubeconfig $KUBECONFIG -``` - -which will load 20 gigabytes worth of namespaced CRDs and their content into the cluster via the apiserver. -Note, unlike kube-burner, this will not clean up anything, you will have to manually delete the namespaces. 
- -[1] https://github.com/grpc/grpc-go/issues/6623 -[2] https://github.com/kubernetes/enhancements/tree/master/keps/sig-api-machinery/3157-watch-list -[3] https://github.com/kube-burner/kube-burner/issues/862#issuecomment-2887285321 From 05dc9c9788ab80049e1888129b4e9ce14d81b52a Mon Sep 17 00:00:00 2001 From: Thomas Jungblut Date: Wed, 27 Aug 2025 08:52:22 +0200 Subject: [PATCH 4/5] add more --- docs/load-testing.md | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/docs/load-testing.md b/docs/load-testing.md index 519f715c8f..659c0cb142 100644 --- a/docs/load-testing.md +++ b/docs/load-testing.md @@ -5,6 +5,22 @@ This sounds trivial, but has some surprising gotchas when using OpenShift. This doc gives a viable path to test large quotas, but without crashing your cluster. + +## Enabling the quota feature + +Currently, the quota is in tech preview mode, so you have to manually enable it via: + +``` +$ oc apply -f hack/featuregate_quota.yaml +``` + +The quota can then be set up to 32GiBs using: + +``` +$ oc patch etcd/cluster --type=merge -p '{"spec": {"backendQuotaGiB": 32}}' +``` + + ## What doesn't work and why The main limitation to just create a bunch of objects in etcd is the 2GiB limit on sending and receiving data with @@ -109,20 +125,6 @@ $ sudo vi /etc/kubernetes/static-pod-resources/kube-controller-manager-pod-16/co $ sudo mv kube-controller-manager-pod.yaml /etc/kubernetes/manifests/ ``` -## Enabling the quota feature - -Currently, the quota is in tech preview mode, so you have to manually enable it via: - -``` -$ oc apply -f hack/featuregate_quota.yaml -``` - -The quota can then be set up to 32GiBs using: - -``` -$ oc patch etcd/cluster --type=merge -p '{"spec": {"backendQuotaGiB": 32}}' -``` - ## KubeBurner Given we have to be much smarter when it comes to load testing, we can devise a two-step kube burner benchmark. From 566022829c1dd955d9b2ded25332f4ef531ce073 Mon Sep 17 00:00:00 2001 From: Thomas Jungblut Date: Thu, 28 Aug 2025 12:17:10 +0200 Subject: [PATCH 5/5] add error --- pkg/cmd/load/load.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pkg/cmd/load/load.go b/pkg/cmd/load/load.go index 4166bd59c7..4eda6ae55f 100644 --- a/pkg/cmd/load/load.go +++ b/pkg/cmd/load/load.go @@ -4,6 +4,10 @@ import ( "context" goflag "flag" "fmt" + "math/rand" + "strings" + "time" + "github.com/spf13/cobra" corev1 "k8s.io/api/core/v1" v1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" @@ -15,9 +19,6 @@ import ( "k8s.io/client-go/dynamic" "k8s.io/client-go/tools/clientcmd" "k8s.io/client-go/util/retry" - "math/rand" - "strings" - "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" @@ -133,7 +134,8 @@ func (r *loadOpts) Run(ctx context.Context) error { errors.IsTimeout(err) || strings.Contains(sErr, "etcdserver: request timed out") || strings.Contains(sErr, "unexpected EOF") || strings.Contains(sErr, "context deadline exceeded") || - strings.Contains(sErr, "rpc error: code = Unavailable") + strings.Contains(sErr, "rpc error: code = Unavailable") || + strings.Contains(sErr, "connection reset by peer") }, func() error { k := &unstructured.Unstructured{Object: map[string]interface{}{ "apiVersion": gvr.GroupVersion().String(),