Skip to content

Commit 18ae471

Browse files
authored
feat: cluster metrics (#7)
Signed-off-by: Zbigniew Mandziejewicz <shaxbee@gmail.com>
1 parent 2cfb119 commit 18ae471

File tree

18 files changed

+896
-67
lines changed

18 files changed

+896
-67
lines changed

.github/workflows/ci.yaml

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -56,15 +56,15 @@ jobs:
5656
run: make lint
5757
- name: Integration test
5858
run: make integration-test coverage
59-
- name: Archive coverage report
60-
uses: actions/upload-artifact@v4
59+
- name: Upload coverage reports to Codecov
60+
uses: codecov/codecov-action@v5
6161
with:
62-
name: coverage
63-
path: ${{ env.CODECOV_FILE }}
64-
62+
token: ${{ secrets.CODECOV_TOKEN }}
63+
files: ${{ env.CODECOV_FILE }}
6564
e2e:
6665
name: E2E Test
67-
if: github.actor!= 'dependabot-preview[bot]'
66+
if: false
67+
# if: github.actor!= 'dependabot-preview[bot]'
6868
needs:
6969
- verify
7070
- image-build
@@ -108,25 +108,3 @@ jobs:
108108
with:
109109
name: coverage-e2e
110110
path: ${{ env.CODECOV_FILE }}
111-
112-
coverage:
113-
name: Coverage Report
114-
if: github.actor!= 'dependabot-preview[bot]'
115-
needs:
116-
- verify
117-
- e2e
118-
runs-on: ubuntu-latest
119-
steps:
120-
- name: Download coverage report
121-
uses: actions/download-artifact@v4
122-
with:
123-
name: coverage
124-
- name: Download E2E coverage report
125-
uses: actions/download-artifact@v4
126-
with:
127-
name: coverage-e2e
128-
- name: Upload coverage reports to Codecov
129-
uses: codecov/codecov-action@v5
130-
with:
131-
token: ${{ secrets.CODECOV_TOKEN }}
132-
files: coverage.xml,coverage.e2e.xml

cmd/etcd-operator/command.go

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ import (
44
"flag"
55
"fmt"
66
"os"
7-
"time"
87

8+
"github.com/agoda-com/etcd-operator/pkg/backup"
99
"github.com/spf13/cobra"
1010

1111
corev1 "k8s.io/api/core/v1"
@@ -25,15 +25,16 @@ const (
2525
)
2626

2727
type Config struct {
28-
Pod client.ObjectKey
29-
MetricsAddr string
30-
HealthProbeAddr string
31-
LeaderElection bool
32-
WatchNamespaces []string
33-
WatchSelector LabelSelector
34-
Image string
35-
ControllerImage string
36-
BackupRetention time.Duration
28+
Pod client.ObjectKey
29+
MetricsAddr string
30+
HealthProbeAddr string
31+
LeaderElection bool
32+
WatchNamespaces []string
33+
WatchSelector LabelSelector
34+
Image string
35+
ControllerImage string
36+
PriorityClassName string
37+
BackupEnv map[string]string
3738
}
3839

3940
func Command() *cobra.Command {
@@ -50,13 +51,15 @@ func Command() *cobra.Command {
5051
Namespace: os.Getenv("POD_NAMESPACE"),
5152
Name: os.Getenv("POD_NAME"),
5253
},
54+
BackupEnv: backup.LoadEnv(),
5355
}
5456
flags.StringSliceVar(&config.WatchNamespaces, "watch-namespaces", nil, "Namespaces to watch for resources.")
5557
flags.Var(&config.WatchSelector, "watch-selector", "Selector to watch for resources.")
58+
flags.StringVar(&config.PriorityClassName, "priority-class-name", "", "ETCD cluster pods priorityClassName")
59+
5660
flags.StringVar(&config.MetricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
5761
flags.StringVar(&config.HealthProbeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
5862
flags.BoolVar(&config.LeaderElection, "leader-elect", false, "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.")
59-
flags.DurationVar(&config.BackupRetention, "backup-retention", 7*24*time.Hour, "How long to retain a backup objects.")
6063

6164
stdFlags := flag.NewFlagSet("etcd-operator", flag.ContinueOnError)
6265
zapOptions := &zap.Options{}

cmd/etcd-operator/main.go

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import (
3939

4040
"github.com/agoda-com/etcd-operator/pkg/cluster"
4141
"github.com/agoda-com/etcd-operator/pkg/etcd"
42+
"github.com/agoda-com/etcd-operator/pkg/metrics"
4243
)
4344

4445
func main() {
@@ -97,14 +98,25 @@ func run(ctx context.Context, logger logr.Logger, kubeconfig *rest.Config, confi
9798
return fmt.Errorf("tls cache: %w", err)
9899
}
99100

100-
err = cluster.CreateControllerWithManager(mgr, tlsCache, cluster.Config{
101-
Image: config.Image,
102-
ControllerImage: config.ControllerImage,
101+
err = cluster.SetupWithManager(mgr, tlsCache, cluster.Config{
102+
Image: config.Image,
103+
ControllerImage: config.ControllerImage,
104+
PriorityClassName: config.PriorityClassName,
105+
BackupEnv: config.BackupEnv,
103106
})
104107
if err != nil {
105108
return fmt.Errorf("cluster controller: %w", err)
106109
}
107110

111+
meterProvider, err := SetupTelemetry(ctx)
112+
if err != nil {
113+
return fmt.Errorf("metrics provider: %w", err)
114+
}
115+
err = metrics.SetupWithManager(mgr, meterProvider)
116+
if err != nil {
117+
return fmt.Errorf("metrics controller: %w", err)
118+
}
119+
108120
err = mgr.AddHealthzCheck("ping", healthz.Ping)
109121
if err != nil {
110122
return err

cmd/etcd-operator/telemetry.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package main
2+
3+
import (
4+
"context"
5+
"os"
6+
"time"
7+
8+
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
9+
"go.opentelemetry.io/otel/metric"
10+
"go.opentelemetry.io/otel/metric/noop"
11+
metricsdk "go.opentelemetry.io/otel/sdk/metric"
12+
"sigs.k8s.io/controller-runtime/pkg/log"
13+
)
14+
15+
func SetupTelemetry(ctx context.Context) (metric.MeterProvider, error) {
16+
logger := log.FromContext(ctx).WithName("metrics")
17+
18+
endpoint := os.Getenv("OTEL_EXPORTER_OLTP_ENDPOINT")
19+
if endpoint != "" {
20+
exporter, err := otlpmetricgrpc.New(ctx, otlpmetricgrpc.WithEndpoint(endpoint))
21+
if err != nil {
22+
return nil, err
23+
}
24+
25+
provider := metricsdk.NewMeterProvider(
26+
metricsdk.WithReader(metricsdk.NewPeriodicReader(exporter)),
27+
)
28+
29+
logger.Info("enabled otlp grpc metrics", "endpoint", endpoint)
30+
31+
go func() {
32+
<-ctx.Done()
33+
34+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
35+
defer cancel()
36+
37+
err := provider.Shutdown(ctx)
38+
if err != nil {
39+
logger.Error(err, "shutdown metrics")
40+
}
41+
}()
42+
43+
return provider, nil
44+
}
45+
46+
// fallback on noop provider if endpoint is not configured
47+
return noop.NewMeterProvider(), nil
48+
}

cmd/go.mod

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ require (
4343
github.com/aws/smithy-go v1.22.2 // indirect
4444
github.com/beorn7/perks v1.0.1 // indirect
4545
github.com/blang/semver/v4 v4.0.0 // indirect
46+
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
4647
github.com/cespare/xxhash/v2 v2.3.0 // indirect
4748
github.com/coreos/go-semver v0.3.1 // indirect
4849
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
@@ -68,6 +69,7 @@ require (
6869
github.com/google/go-cmp v0.7.0 // indirect
6970
github.com/google/gofuzz v1.2.0 // indirect
7071
github.com/google/uuid v1.6.0 // indirect
72+
github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.1 // indirect
7173
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
7274
github.com/imdario/mergo v0.3.16 // indirect
7375
github.com/inconshreveable/mousetrap v1.1.0 // indirect
@@ -97,9 +99,12 @@ require (
9799
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
98100
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.53.0 // indirect
99101
go.opentelemetry.io/otel v1.35.0 // indirect
102+
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.35.0 // indirect
100103
go.opentelemetry.io/otel/metric v1.35.0 // indirect
101104
go.opentelemetry.io/otel/sdk v1.35.0 // indirect
105+
go.opentelemetry.io/otel/sdk/metric v1.35.0 // indirect
102106
go.opentelemetry.io/otel/trace v1.35.0 // indirect
107+
go.opentelemetry.io/proto/otlp v1.5.0 // indirect
103108
go.uber.org/multierr v1.11.0 // indirect
104109
golang.org/x/crypto v0.36.0 // indirect
105110
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f // indirect

cmd/go.sum

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
4444
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
4545
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
4646
github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
47+
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
48+
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
4749
github.com/cert-manager/cert-manager v1.15.0 h1:xVL8tzdQECMypoYQa9rv4DLjkn2pJXJLTqH4JUsxfko=
4850
github.com/cert-manager/cert-manager v1.15.0/go.mod h1:Vxq6yNKAbgQeMtzu5gqU8n0vXDiZcGTa5LDyCJRbmXE=
4951
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
@@ -116,6 +118,9 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
116118
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
117119
github.com/gorilla/securecookie v1.1.1/go.mod h1:ra0sb63/xPlUeL+yeDciTfxMRAA+MP+HVt/4epWDjd4=
118120
github.com/gorilla/sessions v1.2.1/go.mod h1:dk2InVEVJ0sfLlnXv9EAgkf6ecYs/i80K/zI+bUmuGM=
121+
github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4M0+kPpLofRdBo=
122+
github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.1 h1:e9Rjr40Z98/clHv5Yg79Is0NtosR5LXRvdr7o/6NwbA=
123+
github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.1/go.mod h1:tIxuGz/9mpox++sgp9fJjHO0+q1X9/UOWd798aAm22M=
119124
github.com/hashicorp/go-uuid v1.0.2/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
120125
github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/Co8=
121126
github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
@@ -229,6 +234,8 @@ go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.5
229234
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.53.0/go.mod h1:azvtTADFQJA8mX80jIH/akaE7h+dbm/sVuaHqN13w74=
230235
go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ=
231236
go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y=
237+
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.35.0 h1:QcFwRrZLc82r8wODjvyCbP7Ifp3UANaBSmhDSFjnqSc=
238+
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.35.0/go.mod h1:CXIWhUomyWBG/oY2/r/kLp6K/cmx9e/7DLpBuuGdLCA=
232239
go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M=
233240
go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE=
234241
go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY=
@@ -237,6 +244,8 @@ go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5J
237244
go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w=
238245
go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs=
239246
go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc=
247+
go.opentelemetry.io/proto/otlp v1.5.0 h1:xJvq7gMzB31/d406fB8U5CBdyQGw4P399D1aQWU/3i4=
248+
go.opentelemetry.io/proto/otlp v1.5.0/go.mod h1:keN8WnHxOy8PG0rQZjJJ5A2ebUoafqWp0eVQ4yIXvJ4=
240249
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
241250
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
242251
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=

e2e/cluster_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,13 @@ func TestCluster(t *testing.T) {
2626
})
2727

2828
t.Run("scale", func(t *testing.T) {
29-
scale(t, kcl, cluster, 5)
30-
poll(t, kcl, cluster, 2*time.Minute, available)
29+
Scale(t, kcl, cluster, 5)
30+
Poll(t, kcl, cluster, 2*time.Minute, Available)
3131

3232
t.Logf("cluster %q scaled up to 5 members", cluster.Name)
3333

34-
scale(t, kcl, cluster, 3)
35-
poll(t, kcl, cluster, time.Minute, available)
34+
Scale(t, kcl, cluster, 3)
35+
Poll(t, kcl, cluster, time.Minute, Available)
3636

3737
t.Logf("cluster %q scaled down to 3 members", cluster.Name)
3838
})
@@ -60,8 +60,8 @@ func TestCluster(t *testing.T) {
6060
}
6161
t.Log("evict pod", key)
6262

63-
poll(t, kcl, cluster, 2*time.Minute, func(cluster *apiv1.EtcdCluster) bool {
64-
if !available(cluster) {
63+
Poll(t, kcl, cluster, 2*time.Minute, func(cluster *apiv1.EtcdCluster) bool {
64+
if !Available(cluster) {
6565
return false
6666
}
6767

@@ -85,7 +85,7 @@ func TestCluster(t *testing.T) {
8585
}
8686

8787
version = strings.TrimPrefix(version, "v")
88-
poll(t, kcl, cluster, 5*time.Minute, func(cluster *apiv1.EtcdCluster) bool {
88+
Poll(t, kcl, cluster, 5*time.Minute, func(cluster *apiv1.EtcdCluster) bool {
8989
if !conditions.StatusTrue(cluster.Status.Conditions, apiv1.ClusterAvailable) {
9090
return false
9191
}

e2e/kube_test.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
autoscalingv1 "k8s.io/api/autoscaling/v1"
1717
batchv1 "k8s.io/api/batch/v1"
1818
corev1 "k8s.io/api/core/v1"
19+
apierrors "k8s.io/apimachinery/pkg/api/errors"
1920
"k8s.io/apimachinery/pkg/api/resource"
2021
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2122
"k8s.io/apimachinery/pkg/runtime"
@@ -147,14 +148,14 @@ func createCluster(t testing.TB, kcl client.Client, timeout time.Duration, spec
147148
key := client.ObjectKeyFromObject(cluster)
148149
t.Logf("cluster %q created", key)
149150

150-
poll(t, kcl, cluster, timeout, available)
151+
Poll(t, kcl, cluster, timeout, Available)
151152

152153
t.Logf("cluster %q available", key)
153154

154155
return cluster
155156
}
156157

157-
func poll[T client.Object](t testing.TB, kcl client.Client, obj T, timeout time.Duration, f func(obj T) bool) {
158+
func Poll[T client.Object](t testing.TB, kcl client.Client, obj T, timeout time.Duration, f func(obj T) bool) {
158159
t.Helper()
159160

160161
ctx, cancel := context.WithTimeout(t.Context(), timeout)
@@ -173,11 +174,11 @@ func poll[T client.Object](t testing.TB, kcl client.Client, obj T, timeout time.
173174
}
174175
}
175176

176-
func available(cluster *apiv1.EtcdCluster) bool {
177+
func Available(cluster *apiv1.EtcdCluster) bool {
177178
return cluster.Status.AvailableReplicas == cluster.Spec.Replicas
178179
}
179180

180-
func scale(t testing.TB, kcl client.Client, obj client.Object, replicas int32) {
181+
func Scale(t testing.TB, kcl client.Client, obj client.Object, replicas int32) {
181182
scale := &autoscalingv1.Scale{}
182183
err := kcl.SubResource("scale").Get(t.Context(), obj, scale)
183184
if err != nil {
@@ -226,7 +227,11 @@ func triggerCronJob(t testing.TB, kcl client.Client, key client.ObjectKey, timeo
226227
t.Logf("created job %q", client.ObjectKeyFromObject(job))
227228

228229
err = wait.PollUntilContextCancel(ctx, 500*time.Millisecond, true, func(ctx context.Context) (done bool, err error) {
229-
if err := kcl.Get(ctx, client.ObjectKeyFromObject(job), job); err != nil {
230+
err = kcl.Get(ctx, client.ObjectKeyFromObject(job), job)
231+
switch {
232+
case apierrors.IsTooManyRequests(err):
233+
return false, err
234+
case err != nil:
230235
return true, fmt.Errorf("get backup job: %w", err)
231236
}
232237

0 commit comments

Comments
 (0)