diff --git a/artifacts/flagger/crd.yaml b/artifacts/flagger/crd.yaml index e79d9c4b8..a794802cc 100644 --- a/artifacts/flagger/crd.yaml +++ b/artifacts/flagger/crd.yaml @@ -1344,6 +1344,7 @@ spec: - prometheus - influxdb - datadog + - externalmetrics - stackdriver - cloudwatch - newrelic diff --git a/charts/flagger/crds/crd.yaml b/charts/flagger/crds/crd.yaml index e79d9c4b8..a794802cc 100644 --- a/charts/flagger/crds/crd.yaml +++ b/charts/flagger/crds/crd.yaml @@ -1344,6 +1344,7 @@ spec: - prometheus - influxdb - datadog + - externalmetrics - stackdriver - cloudwatch - newrelic diff --git a/charts/flagger/templates/rbac.yaml b/charts/flagger/templates/rbac.yaml index 8e833c6cd..d2aa69849 100644 --- a/charts/flagger/templates/rbac.yaml +++ b/charts/flagger/templates/rbac.yaml @@ -289,6 +289,14 @@ rules: - revisions verbs: - get + - apiGroups: + - external.metrics.k8s.io + resources: + - '*' + verbs: + - get + - watch + - list --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/docs/gitbook/usage/metrics.md b/docs/gitbook/usage/metrics.md index 58b243ef0..3a1d17506 100644 --- a/docs/gitbook/usage/metrics.md +++ b/docs/gitbook/usage/metrics.md @@ -326,6 +326,22 @@ Reference the template in the canary analysis: interval: 1m ``` +### Datadog Rate Limits + +For bigger setups, you might run into rate limits on the Datadog API. To avoid this, +you can use the Datadog Cluster Agent to retrieve metrics in batches instead. It will then +expose these metrics as an external metrics server. + +See [Datadog Documentation](https://docs.datadoghq.com/containers/guide/cluster_agent_autoscaling_metrics). + +Once you have enabled Datadog's external metrics endpoint and `DatadogMetric` CRD (without +necessarily using `registerAPIService`), you can use Flagger's +[External Metrics Provider](#kubernetes-external-metrics) to query the metrics from there. + +The server address is usually `datadog-cluster-agent-metrics-server` and exposed on port 8443. 
+ExternalMetrics will be named as `datadogmetric@<namespace>:<name>`, for example +`datadogmetric@istio-system:istio-mesh-request-count`. + ## Amazon CloudWatch You can create custom metric checks using the CloudWatch metrics provider. @@ -781,3 +797,37 @@ Reference the template in the canary analysis: max: 99 interval: 1m ``` + +## Kubernetes External Metrics + +You can query an external metrics provider that implements the +[Kubernetes External Metrics API](https://kubernetes.io/docs/reference/external-api/external-metrics.v1beta1/). + +By default, Flagger will use its bound Service Account for authentication. *Optionally* you can provide a Bearer token through a Secret (that must contain a field named `token`): + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: external-metric-server-token + namespace: default +data: + token: your-access-token +``` + +External Metrics template example: + +```yaml +apiVersion: flagger.app/v1beta1 +kind: MetricTemplate +metadata: + name: my-external-metric + namespace: default +spec: + provider: + type: externalmetrics + address: https://external-metrics-server.default.svc.cluster.local:8443 + secretRef: # Optional + name: external-metric-server-token + query: webapp-frontend/job-success-rate?labelSelector=env%3Dproduction +``` \ No newline at end of file diff --git a/go.mod b/go.mod index 56bf27a79..ac47c7cd8 100644 --- a/go.mod +++ b/go.mod @@ -24,11 +24,13 @@ require ( google.golang.org/grpc v1.76.0 google.golang.org/protobuf v1.36.10 gopkg.in/h2non/gock.v1 v1.1.2 - k8s.io/api v0.34.1 - k8s.io/apimachinery v0.34.1 - k8s.io/client-go v0.34.1 - k8s.io/code-generator v0.34.1 + gopkg.in/inf.v0 v0.9.1 + k8s.io/api v0.34.2 + k8s.io/apimachinery v0.34.2 + k8s.io/client-go v0.34.2 + k8s.io/code-generator v0.34.2 k8s.io/klog/v2 v2.130.1 + k8s.io/metrics v0.34.2 k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 knative.dev/serving v0.46.6 ) @@ -96,7 +98,6 @@ require ( google.golang.org/genproto/googleapis/api v0.0.0-20250804133106-a7a43d27e69b //
indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251002232023-7c0ddcbb5797 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect - gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/gengo/v2 v2.0.0-20250604051438-85fd79dbfd9f // indirect k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect diff --git a/go.sum b/go.sum index 1f02c0eef..ac6ad3cfd 100644 --- a/go.sum +++ b/go.sum @@ -275,20 +275,22 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM= -k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk= -k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4= -k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= -k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY= -k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8= -k8s.io/code-generator v0.34.1 h1:WpphT26E+j7tEgIUfFr5WfbJrktCGzB3JoJH9149xYc= -k8s.io/code-generator v0.34.1/go.mod h1:DeWjekbDnJWRwpw3s0Jat87c+e0TgkxoR4ar608yqvg= +k8s.io/api v0.34.2 h1:fsSUNZhV+bnL6Aqrp6O7lMTy6o5x2C4XLjnh//8SLYY= +k8s.io/api v0.34.2/go.mod h1:MMBPaWlED2a8w4RSeanD76f7opUoypY8TFYkSM+3XHw= +k8s.io/apimachinery v0.34.2 h1:zQ12Uk3eMHPxrsbUJgNF8bTauTVR2WgqJsTmwTE/NW4= +k8s.io/apimachinery v0.34.2/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= +k8s.io/client-go v0.34.2 h1:Co6XiknN+uUZqiddlfAjT68184/37PS4QAzYvQvDR8M= +k8s.io/client-go v0.34.2/go.mod h1:2VYDl1XXJsdcAxw7BenFslRQX28Dxz91U9MWKjX97fE= +k8s.io/code-generator v0.34.2 h1:9bG6jTxmsU3HXE5BNYJTC8AZ1D6hVVfkm8yYSkdkGY0= +k8s.io/code-generator v0.34.2/go.mod 
h1:dnDDEd6S/z4uZ+PG1aE58ySCi/lR4+qT3a4DddE4/2I= k8s.io/gengo/v2 v2.0.0-20250604051438-85fd79dbfd9f h1:SLb+kxmzfA87x4E4brQzB33VBbT2+x7Zq9ROIHmGn9Q= k8s.io/gengo/v2 v2.0.0-20250604051438-85fd79dbfd9f/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= +k8s.io/metrics v0.34.2 h1:zao91FNDVPRGIiHLO2vqqe21zZVPien1goyzn0hsz90= +k8s.io/metrics v0.34.2/go.mod h1:Ydulln+8uZZctUM8yrUQX4rfq/Ay6UzsuXf24QJ37Vc= k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= knative.dev/networking v0.0.0-20250902160145-7dad473f6351 h1:Gv/UqbN0AK+ORoT5e2Kg+3+uMW/y9CCdhpXKxYaVV6k= diff --git a/kustomize/base/flagger/crd.yaml b/kustomize/base/flagger/crd.yaml index e79d9c4b8..a794802cc 100644 --- a/kustomize/base/flagger/crd.yaml +++ b/kustomize/base/flagger/crd.yaml @@ -1344,6 +1344,7 @@ spec: - prometheus - influxdb - datadog + - externalmetrics - stackdriver - cloudwatch - newrelic diff --git a/pkg/metrics/providers/externalmetrics.go b/pkg/metrics/providers/externalmetrics.go new file mode 100644 index 000000000..e1a5e7360 --- /dev/null +++ b/pkg/metrics/providers/externalmetrics.go @@ -0,0 +1,162 @@ +/* +Copyright 2020 The Flux authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package providers + +import ( + "fmt" + "net/url" + "strings" + "time" + + flaggerv1 "github.com/fluxcd/flagger/pkg/apis/flagger/v1beta1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/rest" + externalmetrics_client "k8s.io/metrics/pkg/client/external_metrics" +) + +// ExternalMetricsProvider fetches metrics from an ExternalMetricsProvider. +type ExternalMetricsProvider struct { + timeout time.Duration + client externalmetrics_client.NamespacedMetricsGetter +} + +// NewExternalMetricsProvider takes a canary spec, a provider spec, and +// returns a client ready to execute queries against the Service. +func NewExternalMetricsProvider( + provider flaggerv1.MetricTemplateProvider, + credentials map[string][]byte) (*ExternalMetricsProvider, error) { + return newExternalMetricsProviderWithBuilder( + provider, credentials, rest.InClusterConfig, + ) +} + +// newExternalMetricsProviderWithBuilder is like NewExternalMetricsProvider but +// accepts a rest.Config builder function. 
Used for testing as InClusterConfig is hard to mock +func newExternalMetricsProviderWithBuilder( + provider flaggerv1.MetricTemplateProvider, + credentials map[string][]byte, + configBuilder func() (*rest.Config, error), +) (*ExternalMetricsProvider, error) { + restConfig, err := configBuilder() + if err != nil || restConfig == nil { + return nil, fmt.Errorf("Not in a kubernetes cluster: %w", err) + } + + // Handling overrides from MetricTemplateProvider + if provider.Address != "" { + restConfig.Host = provider.Address + } + restConfig.TLSClientConfig = rest.TLSClientConfig{ + Insecure: provider.InsecureSkipVerify, + } + if tokenBytes, ok := credentials["token"]; ok { + restConfig.BearerToken = string(tokenBytes) + } + // TODO: handle user name/password auth if needed + + client, err := externalmetrics_client.NewForConfig(restConfig) + if err != nil { + return nil, fmt.Errorf("error creating external metric client: %w", err) + } + + return &ExternalMetricsProvider{ + timeout: 5 * time.Second, + client: client, + }, nil +} + +// RunQuery retrieves the ExternalMetricValue from the External Metrics API +// at the ExternalMetricsProvider's Address, using the provided query string, +// and returns the *first* result as a float64. 
+func (p *ExternalMetricsProvider) RunQuery(query string) (float64, error) { + // The Provider interface only allows a plain string query so decode it + namespace, metricName, selector, err := parseExternalMetricsQuery(query) + if err != nil { + return 0, fmt.Errorf("error parsing metric query: %w", err) + } + + nm := p.client.NamespacedMetrics(namespace) + metricsList, err := nm.List(metricName, selector) + if err != nil { + return 0, fmt.Errorf("error querying external metrics API: %w", err) + } + + if len(metricsList.Items) < 1 { + return 0, fmt.Errorf("no external metrics found: %w", ErrNoValuesFound) + } + + vs := metricsList.Items[0].Value.AsApproximateFloat64() + + return vs, nil +} + +// IsOnline tests that the External Metrics API is reachable by looking for dummy metrics. +// If we don't get a network error, we assume the service is online. +func (p *ExternalMetricsProvider) IsOnline() (bool, error) { + nm := p.client.NamespacedMetrics("kube-system") + _, err := nm.List("dummy-metric", labels.Everything()) + + if err != nil { + return false, fmt.Errorf("external metrics service unavailable: %w", err) + } + return true, nil +} + +// parseExternalMetricsQuery parses a query string in the format: +// /?labelSelector= +// where only the metricName is required. +// and returns the namespace, metricName, and labelSelector separately. 
+func parseExternalMetricsQuery(query string) (namespace string, metricName string, labelSelector labels.Selector, err error) { + u, err := url.Parse("dummy:///" + query) + if err != nil { + return "", "", labels.Everything(), fmt.Errorf("malformed query string, expected /?labelSelector=, got %s", query) + } + path := strings.TrimPrefix(u.Path, "/") + parts := strings.Split(path, "/") + if len(parts) > 2 { + return "", "", labels.Everything(), fmt.Errorf("malformed query string, too many slashes, expected /?labelSelector=, got %s", query) + } + + namespace = "default" + switch len(parts) { + case 1: + // Format: "metric" + metricName = parts[0] + case 2: + // Format: "namespace/metric" or "/metric" + if parts[0] != "" { + namespace = parts[0] + } + metricName = parts[1] + } + if metricName == "" { + return "", "", labels.Everything(), fmt.Errorf("metric name cannot be empty") + } + + qp := u.Query() + rawSelector := qp.Get("labelSelector") + if rawSelector == "" { + labelSelector = labels.Everything() + } else { + labelSelector, err = labels.Parse(rawSelector) + if err != nil { + return "", "", labels.Everything(), fmt.Errorf("error parsing label selector from string %s: %w", rawSelector, err) + } + } + + return namespace, metricName, labelSelector, nil +} diff --git a/pkg/metrics/providers/externalmetrics_test.go b/pkg/metrics/providers/externalmetrics_test.go new file mode 100644 index 000000000..4f24de5e3 --- /dev/null +++ b/pkg/metrics/providers/externalmetrics_test.go @@ -0,0 +1,229 @@ +/* +Copyright 2020 The Flux authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package providers + +import ( + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "gopkg.in/inf.v0" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + + flaggerv1 "github.com/fluxcd/flagger/pkg/apis/flagger/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8stesting "k8s.io/client-go/testing" + emv1beta1 "k8s.io/metrics/pkg/apis/external_metrics/v1beta1" + fakeemc "k8s.io/metrics/pkg/client/external_metrics/fake" +) + +const ( + testMetricName = "myMetric" + testMetricNamespace = "default" + testMetricServerAddress = "https://external-metrics.default.svc.cluster.local" + testQuery = "default/myMetric?labelSelector=label1%3Dvalue1" +) + +var ( + testMetricLabels = [...]string{"label1"} + testMetricLabelsValues = [...]string{"value1"} + // 11111e-4 = 1.1111 + testMetricValue = resource.NewDecimalQuantity(*inf.NewDec(11111, 4), resource.DecimalSI) +) + +func TestExternalMetrics_NewProvider(t *testing.T) { + tests := []struct { + name string + Address string + InsecureSkipVerify bool + creds map[string][]byte + builderFunc func() (*rest.Config, error) + wantErr bool + }{ + { + name: "Custom provider address and token", + Address: testMetricServerAddress, + InsecureSkipVerify: false, + creds: map[string][]byte{ + "token": []byte("test-token"), + }, + builderFunc: func() (*rest.Config, error) { return &rest.Config{}, nil }, + wantErr: false, + }, + { + name: "In cluster, automatic address and token", + Address: "", + InsecureSkipVerify: true, + creds: map[string][]byte{}, + builderFunc: func() (*rest.Config, error) { + return &rest.Config{ + Host: "https://kubernetes.default.svc", + BearerToken: "fake-token", + TLSClientConfig: rest.TLSClientConfig{Insecure: true}, + }, nil + }, + wantErr: false, + }, + } 
+ + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mtp := flaggerv1.MetricTemplateProvider{ + Address: tt.Address, + InsecureSkipVerify: tt.InsecureSkipVerify, + } + emp, err := newExternalMetricsProviderWithBuilder(mtp, tt.creds, tt.builderFunc) + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + require.NotNil(t, emp) + assert.Equal(t, 5*time.Second, emp.timeout) + }) + } +} + +func TestExternalMetrics_ParseQuery(t *testing.T) { + tests := []struct { + name string + query string + wantNamespace string + wantMetricName string + wantLabelSelector string + wantErr bool + }{ + { + name: "General case", + query: testQuery, + wantNamespace: testMetricNamespace, + wantMetricName: testMetricName, + wantLabelSelector: labels.Set{testMetricLabels[0]: testMetricLabelsValues[0]}.AsSelector().String(), + wantErr: false, + }, + { + name: "Still OK without labelSelector", + query: testQuery[:strings.Index(testQuery, "?")], + wantNamespace: testMetricNamespace, + wantMetricName: testMetricName, + wantLabelSelector: labels.Everything().String(), + wantErr: false, + }, + { + name: "No namespace uses default", + query: "/metric_only", + wantNamespace: "default", + wantMetricName: "metric_only", + wantLabelSelector: labels.Everything().String(), + wantErr: false, + }, + { + name: "Missing metric name - namespaceonly", + query: "namespaceonly/", + wantErr: true, + }, + { + name: "Missing metric name - slash only", + query: "/", + wantErr: true, + }, + { + name: "Missing metric name - empty", + query: "", + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotNamespace, gotMetricName, gotLabelSelector, err := parseExternalMetricsQuery(tt.query) + if tt.wantErr { + require.Error(t, err) + } else { + require.NoError(t, err) + assert.Equal(t, tt.wantNamespace, gotNamespace) + assert.Equal(t, tt.wantMetricName, gotMetricName) + assert.Equal(t, tt.wantLabelSelector, gotLabelSelector.String()) + 
} + }) + } +} + +func TestExternalMetrics_RunQuery(t *testing.T) { + fakeExternalMetricsClient := fakeemc.FakeExternalMetricsClient{} + fakeExternalMetricsClient.Fake.AddReactor("list", "*", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + return true, &emv1beta1.ExternalMetricValueList{ + Items: []emv1beta1.ExternalMetricValue{ + { + MetricName: testMetricName, + Value: *testMetricValue, + MetricLabels: map[string]string{ + testMetricLabels[0]: testMetricLabelsValues[0], + }, + Timestamp: metav1.Now(), + }, + }, + }, nil + }) + + emp := &ExternalMetricsProvider{ + timeout: 5 * time.Second, + client: &fakeExternalMetricsClient, + } + + tests := []struct { + name string + query string + }{ + { + name: "Full query with label selector", + query: testQuery, + }, + { + name: "Namespace and metric only", + query: "namespace/" + testMetricName, + }, + { + name: "Metric only, default namespace", + query: testMetricName, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + f, err := emp.RunQuery(tt.query) + require.NoError(t, err) + assert.Equal(t, testMetricValue.AsApproximateFloat64(), f) + }) + } +} + +func TestExternalMetrics_IsOnline(t *testing.T) { + emp := &ExternalMetricsProvider{ + timeout: 5 * time.Second, + client: &fakeemc.FakeExternalMetricsClient{}, + } + + online, err := emp.IsOnline() + require.NoError(t, err) + assert.True(t, online) +} diff --git a/pkg/metrics/providers/factory.go b/pkg/metrics/providers/factory.go index e49e44c56..556d1c5b3 100644 --- a/pkg/metrics/providers/factory.go +++ b/pkg/metrics/providers/factory.go @@ -29,6 +29,8 @@ func (factory Factory) Provider(metricInterval string, provider flaggerv1.Metric return NewPrometheusProvider(provider, credentials) case "datadog": return NewDatadogProvider(metricInterval, provider, credentials) + case "externalmetrics": + return NewExternalMetricsProvider(provider, credentials) case "cloudwatch": return NewCloudWatchProvider(metricInterval, 
provider) case "newrelic":