Skip to content

Commit b3cf3cf

Browse files
authored
Implemented TargetAllocator resource deployments. (#208)
1 parent 0dce7f2 commit b3cf3cf

File tree

80 files changed

+21429
-8362
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

80 files changed

+21429
-8362
lines changed

Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,10 @@ ARG AUTO_INSTRUMENTATION_DOTNET_VERSION
3030
ARG AUTO_INSTRUMENTATION_NODEJS_VERSION
3131
ARG DCMG_EXPORTER_VERSION
3232
ARG NEURON_MONITOR_VERSION
33+
ARG TARGET_ALLOCATOR_VERSION
3334

3435
# Build
35-
RUN CGO_ENABLED=0 GOOS=linux GO111MODULE=on go build -ldflags="-X ${VERSION_PKG}.version=${VERSION} -X ${VERSION_PKG}.buildDate=${VERSION_DATE} -X ${VERSION_PKG}.agent=${AGENT_VERSION} -X ${VERSION_PKG}.autoInstrumentationJava=${AUTO_INSTRUMENTATION_JAVA_VERSION} -X ${VERSION_PKG}.autoInstrumentationPython=${AUTO_INSTRUMENTATION_PYTHON_VERSION} -X ${VERSION_PKG}.autoInstrumentationDotNet=${AUTO_INSTRUMENTATION_DOTNET_VERSION} -X ${VERSION_PKG}.autoInstrumentationNodeJS=${AUTO_INSTRUMENTATION_NODEJS_VERSION} -X ${VERSION_PKG}.dcgmExporter=${DCMG_EXPORTER_VERSION} -X ${VERSION_PKG}.neuronMonitor=${NEURON_MONITOR_VERSION}" -a -o manager main.go
36+
RUN CGO_ENABLED=0 GOOS=linux GO111MODULE=on go build -ldflags="-X ${VERSION_PKG}.version=${VERSION} -X ${VERSION_PKG}.buildDate=${VERSION_DATE} -X ${VERSION_PKG}.agent=${AGENT_VERSION} -X ${VERSION_PKG}.autoInstrumentationJava=${AUTO_INSTRUMENTATION_JAVA_VERSION} -X ${VERSION_PKG}.autoInstrumentationPython=${AUTO_INSTRUMENTATION_PYTHON_VERSION} -X ${VERSION_PKG}.autoInstrumentationDotNet=${AUTO_INSTRUMENTATION_DOTNET_VERSION} -X ${VERSION_PKG}.autoInstrumentationNodeJS=${AUTO_INSTRUMENTATION_NODEJS_VERSION} -X ${VERSION_PKG}.dcgmExporter=${DCMG_EXPORTER_VERSION} -X ${VERSION_PKG}.neuronMonitor=${NEURON_MONITOR_VERSION} -X ${VERSION_PKG}.targetAllocator=${TARGET_ALLOCATOR_VERSION}" -a -o manager main.go
3637

3738
# Use distroless as minimal base image to package the manager binary
3839
# Refer to https://github.com/GoogleContainerTools/distroless for more details

Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ AUTO_INSTRUMENTATION_DOTNET_VERSION ?= "$(shell grep -v '\#' versions.txt | grep
99
AUTO_INSTRUMENTATION_NODEJS_VERSION ?= "$(shell grep -v '\#' versions.txt | grep aws-otel-nodejs-instrumentation | awk -F= '{print $$2}')"
1010
DCGM_EXPORTER_VERSION ?= "$(shell grep -v '\#' versions.txt | grep dcgm-exporter | awk -F= '{print $$2}')"
1111
NEURON_MONITOR_VERSION ?= "$(shell grep -v '\#' versions.txt | grep neuron-monitor | awk -F= '{print $$2}')"
12-
TARGET_ALLOCATOR_VERSION ?= $(shell grep -v '\#' versions.txt | grep target-allocator | awk -F= '{print $$2}')
12+
TARGET_ALLOCATOR_VERSION ?= "$(shell grep -v '\#' versions.txt | grep target-allocator | awk -F= '{print $$2}')"
13+
1314
# Image URL to use all building/pushing image targets
1415
IMG_PREFIX ?= aws
1516
IMG_REPO ?= cloudwatch-agent-operator
@@ -162,7 +163,7 @@ generate: controller-gen api-docs
162163
# buildx is used to ensure same results for arm based systems (m1/2 chips)
163164
.PHONY: container
164165
container:
165-
docker buildx build --load --platform linux/${ARCH} -t ${IMG} --build-arg VERSION_PKG=${VERSION_PKG} --build-arg VERSION=${VERSION} --build-arg VERSION_DATE=${VERSION_DATE} --build-arg AGENT_VERSION=${AGENT_VERSION} --build-arg AUTO_INSTRUMENTATION_JAVA_VERSION=${AUTO_INSTRUMENTATION_JAVA_VERSION} --build-arg AUTO_INSTRUMENTATION_PYTHON_VERSION=${AUTO_INSTRUMENTATION_PYTHON_VERSION} --build-arg AUTO_INSTRUMENTATION_DOTNET_VERSION=${AUTO_INSTRUMENTATION_DOTNET_VERSION} --build-arg AUTO_INSTRUMENTATION_NODEJS_VERSION=${AUTO_INSTRUMENTATION_NODEJS_VERSION} --build-arg DCGM_EXPORTER_VERSION=${DCGM_EXPORTER_VERSION} --build-arg NEURON_MONITOR_VERSION=${NEURON_MONITOR_VERSION} .
166+
docker buildx build --load --platform linux/${ARCH} -t ${IMG} --build-arg VERSION_PKG=${VERSION_PKG} --build-arg VERSION=${VERSION} --build-arg VERSION_DATE=${VERSION_DATE} --build-arg AGENT_VERSION=${AGENT_VERSION} --build-arg AUTO_INSTRUMENTATION_JAVA_VERSION=${AUTO_INSTRUMENTATION_JAVA_VERSION} --build-arg AUTO_INSTRUMENTATION_PYTHON_VERSION=${AUTO_INSTRUMENTATION_PYTHON_VERSION} --build-arg AUTO_INSTRUMENTATION_DOTNET_VERSION=${AUTO_INSTRUMENTATION_DOTNET_VERSION} --build-arg AUTO_INSTRUMENTATION_NODEJS_VERSION=${AUTO_INSTRUMENTATION_NODEJS_VERSION} --build-arg DCGM_EXPORTER_VERSION=${DCGM_EXPORTER_VERSION} --build-arg NEURON_MONITOR_VERSION=${NEURON_MONITOR_VERSION} --build-arg TARGET_ALLOCATOR_VERSION=${TARGET_ALLOCATOR_VERSION} .
166167

167168
# Push the container image, used only for local dev purposes
168169
.PHONY: container-push
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package v1alpha1
5+
6+
type (
7+
// AmazonCloudWatchAgentTargetAllocatorAllocationStrategy represents the strategy used to distribute targets among the collectors.
8+
// +kubebuilder:validation:Enum=consistent-hashing
9+
AmazonCloudWatchAgentTargetAllocatorAllocationStrategy string
10+
)
11+
12+
const (
13+
// AmazonCloudWatchAgentTargetAllocatorAllocationStrategyConsistentHashing selects the consistent-hashing strategy: targets are consistently assigned to collectors, which allows a high-availability setup.
14+
AmazonCloudWatchAgentTargetAllocatorAllocationStrategyConsistentHashing AmazonCloudWatchAgentTargetAllocatorAllocationStrategy = "consistent-hashing"
15+
)

apis/v1alpha1/amazoncloudwatchagent_types.go

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,9 @@ type AmazonCloudWatchAgentSpec struct {
143143
// Collector and Target Allocator pods.
144144
// +optional
145145
PodAnnotations map[string]string `json:"podAnnotations,omitempty"`
146+
// TargetAllocator holds the target allocator settings, including whether a target allocation resource is spawned or not.
147+
// +optional
148+
TargetAllocator AmazonCloudWatchAgentTargetAllocator `json:"targetAllocator,omitempty"`
146149
// Mode represents how the collector should be deployed (deployment, daemonset, statefulset or sidecar)
147150
// +optional
148151
Mode Mode `json:"mode,omitempty"`
@@ -164,6 +167,9 @@ type AmazonCloudWatchAgentSpec struct {
164167
// ImagePullPolicy indicates the pull policy to be used for retrieving the container image (Always, Never, IfNotPresent)
165168
// +optional
166169
ImagePullPolicy v1.PullPolicy `json:"imagePullPolicy,omitempty"`
170+
// Prometheus is the raw YAML to be used as the collector's Prometheus configuration.
171+
// +optional
172+
Prometheus PrometheusConfig `json:"prometheus,omitempty"`
167173
// Config is the raw JSON to be used as the collector's configuration. Refer to the OpenTelemetry Collector documentation for details.
168174
// +required
169175
Config string `json:"config,omitempty"`
@@ -273,6 +279,87 @@ type AmazonCloudWatchAgentSpec struct {
273279
UpdateStrategy appsv1.DaemonSetUpdateStrategy `json:"updateStrategy,omitempty"`
274280
}
275281

282+
// AmazonCloudWatchAgentTargetAllocator defines the configuration for the Prometheus target allocator.
283+
type AmazonCloudWatchAgentTargetAllocator struct {
284+
// Replicas is the number of pod instances for the underlying TargetAllocator. This should only be set to a value
285+
// other than 1 if a strategy that allows for high availability is chosen. Currently, the only allocation strategy
286+
// that can be run in a high availability mode is consistent-hashing.
287+
// +optional
288+
Replicas *int32 `json:"replicas,omitempty"`
289+
// NodeSelector is used to schedule the TargetAllocator pods.
290+
// +optional
291+
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
292+
// Resources to set on the TargetAllocator containers.
293+
// +optional
294+
Resources v1.ResourceRequirements `json:"resources,omitempty"`
295+
// AllocationStrategy determines which strategy the target allocator should use for allocation.
296+
// The only current option is consistent-hashing.
297+
// +optional
298+
AllocationStrategy AmazonCloudWatchAgentTargetAllocatorAllocationStrategy `json:"allocationStrategy,omitempty"`
299+
// FilterStrategy determines how to filter targets before allocating them among the collectors.
300+
// The only current option is relabel-config (drops targets based on prom relabel_config).
301+
// Filtering is disabled by default.
302+
// +optional
303+
FilterStrategy string `json:"filterStrategy,omitempty"`
304+
// ServiceAccount indicates the name of an existing service account to use with this instance. When set,
305+
// the operator will not automatically create a ServiceAccount for the TargetAllocator.
306+
// +optional
307+
ServiceAccount string `json:"serviceAccount,omitempty"`
308+
// Image indicates the container image to use for the TargetAllocator.
309+
// +optional
310+
Image string `json:"image,omitempty"`
311+
// Enabled indicates whether to use a target allocation mechanism for Prometheus targets or not.
312+
// +optional
313+
Enabled bool `json:"enabled,omitempty"`
314+
// Affinity, if specified, indicates the pod's scheduling constraints.
315+
// +optional
316+
Affinity *v1.Affinity `json:"affinity,omitempty"`
317+
// PrometheusCR defines the configuration for the retrieval of PrometheusOperator CRDs ( servicemonitor.monitoring.coreos.com/v1 and podmonitor.monitoring.coreos.com/v1 ).
318+
// All CR instances which the ServiceAccount has access to will be retrieved. This includes other namespaces.
319+
// +optional
320+
PrometheusCR AmazonCloudWatchAgentTargetAllocatorPrometheusCR `json:"prometheusCR,omitempty"`
321+
// SecurityContext configures the pod-level security context (v1.PodSecurityContext) for
322+
// the target-allocator.
323+
// +optional
324+
SecurityContext *v1.PodSecurityContext `json:"securityContext,omitempty"`
325+
// TopologySpreadConstraints embedded kubernetes pod configuration option,
326+
// controls how pods are spread across your cluster among failure-domains
327+
// such as regions, zones, nodes, and other user-defined topology domains
328+
// https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/
329+
// +optional
330+
TopologySpreadConstraints []v1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`
331+
// Tolerations embedded kubernetes pod configuration option,
332+
// controls how pods can be scheduled with matching taints
333+
// +optional
334+
Tolerations []v1.Toleration `json:"tolerations,omitempty"`
335+
// Env lists the environment variables to set on the TargetAllocator's Pods. These can then in certain cases be
336+
// consumed in the config file for the TargetAllocator.
337+
// +optional
338+
Env []v1.EnvVar `json:"env,omitempty"`
339+
}
340+
341+
// AmazonCloudWatchAgentTargetAllocatorPrometheusCR configures how the target allocator uses PrometheusOperator
// custom resources (PodMonitors and ServiceMonitors) for target discovery.
type AmazonCloudWatchAgentTargetAllocatorPrometheusCR struct {
342+
// Enabled indicates whether to use PrometheusOperator custom resources as targets or not.
343+
// +optional
344+
Enabled bool `json:"enabled,omitempty"`
345+
// ScrapeInterval is the interval between consecutive scrapes. Equivalent to the same setting on the Prometheus CRD.
346+
//
347+
// Default: "30s"
348+
// +kubebuilder:default:="30s"
349+
// +kubebuilder:validation:Format:=duration
350+
ScrapeInterval *metav1.Duration `json:"scrapeInterval,omitempty"`
351+
// PodMonitorSelector selects the PodMonitors to be used for target discovery.
352+
// This is a map of {key,value} pairs. Each {key,value} in the map is going to exactly match a label in a
353+
// PodMonitor's meta labels. The requirements are ANDed.
354+
// +optional
355+
PodMonitorSelector map[string]string `json:"podMonitorSelector,omitempty"`
356+
// ServiceMonitorSelector selects the ServiceMonitors to be used for target discovery.
357+
// This is a map of {key,value} pairs. Each {key,value} in the map is going to exactly match a label in a
358+
// ServiceMonitor's meta labels. The requirements are ANDed.
359+
// +optional
360+
ServiceMonitorSelector map[string]string `json:"serviceMonitorSelector,omitempty"`
361+
}
362+
276363
// ScaleSubresourceStatus defines the observed state of the AmazonCloudWatchAgent's
277364
// scale subresource.
278365
type ScaleSubresourceStatus struct {

apis/v1alpha1/collector_webhook.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ import (
1616
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
1717

1818
"github.com/aws/amazon-cloudwatch-agent-operator/internal/config"
19+
"github.com/aws/amazon-cloudwatch-agent-operator/internal/manifests/collector/adapters"
20+
ta "github.com/aws/amazon-cloudwatch-agent-operator/internal/manifests/targetallocator/adapters"
21+
"github.com/aws/amazon-cloudwatch-agent-operator/pkg/featuregate"
1922
)
2023

2124
var (
@@ -87,6 +90,9 @@ func (c CollectorWebhook) defaulter(r *AmazonCloudWatchAgent) error {
8790
if r.Spec.Replicas == nil {
8891
r.Spec.Replicas = &one
8992
}
93+
if r.Spec.TargetAllocator.Enabled && r.Spec.TargetAllocator.Replicas == nil {
94+
r.Spec.TargetAllocator.Replicas = &one
95+
}
9096

9197
if r.Spec.MaxReplicas != nil || (r.Spec.Autoscaler != nil && r.Spec.Autoscaler.MaxReplicas != nil) {
9298
if r.Spec.Autoscaler == nil {
@@ -163,6 +169,32 @@ func (c CollectorWebhook) validate(r *AmazonCloudWatchAgent) (admission.Warnings
163169
return warnings, fmt.Errorf("the OpenTelemetry Collector mode is set to %s, which does not support the attribute 'AdditionalContainers'", r.Spec.Mode)
164170
}
165171

172+
// validate target allocation
173+
if r.Spec.TargetAllocator.Enabled && r.Spec.Mode != ModeStatefulSet {
174+
return warnings, fmt.Errorf("the OpenTelemetry Collector mode is set to %s, which does not support the target allocation deployment", r.Spec.Mode)
175+
}
176+
177+
// validate Prometheus config for target allocation
178+
if r.Spec.TargetAllocator.Enabled {
179+
promConfigYaml, err := r.Spec.Prometheus.Yaml()
180+
if err != nil {
181+
return warnings, fmt.Errorf("%s could not convert json to yaml", err)
182+
}
183+
184+
promCfg, err := adapters.ConfigFromString(promConfigYaml)
185+
if err != nil {
186+
return warnings, fmt.Errorf("the OpenTelemetry Spec Prometheus configuration is incorrect, %w", err)
187+
}
188+
err = ta.ValidatePromConfig(promCfg, r.Spec.TargetAllocator.Enabled, featuregate.EnableTargetAllocatorRewrite.IsEnabled())
189+
if err != nil {
190+
return warnings, fmt.Errorf("the OpenTelemetry Spec Prometheus configuration is incorrect, %w", err)
191+
}
192+
err = ta.ValidateTargetAllocatorConfig(r.Spec.TargetAllocator.PrometheusCR.Enabled, promCfg)
193+
if err != nil {
194+
return warnings, fmt.Errorf("the OpenTelemetry Spec Prometheus configuration is incorrect, %w", err)
195+
}
196+
}
197+
166198
// validator port config
167199
for _, p := range r.Spec.Ports {
168200
nameErrs := validation.IsValidPortName(p.Name)

apis/v1alpha1/collector_webhook_test.go

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import (
1111

1212
"github.com/go-logr/logr"
1313
"github.com/stretchr/testify/assert"
14+
"github.com/stretchr/testify/require"
15+
"gopkg.in/yaml.v2"
1416
appsv1 "k8s.io/api/apps/v1"
1517
autoscalingv2 "k8s.io/api/autoscaling/v2"
1618
v1 "k8s.io/api/core/v1"
@@ -273,6 +275,7 @@ func TestOTELColDefaultingWebhook(t *testing.T) {
273275
scheme: testScheme,
274276
cfg: config.New(
275277
config.WithCollectorImage("collector:v0.0.0"),
278+
config.WithTargetAllocatorImage("ta:v0.0.0"),
276279
),
277280
}
278281
ctx := context.Background()
@@ -283,6 +286,12 @@ func TestOTELColDefaultingWebhook(t *testing.T) {
283286
}
284287
}
285288

289+
// promCfgYaml is a minimal Prometheus configuration with a single scrape job (otel-collector, 10s interval).
// TestOTELColValidatingWebhook unmarshals it into a PrometheusConfig fixture.
var promCfgYaml = `config:
290+
scrape_configs:
291+
- job_name: otel-collector
292+
scrape_interval: 10s
293+
`
294+
286295
// TODO: a lot of these tests use .Spec.MaxReplicas and .Spec.MinReplicas. These fields are
287296
// deprecated and moved to .Spec.Autoscaler. Fine to use these fields to test that old CRD is
288297
// still supported but should eventually be updated.
@@ -294,6 +303,10 @@ func TestOTELColValidatingWebhook(t *testing.T) {
294303
three := int32(3)
295304
five := int32(5)
296305

306+
promCfg := PrometheusConfig{}
307+
err := yaml.Unmarshal([]byte(promCfgYaml), &promCfg)
308+
require.NoError(t, err)
309+
297310
tests := []struct { //nolint:govet
298311
name string
299312
otelcol AmazonCloudWatchAgent
@@ -313,21 +326,10 @@ func TestOTELColValidatingWebhook(t *testing.T) {
313326
Replicas: &three,
314327
MaxReplicas: &five,
315328
UpgradeStrategy: "adhoc",
316-
Config: `receivers:
317-
examplereceiver:
318-
endpoint: "0.0.0.0:12345"
319-
examplereceiver/settings:
320-
endpoint: "0.0.0.0:12346"
321-
prometheus:
322-
config:
323-
scrape_configs:
324-
- job_name: otel-collector
325-
scrape_interval: 10s
326-
jaeger/custom:
327-
protocols:
328-
thrift_http:
329-
endpoint: 0.0.0.0:15268
330-
`,
329+
TargetAllocator: AmazonCloudWatchAgentTargetAllocator{
330+
Enabled: true,
331+
},
332+
Prometheus: promCfg,
331333
Ports: []v1.ServicePort{
332334
{
333335
Name: "port1",
@@ -373,6 +375,30 @@ func TestOTELColValidatingWebhook(t *testing.T) {
373375
},
374376
expectedErr: "does not support the attribute 'tolerations'",
375377
},
378+
{
379+
name: "invalid mode with target allocator",
380+
otelcol: AmazonCloudWatchAgent{
381+
Spec: AmazonCloudWatchAgentSpec{
382+
Mode: ModeDeployment,
383+
TargetAllocator: AmazonCloudWatchAgentTargetAllocator{
384+
Enabled: true,
385+
},
386+
},
387+
},
388+
expectedErr: "does not support the target allocation deployment",
389+
},
390+
{
391+
name: "invalid target allocator config",
392+
otelcol: AmazonCloudWatchAgent{
393+
Spec: AmazonCloudWatchAgentSpec{
394+
Mode: ModeStatefulSet,
395+
TargetAllocator: AmazonCloudWatchAgentTargetAllocator{
396+
Enabled: true,
397+
},
398+
},
399+
},
400+
expectedErr: "the OpenTelemetry Spec Prometheus configuration is incorrect",
401+
},
376402
{
377403
name: "invalid port name",
378404
otelcol: AmazonCloudWatchAgent{
@@ -755,6 +781,7 @@ func TestOTELColValidatingWebhook(t *testing.T) {
755781
scheme: testScheme,
756782
cfg: config.New(
757783
config.WithCollectorImage("collector:v0.0.0"),
784+
config.WithTargetAllocatorImage("ta:v0.0.0"),
758785
),
759786
}
760787
ctx := context.Background()

0 commit comments

Comments
 (0)