Skip to content

Commit 7ced162

Browse files
committed
2 parents 6bcf0f4 + 6092786 commit 7ced162

File tree

66 files changed

+5962
-5422
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+5962
-5422
lines changed

.github/instructions/githubactions.instructions.md

Lines changed: 654 additions & 0 deletions
Large diffs are not rendered by default.

.github/instructions/golang.instructions.md

Lines changed: 373 additions & 0 deletions
Large diffs are not rendered by default.

CODEOWNERS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,13 @@
44
/docs/ @kyma-project/technical-writers
55
/docs/user/integration/sample-app @kyma-project/observability
66

7+
78
# All .md files
89
*.md @kyma-project/technical-writers
910

11+
# provide a more specific rule for the /.github directory, to remove @kyma-project/technical-writers ownership
12+
/.github/**/*.md @kyma-project/observability
13+
1014
# JSON, NDJSON files
1115
*.ndjson @kyma-project/observability
1216
*.json @kyma-project/observability

Makefile

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,18 @@ docker-build: ## Build docker image with the manager
219219
docker-push: ## Push docker image with the manager
220220
docker push ${MANAGER_IMAGE}
221221

222+
.PHONY: docker-build-selfmonitor
223+
docker-build-selfmonitor: ## Build docker image for telemetry self-monitor
224+
@set -a && . dependencies/telemetry-self-monitor/envs && set +a && \
225+
docker build -t ${SELF_MONITOR_IMAGE} \
226+
--build-arg ALPINE_VERSION=$${ALPINE_VERSION} \
227+
--build-arg PROMETHEUS_VERSION=$${PROMETHEUS_VERSION} \
228+
dependencies/telemetry-self-monitor
229+
230+
.PHONY: docker-push-selfmonitor
231+
docker-push-selfmonitor: ## Push docker image for telemetry self-monitor
232+
docker push ${SELF_MONITOR_IMAGE}
233+
222234
##@ Development
223235

224236
.PHONY: run

controllers/telemetry/logpipeline_controller.go

Lines changed: 49 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -126,10 +126,9 @@ func NewLogPipelineController(config LogPipelineControllerConfig, client client.
126126

127127
reconciler := logpipeline.New(
128128
client,
129-
overrides.New(config.Global, client),
130-
pipelineSyncer,
131-
fluentBitReconciler,
132-
otelReconciler,
129+
logpipeline.WithOverridesHandler(overrides.New(config.Global, client)),
130+
logpipeline.WithPipelineSyncer(pipelineSyncer),
131+
logpipeline.WithReconcilers(fluentBitReconciler, otelReconciler),
133132
)
134133

135134
return &LogPipelineController{
@@ -195,12 +194,12 @@ func (r *LogPipelineController) mapTelemetryChanges(ctx context.Context, object
195194
}
196195

197196
func configureFluentBitReconciler(config LogPipelineControllerConfig, client client.Client, flowHealthProber *prober.FluentBitProber, pipelineLock logpipelinefluentbit.PipelineLock) (*logpipelinefluentbit.Reconciler, error) {
198-
pipelineValidator := &logpipelinefluentbit.Validator{
199-
EndpointValidator: &endpoint.Validator{Client: client},
200-
TLSCertValidator: tlscert.New(client),
201-
SecretRefValidator: &secretref.Validator{Client: client},
202-
PipelineLock: pipelineLock,
203-
}
197+
pipelineValidator := logpipelinefluentbit.NewValidator(
198+
logpipelinefluentbit.WithEndpointValidator(&endpoint.Validator{Client: client}),
199+
logpipelinefluentbit.WithTLSCertValidator(tlscert.New(client)),
200+
logpipelinefluentbit.WithSecretRefValidator(&secretref.Validator{Client: client}),
201+
logpipelinefluentbit.WithValidatorPipelineLock(pipelineLock),
202+
)
204203

205204
fluentBitApplierDeleter := fluentbit.NewFluentBitApplierDeleter(
206205
config.TargetNamespace(),
@@ -218,16 +217,19 @@ func configureFluentBitReconciler(config LogPipelineControllerConfig, client cli
218217
}
219218

220219
fbReconciler := logpipelinefluentbit.New(
221-
config.Global,
222-
client,
223-
fluentBitConfigBuilder,
224-
fluentBitApplierDeleter,
225-
&workloadstatus.DaemonSetProber{Client: client},
226-
flowHealthProber,
227-
istiostatus.NewChecker(discoveryClient),
228-
pipelineLock,
229-
pipelineValidator,
230-
&conditions.ErrorToMessageConverter{})
220+
logpipelinefluentbit.WithClient(client),
221+
logpipelinefluentbit.WithGlobals(config.Global),
222+
223+
logpipelinefluentbit.WithAgentApplierDeleter(fluentBitApplierDeleter),
224+
logpipelinefluentbit.WithAgentConfigBuilder(fluentBitConfigBuilder),
225+
logpipelinefluentbit.WithAgentProber(&workloadstatus.DaemonSetProber{Client: client}),
226+
227+
logpipelinefluentbit.WithErrorToMessageConverter(&conditions.ErrorToMessageConverter{}),
228+
logpipelinefluentbit.WithFlowHealthProber(flowHealthProber),
229+
logpipelinefluentbit.WithIstioStatusChecker(istiostatus.NewChecker(discoveryClient)),
230+
logpipelinefluentbit.WithPipelineLock(pipelineLock),
231+
logpipelinefluentbit.WithPipelineValidator(pipelineValidator),
232+
)
231233

232234
return fbReconciler, nil
233235
}
@@ -244,14 +246,14 @@ func configureOTelReconciler(config LogPipelineControllerConfig, client client.C
244246
return nil, err
245247
}
246248

247-
pipelineValidator := &logpipelineotel.Validator{
248-
PipelineLock: pipelineLock,
249-
EndpointValidator: &endpoint.Validator{Client: client},
250-
TLSCertValidator: tlscert.New(client),
251-
SecretRefValidator: &secretref.Validator{Client: client},
252-
TransformSpecValidator: transformSpecValidator,
253-
FilterSpecValidator: filterSpecValidator,
254-
}
249+
pipelineValidator := logpipelineotel.NewValidator(
250+
logpipelineotel.WithValidatorPipelineLock(pipelineLock),
251+
logpipelineotel.WithEndpointValidator(&endpoint.Validator{Client: client}),
252+
logpipelineotel.WithTLSCertValidator(tlscert.New(client)),
253+
logpipelineotel.WithSecretRefValidator(&secretref.Validator{Client: client}),
254+
logpipelineotel.WithTransformSpecValidator(transformSpecValidator),
255+
logpipelineotel.WithFilterSpecValidator(filterSpecValidator),
256+
)
255257

256258
discoveryClient, err := discovery.NewDiscoveryClientForConfig(config.RestConfig)
257259
if err != nil {
@@ -263,20 +265,25 @@ func configureOTelReconciler(config LogPipelineControllerConfig, client client.C
263265
}
264266

265267
otelReconciler := logpipelineotel.New(
266-
config.Global,
267-
client,
268-
gatewayFlowHealthProber,
269-
agentFlowHealthProber,
270-
agentConfigBuilder,
271-
otelcollector.NewLogAgentApplierDeleter(config.Global, config.OTelCollectorImage, config.LogAgentPriorityClassName),
272-
&workloadstatus.DaemonSetProber{Client: client},
273-
otelcollector.NewLogGatewayApplierDeleter(config.Global, config.OTelCollectorImage, config.LogGatewayPriorityClassName),
274-
&loggateway.Builder{Reader: client},
275-
&workloadstatus.DeploymentProber{Client: client},
276-
istiostatus.NewChecker(discoveryClient),
277-
pipelineLock,
278-
pipelineValidator,
279-
&conditions.ErrorToMessageConverter{})
268+
logpipelineotel.WithClient(client),
269+
logpipelineotel.WithGlobals(config.Global),
270+
271+
logpipelineotel.WithAgentApplierDeleter(otelcollector.NewLogAgentApplierDeleter(config.Global, config.OTelCollectorImage, config.LogAgentPriorityClassName)),
272+
logpipelineotel.WithAgentConfigBuilder(agentConfigBuilder),
273+
logpipelineotel.WithAgentFlowHealthProber(agentFlowHealthProber),
274+
logpipelineotel.WithAgentProber(&workloadstatus.DaemonSetProber{Client: client}),
275+
276+
logpipelineotel.WithErrorToMessageConverter(&conditions.ErrorToMessageConverter{}),
277+
278+
logpipelineotel.WithGatewayApplierDeleter(otelcollector.NewLogGatewayApplierDeleter(config.Global, config.OTelCollectorImage, config.LogGatewayPriorityClassName)),
279+
logpipelineotel.WithGatewayConfigBuilder(&loggateway.Builder{Reader: client}),
280+
logpipelineotel.WithGatewayFlowHealthProber(gatewayFlowHealthProber),
281+
logpipelineotel.WithGatewayProber(&workloadstatus.DeploymentProber{Client: client}),
282+
283+
logpipelineotel.WithIstioStatusChecker(istiostatus.NewChecker(discoveryClient)),
284+
logpipelineotel.WithPipelineLock(pipelineLock),
285+
logpipelineotel.WithPipelineValidator(pipelineValidator),
286+
)
280287

281288
return otelReconciler, nil
282289
}

controllers/telemetry/metricpipeline_controller.go

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -112,14 +112,14 @@ func NewMetricPipelineController(config MetricPipelineControllerConfig, client c
112112
return nil, err
113113
}
114114

115-
pipelineValidator := &metricpipeline.Validator{
116-
EndpointValidator: &endpoint.Validator{Client: client},
117-
TLSCertValidator: tlscert.New(client),
118-
SecretRefValidator: &secretref.Validator{Client: client},
119-
PipelineLock: pipelineLock,
120-
TransformSpecValidator: transformSpecValidator,
121-
FilterSpecValidator: filterSpecValidator,
122-
}
115+
pipelineValidator := metricpipeline.NewValidator(
116+
metricpipeline.WithEndpointValidator(&endpoint.Validator{Client: client}),
117+
metricpipeline.WithTLSCertValidator(tlscert.New(client)),
118+
metricpipeline.WithSecretRefValidator(&secretref.Validator{Client: client}),
119+
metricpipeline.WithValidatorPipelineLock(pipelineLock),
120+
metricpipeline.WithTransformSpecValidator(transformSpecValidator),
121+
metricpipeline.WithFilterSpecValidator(filterSpecValidator),
122+
)
123123

124124
discoveryClient, err := discovery.NewDiscoveryClientForConfig(config.RestConfig)
125125
if err != nil {
@@ -130,22 +130,26 @@ func NewMetricPipelineController(config MetricPipelineControllerConfig, client c
130130
gatewayConfigBuilder := &metricgateway.Builder{Reader: client}
131131

132132
reconciler := metricpipeline.New(
133-
config.Global,
134-
client,
135-
otelcollector.NewMetricAgentApplierDeleter(config.Global, config.OTelCollectorImage, config.MetricAgentPriorityClassName),
136-
agentConfigBuilder,
137-
&workloadstatus.DaemonSetProber{Client: client},
138-
gatewayFlowHealthProber,
139-
agentFlowHealthProber,
140-
otelcollector.NewMetricGatewayApplierDeleter(config.Global, config.OTelCollectorImage, config.MetricGatewayPriorityClassName),
141-
gatewayConfigBuilder,
142-
&workloadstatus.DeploymentProber{Client: client},
143-
istiostatus.NewChecker(discoveryClient),
144-
overrides.New(config.Global, client),
145-
pipelineLock,
146-
pipelineSync,
147-
pipelineValidator,
148-
&conditions.ErrorToMessageConverter{},
133+
metricpipeline.WithClient(client),
134+
metricpipeline.WithGlobals(config.Global),
135+
136+
metricpipeline.WithAgentApplierDeleter(otelcollector.NewMetricAgentApplierDeleter(config.Global, config.OTelCollectorImage, config.MetricAgentPriorityClassName)),
137+
metricpipeline.WithAgentConfigBuilder(agentConfigBuilder),
138+
metricpipeline.WithAgentFlowHealthProber(agentFlowHealthProber),
139+
metricpipeline.WithAgentProber(&workloadstatus.DaemonSetProber{Client: client}),
140+
141+
metricpipeline.WithGatewayApplierDeleter(otelcollector.NewMetricGatewayApplierDeleter(config.Global, config.OTelCollectorImage, config.MetricGatewayPriorityClassName)),
142+
metricpipeline.WithGatewayConfigBuilder(gatewayConfigBuilder),
143+
metricpipeline.WithGatewayFlowHealthProber(gatewayFlowHealthProber),
144+
metricpipeline.WithGatewayProber(&workloadstatus.DeploymentProber{Client: client}),
145+
146+
metricpipeline.WithErrorToMessageConverter(&conditions.ErrorToMessageConverter{}),
147+
metricpipeline.WithIstioStatusChecker(istiostatus.NewChecker(discoveryClient)),
148+
metricpipeline.WithOverridesHandler(overrides.New(config.Global, client)),
149+
metricpipeline.WithPipelineValidator(pipelineValidator),
150+
151+
metricpipeline.WithPipelineLock(pipelineLock),
152+
metricpipeline.WithPipelineSyncer(pipelineSync),
149153
)
150154

151155
return &MetricPipelineController{

controllers/telemetry/tracepipeline_controller.go

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -105,33 +105,37 @@ func NewTracePipelineController(config TracePipelineControllerConfig, client cli
105105
return nil, err
106106
}
107107

108-
pipelineValidator := &tracepipeline.Validator{
109-
EndpointValidator: &endpoint.Validator{Client: client},
110-
TLSCertValidator: tlscert.New(client),
111-
SecretRefValidator: &secretref.Validator{Client: client},
112-
PipelineLock: pipelineLock,
113-
TransformSpecValidator: transformSpecValidator,
114-
FilterSpecValidator: filterSpecValidator,
115-
}
108+
pipelineValidator := tracepipeline.NewValidator(
109+
tracepipeline.WithEndpointValidator(&endpoint.Validator{Client: client}),
110+
tracepipeline.WithTLSCertValidator(tlscert.New(client)),
111+
tracepipeline.WithSecretRefValidator(&secretref.Validator{Client: client}),
112+
tracepipeline.WithValidatorPipelineLock(pipelineLock),
113+
tracepipeline.WithTransformSpecValidator(transformSpecValidator),
114+
tracepipeline.WithFilterSpecValidator(filterSpecValidator),
115+
)
116116

117117
discoveryClient, err := discovery.NewDiscoveryClientForConfig(config.RestConfig)
118118
if err != nil {
119119
return nil, err
120120
}
121121

122122
reconciler := tracepipeline.New(
123-
client,
124-
config.Global,
125-
flowHealthProber,
126-
otelcollector.NewTraceGatewayApplierDeleter(config.Global, config.OTelCollectorImage, config.TraceGatewayPriorityClassName),
127-
&tracegateway.Builder{Reader: client},
128-
&workloadstatus.DeploymentProber{Client: client},
129-
istiostatus.NewChecker(discoveryClient),
130-
overrides.New(config.Global, client),
131-
pipelineLock,
132-
pipelineSync,
133-
pipelineValidator,
134-
&conditions.ErrorToMessageConverter{})
123+
tracepipeline.WithClient(client),
124+
tracepipeline.WithGlobal(config.Global),
125+
126+
tracepipeline.WithGatewayApplierDeleter(otelcollector.NewTraceGatewayApplierDeleter(config.Global, config.OTelCollectorImage, config.TraceGatewayPriorityClassName)),
127+
tracepipeline.WithGatewayConfigBuilder(&tracegateway.Builder{Reader: client}),
128+
tracepipeline.WithGatewayProber(&workloadstatus.DeploymentProber{Client: client}),
129+
130+
tracepipeline.WithFlowHealthProber(flowHealthProber),
131+
tracepipeline.WithIstioStatusChecker(istiostatus.NewChecker(discoveryClient)),
132+
tracepipeline.WithOverridesHandler(overrides.New(config.Global, client)),
133+
tracepipeline.WithErrorToMessageConverter(&conditions.ErrorToMessageConverter{}),
134+
135+
tracepipeline.WithPipelineLock(pipelineLock),
136+
tracepipeline.WithPipelineSyncer(pipelineSync),
137+
tracepipeline.WithPipelineValidator(pipelineValidator),
138+
)
135139

136140
return &TracePipelineController{
137141
Client: client,

docs/contributor/arch/019-switch-from-gateways-to-a-central-agent.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,34 @@ We conducted the following PoCs:
113113
## Conclusion
114114

115115
The original motivation for the gateway concept is no longer relevant. Transitioning to the agent approach resolves many issues while introducing only minor drawbacks. However, inadequate retry handling by OTel SDKs and Istio proxies remains a challenge. Before proceeding, we must ensure this issue is addressed, considering its alignment with the OTLP specification.
116+
117+
### Agent Rollout and Zero-Downtime Updates
118+
119+
To perform a zero-downtime rollout of a `DaemonSet`, use the `RollingUpdate` update strategy with `maxUnavailable: 0` and `maxSurge: 1`. See the following example:
120+
121+
```yaml
122+
apiVersion: apps/v1
123+
kind: DaemonSet
124+
metadata:
125+
name: otel
126+
namespace: otel
127+
spec:
128+
selector:
129+
matchLabels:
130+
app: otel-col
131+
updateStrategy:
132+
type: RollingUpdate
133+
rollingUpdate:
134+
maxUnavailable: 0
135+
maxSurge: 1
136+
...
137+
```
138+
139+
With this configuration, the `DaemonSet` creates a new Pod on each node before terminating the old one, ensuring that a replacement Pod is fully running before any disruption occurs.
140+
141+
This setup was tested using the OpenTelemetry Demo application to verify that no telemetry data is lost during the rollout. Tests were executed for all three signal types: metrics, traces, and logs. The demo application, which includes services written in various programming languages and uses the OTel SDK, continued sending telemetry to the test collector throughout the rollout without any data loss.
142+
143+
The setup was also tested using `TelemetryGen.` TelemetryGen successfully detected endpoint changes after the rollout and continued sending telemetry to the new endpoints.
144+
145+
The same behavior was confirmed with `Istio access logs`. After the rollout, log data continued flowing to the new endpoints without data loss or duplication.
146+

docs/user/integration/dynatrace/README.md

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,30 @@ We recommend direct integration with the Dynatrace server. This approach reduces
170170
metadata:
171171
name: dynatrace
172172
spec:
173+
transform:
174+
- statements:
175+
- set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.statefulset.name"]) where IsString(resource.attributes["k8s.statefulset.name"])
176+
- set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.replicaset.name"]) where IsString(resource.attributes["k8s.replicaset.name"])
177+
- set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.job.name"]) where IsString(resource.attributes["k8s.job.name"])
178+
- set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.deployment.name"]) where IsString(resource.attributes["k8s.deployment.name"])
179+
- set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.daemonset.name"]) where IsString(resource.attributes["k8s.daemonset.name"])
180+
- set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.cronjob.name"]) where IsString(resource.attributes["k8s.cronjob.name"])
181+
- set(resource.attributes["k8s.workload.kind"], "statefulset") where IsString(resource.attributes["k8s.statefulset.name"])
182+
- set(resource.attributes["k8s.workload.kind"], "replicaset") where IsString(resource.attributes["k8s.replicaset.name"])
183+
- set(resource.attributes["k8s.workload.kind"], "job") where IsString(resource.attributes["k8s.job.name"])
184+
- set(resource.attributes["k8s.workload.kind"], "deployment") where IsString(resource.attributes["k8s.deployment.name"])
185+
- set(resource.attributes["k8s.workload.kind"], "daemonset") where IsString(resource.attributes["k8s.daemonset.name"])
186+
- set(resource.attributes["k8s.workload.kind"], "cronjob") where IsString(resource.attributes["k8s.cronjob.name"])
187+
- set(resource.attributes["dt.kubernetes.workload.name"], resource.attributes["k8s.workload.name"])
188+
- set(resource.attributes["dt.kubernetes.workload.kind"], resource.attributes["k8s.workload.kind"])
189+
- delete_key(resource.attributes, "k8s.statefulset.name")
190+
- delete_key(resource.attributes, "k8s.replicaset.name")
191+
- delete_key(resource.attributes, "k8s.job.name")
192+
- delete_key(resource.attributes, "k8s.deployment.name")
193+
- delete_key(resource.attributes, "k8s.daemonset.name")
194+
- delete_key(resource.attributes, "k8s.cronjob.name")
195+
- statements:
196+
- set(resource.attributes["k8s.pod.ip"], resource.attributes["ip"]) where resource.attributes["k8s.pod.ip"] == nil
173197
output:
174198
otlp:
175199
endpoint:
@@ -261,6 +285,30 @@ Depending on your metrics source and temporality, choose one of the following me
261285
metadata:
262286
name: dynatrace
263287
spec:
288+
transform:
289+
- statements:
290+
- set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.statefulset.name"]) where IsString(resource.attributes["k8s.statefulset.name"])
291+
- set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.replicaset.name"]) where IsString(resource.attributes["k8s.replicaset.name"])
292+
- set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.job.name"]) where IsString(resource.attributes["k8s.job.name"])
293+
- set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.deployment.name"]) where IsString(resource.attributes["k8s.deployment.name"])
294+
- set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.daemonset.name"]) where IsString(resource.attributes["k8s.daemonset.name"])
295+
- set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.cronjob.name"]) where IsString(resource.attributes["k8s.cronjob.name"])
296+
- set(resource.attributes["k8s.workload.kind"], "statefulset") where IsString(resource.attributes["k8s.statefulset.name"])
297+
- set(resource.attributes["k8s.workload.kind"], "replicaset") where IsString(resource.attributes["k8s.replicaset.name"])
298+
- set(resource.attributes["k8s.workload.kind"], "job") where IsString(resource.attributes["k8s.job.name"])
299+
- set(resource.attributes["k8s.workload.kind"], "deployment") where IsString(resource.attributes["k8s.deployment.name"])
300+
- set(resource.attributes["k8s.workload.kind"], "daemonset") where IsString(resource.attributes["k8s.daemonset.name"])
301+
- set(resource.attributes["k8s.workload.kind"], "cronjob") where IsString(resource.attributes["k8s.cronjob.name"])
302+
- set(resource.attributes["dt.kubernetes.workload.name"], resource.attributes["k8s.workload.name"])
303+
- set(resource.attributes["dt.kubernetes.workload.kind"], resource.attributes["k8s.workload.kind"])
304+
- delete_key(resource.attributes, "k8s.statefulset.name")
305+
- delete_key(resource.attributes, "k8s.replicaset.name")
306+
- delete_key(resource.attributes, "k8s.job.name")
307+
- delete_key(resource.attributes, "k8s.deployment.name")
308+
- delete_key(resource.attributes, "k8s.daemonset.name")
309+
- delete_key(resource.attributes, "k8s.cronjob.name")
310+
- statements:
311+
- set(resource.attributes["k8s.pod.ip"], resource.attributes["ip"]) where resource.attributes["k8s.pod.ip"] == nil
264312
output:
265313
otlp:
266314
endpoint:

docs/user/integration/k8s-events/cloud-logging-dashboard.ndjson

Lines changed: 1 addition & 2 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)