Skip to content

Commit 3ef23b0

Browse files
authored
feat: implement opentelemetry for distributed tracing (#706)
* feat: add otel & tracing for distributed tracing Signed-off-by: namkyu1999 <[email protected]> * feat: add tracing codes to chaslib Signed-off-by: namkyu1999 <[email protected]> * fix: misc Signed-off-by: namkyu1999 <[email protected]> * fix: make otel optional Signed-off-by: namkyu1999 <[email protected]> * fix: skip if litmus-go not received trace_parent Signed-off-by: namkyu1999 <[email protected]> * fix: Set context.Context as a parameter in each function Signed-off-by: namkyu1999 <[email protected]> * update templates Signed-off-by: namkyu1999 <[email protected]> * feat: rename spans and enhance coverage Signed-off-by: namkyu1999 <[email protected]> * fix: avoid shadowing Signed-off-by: namkyu1999 <[email protected]> * fix: add logs Signed-off-by: namkyu1999 <[email protected]> * fix: add logs Signed-off-by: namkyu1999 <[email protected]> * fix: fix templates Signed-off-by: namkyu1999 <[email protected]> --------- Signed-off-by: namkyu1999 <[email protected]>
1 parent 0cd6c6f commit 3ef23b0

File tree

113 files changed

+1496
-816
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

113 files changed

+1496
-816
lines changed

bin/experiment/experiment.go

Lines changed: 73 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
package main
22

33
import (
4+
"context"
5+
"errors"
46
"flag"
7+
"os"
8+
59
// Uncomment to load all auth plugins
610
// _ "k8s.io/client-go/plugin/pkg/client/auth"
711

@@ -59,10 +63,11 @@ import (
5963
k6Loadgen "github.com/litmuschaos/litmus-go/experiments/load/k6-loadgen/experiment"
6064
springBootFaults "github.com/litmuschaos/litmus-go/experiments/spring-boot/spring-boot-faults/experiment"
6165
vmpoweroff "github.com/litmuschaos/litmus-go/experiments/vmware/vm-poweroff/experiment"
62-
63-
"github.com/litmuschaos/litmus-go/pkg/clients"
66+
cli "github.com/litmuschaos/litmus-go/pkg/clients"
6467
"github.com/litmuschaos/litmus-go/pkg/log"
68+
"github.com/litmuschaos/litmus-go/pkg/telemetry"
6569
"github.com/sirupsen/logrus"
70+
"go.opentelemetry.io/otel"
6671
)
6772

6873
func init() {
@@ -75,8 +80,25 @@ func init() {
7580
}
7681

7782
func main() {
83+
initCtx := context.Background()
84+
85+
// Set up Observability.
86+
if otelExporterEndpoint := os.Getenv(telemetry.OTELExporterOTLPEndpoint); otelExporterEndpoint != "" {
87+
shutdown, err := telemetry.InitOTelSDK(initCtx, true, otelExporterEndpoint)
88+
if err != nil {
89+
log.Errorf("Failed to initialize OTel SDK: %v", err)
90+
return
91+
}
92+
defer func() {
93+
err = errors.Join(err, shutdown(initCtx))
94+
}()
95+
initCtx = telemetry.GetTraceParentContext()
96+
}
97+
98+
clients := cli.ClientSets{}
7899

79-
clients := clients.ClientSets{}
100+
ctx, span := otel.Tracer(telemetry.TracerName).Start(initCtx, "ExecuteExperiment")
101+
defer span.End()
80102

81103
// parse the experiment name
82104
experimentName := flag.String("name", "pod-delete", "name of the chaos experiment")
@@ -92,101 +114,101 @@ func main() {
92114
// invoke the corresponding experiment based on the (-name) flag
93115
switch *experimentName {
94116
case "container-kill":
95-
containerKill.ContainerKill(clients)
117+
containerKill.ContainerKill(ctx, clients)
96118
case "disk-fill":
97-
diskFill.DiskFill(clients)
119+
diskFill.DiskFill(ctx, clients)
98120
case "kafka-broker-pod-failure":
99-
kafkaBrokerPodFailure.KafkaBrokerPodFailure(clients)
121+
kafkaBrokerPodFailure.KafkaBrokerPodFailure(ctx, clients)
100122
case "kubelet-service-kill":
101-
kubeletServiceKill.KubeletServiceKill(clients)
123+
kubeletServiceKill.KubeletServiceKill(ctx, clients)
102124
case "docker-service-kill":
103-
dockerServiceKill.DockerServiceKill(clients)
125+
dockerServiceKill.DockerServiceKill(ctx, clients)
104126
case "node-cpu-hog":
105-
nodeCPUHog.NodeCPUHog(clients)
127+
nodeCPUHog.NodeCPUHog(ctx, clients)
106128
case "node-drain":
107-
nodeDrain.NodeDrain(clients)
129+
nodeDrain.NodeDrain(ctx, clients)
108130
case "node-io-stress":
109-
nodeIOStress.NodeIOStress(clients)
131+
nodeIOStress.NodeIOStress(ctx, clients)
110132
case "node-memory-hog":
111-
nodeMemoryHog.NodeMemoryHog(clients)
133+
nodeMemoryHog.NodeMemoryHog(ctx, clients)
112134
case "node-taint":
113-
nodeTaint.NodeTaint(clients)
135+
nodeTaint.NodeTaint(ctx, clients)
114136
case "pod-autoscaler":
115-
podAutoscaler.PodAutoscaler(clients)
137+
podAutoscaler.PodAutoscaler(ctx, clients)
116138
case "pod-cpu-hog-exec":
117-
podCPUHogExec.PodCPUHogExec(clients)
139+
podCPUHogExec.PodCPUHogExec(ctx, clients)
118140
case "pod-delete":
119-
podDelete.PodDelete(clients)
141+
podDelete.PodDelete(ctx, clients)
120142
case "pod-io-stress":
121-
podIOStress.PodIOStress(clients)
143+
podIOStress.PodIOStress(ctx, clients)
122144
case "pod-memory-hog-exec":
123-
podMemoryHogExec.PodMemoryHogExec(clients)
145+
podMemoryHogExec.PodMemoryHogExec(ctx, clients)
124146
case "pod-network-corruption":
125-
podNetworkCorruption.PodNetworkCorruption(clients)
147+
podNetworkCorruption.PodNetworkCorruption(ctx, clients)
126148
case "pod-network-duplication":
127-
podNetworkDuplication.PodNetworkDuplication(clients)
149+
podNetworkDuplication.PodNetworkDuplication(ctx, clients)
128150
case "pod-network-latency":
129-
podNetworkLatency.PodNetworkLatency(clients)
151+
podNetworkLatency.PodNetworkLatency(ctx, clients)
130152
case "pod-network-loss":
131-
podNetworkLoss.PodNetworkLoss(clients)
153+
podNetworkLoss.PodNetworkLoss(ctx, clients)
132154
case "pod-network-partition":
133-
podNetworkPartition.PodNetworkPartition(clients)
155+
podNetworkPartition.PodNetworkPartition(ctx, clients)
134156
case "pod-memory-hog":
135-
podMemoryHog.PodMemoryHog(clients)
157+
podMemoryHog.PodMemoryHog(ctx, clients)
136158
case "pod-cpu-hog":
137-
podCPUHog.PodCPUHog(clients)
159+
podCPUHog.PodCPUHog(ctx, clients)
138160
case "cassandra-pod-delete":
139-
cassandraPodDelete.CasssandraPodDelete(clients)
161+
cassandraPodDelete.CasssandraPodDelete(ctx, clients)
140162
case "aws-ssm-chaos-by-id":
141-
awsSSMChaosByID.AWSSSMChaosByID(clients)
163+
awsSSMChaosByID.AWSSSMChaosByID(ctx, clients)
142164
case "aws-ssm-chaos-by-tag":
143-
awsSSMChaosByTag.AWSSSMChaosByTag(clients)
165+
awsSSMChaosByTag.AWSSSMChaosByTag(ctx, clients)
144166
case "ec2-terminate-by-id":
145-
ec2TerminateByID.EC2TerminateByID(clients)
167+
ec2TerminateByID.EC2TerminateByID(ctx, clients)
146168
case "ec2-terminate-by-tag":
147-
ec2TerminateByTag.EC2TerminateByTag(clients)
169+
ec2TerminateByTag.EC2TerminateByTag(ctx, clients)
148170
case "ebs-loss-by-id":
149-
ebsLossByID.EBSLossByID(clients)
171+
ebsLossByID.EBSLossByID(ctx, clients)
150172
case "ebs-loss-by-tag":
151-
ebsLossByTag.EBSLossByTag(clients)
173+
ebsLossByTag.EBSLossByTag(ctx, clients)
152174
case "node-restart":
153-
nodeRestart.NodeRestart(clients)
175+
nodeRestart.NodeRestart(ctx, clients)
154176
case "pod-dns-error":
155-
podDNSError.PodDNSError(clients)
177+
podDNSError.PodDNSError(ctx, clients)
156178
case "pod-dns-spoof":
157-
podDNSSpoof.PodDNSSpoof(clients)
179+
podDNSSpoof.PodDNSSpoof(ctx, clients)
158180
case "pod-http-latency":
159-
podHttpLatency.PodHttpLatency(clients)
181+
podHttpLatency.PodHttpLatency(ctx, clients)
160182
case "pod-http-status-code":
161-
podHttpStatusCode.PodHttpStatusCode(clients)
183+
podHttpStatusCode.PodHttpStatusCode(ctx, clients)
162184
case "pod-http-modify-header":
163-
podHttpModifyHeader.PodHttpModifyHeader(clients)
185+
podHttpModifyHeader.PodHttpModifyHeader(ctx, clients)
164186
case "pod-http-modify-body":
165-
podHttpModifyBody.PodHttpModifyBody(clients)
187+
podHttpModifyBody.PodHttpModifyBody(ctx, clients)
166188
case "pod-http-reset-peer":
167-
podHttpResetPeer.PodHttpResetPeer(clients)
189+
podHttpResetPeer.PodHttpResetPeer(ctx, clients)
168190
case "vm-poweroff":
169-
vmpoweroff.VMPoweroff(clients)
191+
vmpoweroff.VMPoweroff(ctx, clients)
170192
case "azure-instance-stop":
171-
azureInstanceStop.AzureInstanceStop(clients)
193+
azureInstanceStop.AzureInstanceStop(ctx, clients)
172194
case "azure-disk-loss":
173-
azureDiskLoss.AzureDiskLoss(clients)
195+
azureDiskLoss.AzureDiskLoss(ctx, clients)
174196
case "gcp-vm-disk-loss":
175-
gcpVMDiskLoss.VMDiskLoss(clients)
197+
gcpVMDiskLoss.VMDiskLoss(ctx, clients)
176198
case "pod-fio-stress":
177-
podFioStress.PodFioStress(clients)
199+
podFioStress.PodFioStress(ctx, clients)
178200
case "gcp-vm-instance-stop":
179-
gcpVMInstanceStop.VMInstanceStop(clients)
201+
gcpVMInstanceStop.VMInstanceStop(ctx, clients)
180202
case "redfish-node-restart":
181-
redfishNodeRestart.NodeRestart(clients)
203+
redfishNodeRestart.NodeRestart(ctx, clients)
182204
case "gcp-vm-instance-stop-by-label":
183-
gcpVMInstanceStopByLabel.GCPVMInstanceStopByLabel(clients)
205+
gcpVMInstanceStopByLabel.GCPVMInstanceStopByLabel(ctx, clients)
184206
case "gcp-vm-disk-loss-by-label":
185-
gcpVMDiskLossByLabel.GCPVMDiskLossByLabel(clients)
207+
gcpVMDiskLossByLabel.GCPVMDiskLossByLabel(ctx, clients)
186208
case "spring-boot-cpu-stress", "spring-boot-memory-stress", "spring-boot-exceptions", "spring-boot-app-kill", "spring-boot-faults", "spring-boot-latency":
187-
springBootFaults.Experiment(clients, *experimentName)
209+
springBootFaults.Experiment(ctx, clients, *experimentName)
188210
case "k6-loadgen":
189-
k6Loadgen.Experiment(clients)
211+
k6Loadgen.Experiment(ctx, clients)
190212
default:
191213
log.Errorf("Unsupported -name %v, please provide the correct value of -name args", *experimentName)
192214
return

bin/helper/helper.go

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
package main
22

33
import (
4+
"context"
5+
"errors"
46
"flag"
7+
"os"
8+
59
// Uncomment to load all auth plugins
610
// _ "k8s.io/client-go/plugin/pkg/client/auth"
711

@@ -17,10 +21,11 @@ import (
1721
networkChaos "github.com/litmuschaos/litmus-go/chaoslib/litmus/network-chaos/helper"
1822
dnsChaos "github.com/litmuschaos/litmus-go/chaoslib/litmus/pod-dns-chaos/helper"
1923
stressChaos "github.com/litmuschaos/litmus-go/chaoslib/litmus/stress-chaos/helper"
20-
21-
"github.com/litmuschaos/litmus-go/pkg/clients"
24+
cli "github.com/litmuschaos/litmus-go/pkg/clients"
2225
"github.com/litmuschaos/litmus-go/pkg/log"
26+
"github.com/litmuschaos/litmus-go/pkg/telemetry"
2327
"github.com/sirupsen/logrus"
28+
"go.opentelemetry.io/otel"
2429
)
2530

2631
func init() {
@@ -33,8 +38,24 @@ func init() {
3338
}
3439

3540
func main() {
41+
ctx := context.Background()
42+
// Set up Observability.
43+
if otelExporterEndpoint := os.Getenv(telemetry.OTELExporterOTLPEndpoint); otelExporterEndpoint != "" {
44+
shutdown, err := telemetry.InitOTelSDK(ctx, true, otelExporterEndpoint)
45+
if err != nil {
46+
log.Errorf("Failed to initialize OTel SDK: %v", err)
47+
return
48+
}
49+
defer func() {
50+
err = errors.Join(err, shutdown(ctx))
51+
}()
52+
ctx = telemetry.GetTraceParentContext()
53+
}
54+
55+
clients := cli.ClientSets{}
3656

37-
clients := clients.ClientSets{}
57+
_, span := otel.Tracer(telemetry.TracerName).Start(ctx, "ExecuteExperimentHelper")
58+
defer span.End()
3859

3960
// parse the helper name
4061
helperName := flag.String("name", "", "name of the helper pod")

build/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Multi-stage docker build
22
# Build stage
3-
FROM golang:1.20 AS builder
3+
FROM golang:1.22 AS builder
44

55
ARG TARGETOS=linux
66
ARG TARGETARCH

chaoslib/litmus/aws-ssm-chaos/lib/ssm-chaos.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,28 @@
11
package lib
22

33
import (
4+
"context"
45
"os"
56
"strings"
67
"time"
78

89
experimentTypes "github.com/litmuschaos/litmus-go/pkg/aws-ssm/aws-ssm-chaos/types"
9-
clients "github.com/litmuschaos/litmus-go/pkg/clients"
10+
"github.com/litmuschaos/litmus-go/pkg/clients"
1011
"github.com/litmuschaos/litmus-go/pkg/cloud/aws/ssm"
1112
"github.com/litmuschaos/litmus-go/pkg/events"
1213
"github.com/litmuschaos/litmus-go/pkg/log"
1314
"github.com/litmuschaos/litmus-go/pkg/probe"
15+
"github.com/litmuschaos/litmus-go/pkg/telemetry"
1416
"github.com/litmuschaos/litmus-go/pkg/types"
1517
"github.com/litmuschaos/litmus-go/pkg/utils/common"
1618
"github.com/palantir/stacktrace"
19+
"go.opentelemetry.io/otel"
1720
)
1821

1922
// InjectChaosInSerialMode will inject the aws ssm chaos in serial mode that is one after other
20-
func InjectChaosInSerialMode(experimentsDetails *experimentTypes.ExperimentDetails, instanceIDList []string, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails, inject chan os.Signal) error {
23+
func InjectChaosInSerialMode(ctx context.Context, experimentsDetails *experimentTypes.ExperimentDetails, instanceIDList []string, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails, inject chan os.Signal) error {
24+
ctx, span := otel.Tracer(telemetry.TracerName).Start(ctx, "InjectAWSSSMFaultInSerialMode")
25+
defer span.End()
2126

2227
select {
2328
case <-inject:
@@ -60,7 +65,7 @@ func InjectChaosInSerialMode(experimentsDetails *experimentTypes.ExperimentDetai
6065

6166
// run the probes during chaos
6267
if len(resultDetails.ProbeDetails) != 0 && i == 0 {
63-
if err = probe.RunProbes(chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
68+
if err = probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
6469
return stacktrace.Propagate(err, "failed to run probes")
6570
}
6671
}
@@ -85,7 +90,9 @@ func InjectChaosInSerialMode(experimentsDetails *experimentTypes.ExperimentDetai
8590
}
8691

8792
// InjectChaosInParallelMode will inject the aws ssm chaos in parallel mode that is all at once
88-
func InjectChaosInParallelMode(experimentsDetails *experimentTypes.ExperimentDetails, instanceIDList []string, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails, inject chan os.Signal) error {
93+
func InjectChaosInParallelMode(ctx context.Context, experimentsDetails *experimentTypes.ExperimentDetails, instanceIDList []string, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails, inject chan os.Signal) error {
94+
ctx, span := otel.Tracer(telemetry.TracerName).Start(ctx, "InjectAWSSSMFaultInParallelMode")
95+
defer span.End()
8996

9097
select {
9198
case <-inject:
@@ -125,7 +132,7 @@ func InjectChaosInParallelMode(experimentsDetails *experimentTypes.ExperimentDet
125132

126133
// run the probes during chaos
127134
if len(resultDetails.ProbeDetails) != 0 {
128-
if err = probe.RunProbes(chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
135+
if err = probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
129136
return stacktrace.Propagate(err, "failed to run probes")
130137
}
131138
}

chaoslib/litmus/aws-ssm-chaos/lib/ssm/aws-ssm-chaos-by-id.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package ssm
22

33
import (
4+
"context"
45
"fmt"
56
"os"
67
"os/signal"
@@ -10,12 +11,14 @@ import (
1011
"github.com/litmuschaos/litmus-go/chaoslib/litmus/aws-ssm-chaos/lib"
1112
experimentTypes "github.com/litmuschaos/litmus-go/pkg/aws-ssm/aws-ssm-chaos/types"
1213
"github.com/litmuschaos/litmus-go/pkg/cerrors"
13-
clients "github.com/litmuschaos/litmus-go/pkg/clients"
14+
"github.com/litmuschaos/litmus-go/pkg/clients"
1415
"github.com/litmuschaos/litmus-go/pkg/cloud/aws/ssm"
1516
"github.com/litmuschaos/litmus-go/pkg/log"
17+
"github.com/litmuschaos/litmus-go/pkg/telemetry"
1618
"github.com/litmuschaos/litmus-go/pkg/types"
1719
"github.com/litmuschaos/litmus-go/pkg/utils/common"
1820
"github.com/palantir/stacktrace"
21+
"go.opentelemetry.io/otel"
1922
)
2023

2124
var (
@@ -24,7 +27,9 @@ var (
2427
)
2528

2629
// PrepareAWSSSMChaosByID contains the prepration and injection steps for the experiment
27-
func PrepareAWSSSMChaosByID(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {
30+
func PrepareAWSSSMChaosByID(ctx context.Context, experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {
31+
ctx, span := otel.Tracer(telemetry.TracerName).Start(ctx, "PrepareAWSSSMFaultByID")
32+
defer span.End()
2833

2934
// inject channel is used to transmit signal notifications.
3035
inject = make(chan os.Signal, 1)
@@ -60,11 +65,11 @@ func PrepareAWSSSMChaosByID(experimentsDetails *experimentTypes.ExperimentDetail
6065

6166
switch strings.ToLower(experimentsDetails.Sequence) {
6267
case "serial":
63-
if err = lib.InjectChaosInSerialMode(experimentsDetails, instanceIDList, clients, resultDetails, eventsDetails, chaosDetails, inject); err != nil {
68+
if err = lib.InjectChaosInSerialMode(ctx, experimentsDetails, instanceIDList, clients, resultDetails, eventsDetails, chaosDetails, inject); err != nil {
6469
return stacktrace.Propagate(err, "could not run chaos in serial mode")
6570
}
6671
case "parallel":
67-
if err = lib.InjectChaosInParallelMode(experimentsDetails, instanceIDList, clients, resultDetails, eventsDetails, chaosDetails, inject); err != nil {
72+
if err = lib.InjectChaosInParallelMode(ctx, experimentsDetails, instanceIDList, clients, resultDetails, eventsDetails, chaosDetails, inject); err != nil {
6873
return stacktrace.Propagate(err, "could not run chaos in parallel mode")
6974
}
7075
default:

0 commit comments

Comments
 (0)