Skip to content

Commit 22ee6df

Browse files
committed
feat: distributed tracing span error (pod-delete only)
Signed-off-by: Jaeyeon Park <[email protected]>
1 parent 8246ff8 commit 22ee6df

File tree

4 files changed

+80
-4
lines changed

4 files changed

+80
-4
lines changed

bin/experiment/experiment.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"errors"
66
"flag"
7+
"fmt"
78
"os"
89

910
// Uncomment to load all auth plugins
@@ -68,6 +69,7 @@ import (
6869
"github.com/litmuschaos/litmus-go/pkg/telemetry"
6970
"github.com/sirupsen/logrus"
7071
"go.opentelemetry.io/otel"
72+
"go.opentelemetry.io/otel/codes"
7173
)
7274

7375
func init() {
@@ -106,6 +108,8 @@ func main() {
106108
//Getting kubeConfig and Generate ClientSets
107109
if err := clients.GenerateClientSetFromKubeConfig(); err != nil {
108110
log.Errorf("Unable to Get the kubeconfig, err: %v", err)
111+
span.SetStatus(codes.Error, "Unable to Get the kubeconfig")
112+
span.RecordError(err)
109113
return
110114
}
111115

@@ -211,6 +215,7 @@ func main() {
211215
k6Loadgen.Experiment(ctx, clients)
212216
default:
213217
log.Errorf("Unsupported -name %v, please provide the correct value of -name args", *experimentName)
218+
span.SetStatus(codes.Error, fmt.Sprintf("Unsupported -name %v", *experimentName))
214219
return
215220
}
216221
}

chaoslib/litmus/pod-delete/lib/pod-delete.go

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"github.com/palantir/stacktrace"
2222
"github.com/sirupsen/logrus"
2323
"go.opentelemetry.io/otel"
24+
"go.opentelemetry.io/otel/codes"
2425
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2526
)
2627

@@ -46,14 +47,22 @@ func PreparePodDelete(ctx context.Context, experimentsDetails *experimentTypes.E
4647
switch strings.ToLower(experimentsDetails.Sequence) {
4748
case "serial":
4849
if err := injectChaosInSerialMode(ctx, experimentsDetails, clients, chaosDetails, eventsDetails, resultDetails); err != nil {
50+
span.SetStatus(codes.Error, "could not run chaos in serial mode")
51+
span.RecordError(err)
4952
return stacktrace.Propagate(err, "could not run chaos in serial mode")
5053
}
5154
case "parallel":
5255
if err := injectChaosInParallelMode(ctx, experimentsDetails, clients, chaosDetails, eventsDetails, resultDetails); err != nil {
56+
span.SetStatus(codes.Error, "could not run chaos in parallel mode")
57+
span.RecordError(err)
5358
return stacktrace.Propagate(err, "could not run chaos in parallel mode")
5459
}
5560
default:
56-
return cerrors.Error{ErrorCode: cerrors.ErrorTypeGeneric, Reason: fmt.Sprintf("'%s' sequence is not supported", experimentsDetails.Sequence)}
61+
errReason := fmt.Sprintf("sequence '%s' is not supported", experimentsDetails.Sequence)
62+
span.SetStatus(codes.Error, errReason)
63+
err := cerrors.Error{ErrorCode: cerrors.ErrorTypeGeneric, Reason: errReason}
64+
span.RecordError(err)
65+
return err
5766
}
5867

5968
//Waiting for the ramp time after chaos injection
@@ -72,6 +81,8 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
7281
// run the probes during chaos
7382
if len(resultDetails.ProbeDetails) != 0 {
7483
if err := probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
84+
span.SetStatus(codes.Error, "could not run the probes during chaos")
85+
span.RecordError(err)
7586
return err
7687
}
7788
}
@@ -85,18 +96,25 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
8596
// Get the target pod details for the chaos execution
8697
// if the target pod is not defined it will derive the random target pod list using pod affected percentage
8798
if experimentsDetails.TargetPods == "" && chaosDetails.AppDetail == nil {
88-
return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "provide one of the appLabel or TARGET_PODS"}
99+
span.SetStatus(codes.Error, "provide one of the appLabel or TARGET_PODS")
100+
err := cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "provide one of the appLabel or TARGET_PODS"}
101+
span.RecordError(err)
102+
return err
89103
}
90104

91105
targetPodList, err := common.GetTargetPods(experimentsDetails.NodeLabel, experimentsDetails.TargetPods, experimentsDetails.PodsAffectedPerc, clients, chaosDetails)
92106
if err != nil {
107+
span.SetStatus(codes.Error, "could not get target pods")
108+
span.RecordError(err)
93109
return stacktrace.Propagate(err, "could not get target pods")
94110
}
95111

96112
// deriving the parent name of the target resources
97113
for _, pod := range targetPodList.Items {
98114
kind, parentName, err := workloads.GetPodOwnerTypeAndName(&pod, clients.DynamicClient)
99115
if err != nil {
116+
span.SetStatus(codes.Error, "could not get pod owner name and kind")
117+
span.RecordError(err)
100118
return stacktrace.Propagate(err, "could not get pod owner name and kind")
101119
}
102120
common.SetParentName(parentName, kind, pod.Namespace, chaosDetails)
@@ -123,12 +141,16 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
123141
err = clients.KubeClient.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, v1.DeleteOptions{})
124142
}
125143
if err != nil {
144+
span.SetStatus(codes.Error, "could not delete the target pod")
145+
span.RecordError(err)
126146
return cerrors.Error{ErrorCode: cerrors.ErrorTypeChaosInject, Target: fmt.Sprintf("{podName: %s, namespace: %s}", pod.Name, pod.Namespace), Reason: fmt.Sprintf("failed to delete the target pod: %s", err.Error())}
127147
}
128148

129149
switch chaosDetails.Randomness {
130150
case true:
131151
if err := common.RandomInterval(experimentsDetails.ChaosInterval); err != nil {
152+
span.SetStatus(codes.Error, "could not get random chaos interval")
153+
span.RecordError(err)
132154
return stacktrace.Propagate(err, "could not get random chaos interval")
133155
}
134156
default:
@@ -149,6 +171,8 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
149171
Namespace: parent.Namespace,
150172
}
151173
if err = status.CheckUnTerminatedPodStatusesByWorkloadName(target, experimentsDetails.Timeout, experimentsDetails.Delay, clients); err != nil {
174+
span.SetStatus(codes.Error, "could not check pod statuses by workload names")
175+
span.RecordError(err)
152176
return stacktrace.Propagate(err, "could not check pod statuses by workload names")
153177
}
154178
}
@@ -184,17 +208,24 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
184208
// Get the target pod details for the chaos execution
185209
// if the target pod is not defined it will derive the random target pod list using pod affected percentage
186210
if experimentsDetails.TargetPods == "" && chaosDetails.AppDetail == nil {
187-
return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "please provide one of the appLabel or TARGET_PODS"}
211+
span.SetStatus(codes.Error, "please provide one of the appLabel or TARGET_PODS")
212+
err := cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "please provide one of the appLabel or TARGET_PODS"}
213+
span.RecordError(err)
214+
return err
188215
}
189216
targetPodList, err := common.GetTargetPods(experimentsDetails.NodeLabel, experimentsDetails.TargetPods, experimentsDetails.PodsAffectedPerc, clients, chaosDetails)
190217
if err != nil {
218+
span.SetStatus(codes.Error, "could not get target pods")
219+
span.RecordError(err)
191220
return stacktrace.Propagate(err, "could not get target pods")
192221
}
193222

194223
// deriving the parent name of the target resources
195224
for _, pod := range targetPodList.Items {
196225
kind, parentName, err := workloads.GetPodOwnerTypeAndName(&pod, clients.DynamicClient)
197226
if err != nil {
227+
span.SetStatus(codes.Error, "could not get pod owner name and kind")
228+
span.RecordError(err)
198229
return stacktrace.Propagate(err, "could not get pod owner name and kind")
199230
}
200231
common.SetParentName(parentName, kind, pod.Namespace, chaosDetails)
@@ -221,13 +252,16 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
221252
err = clients.KubeClient.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, v1.DeleteOptions{})
222253
}
223254
if err != nil {
255+
span.SetStatus(codes.Error, "could not delete the target pod")
256+
span.RecordError(err)
224257
return cerrors.Error{ErrorCode: cerrors.ErrorTypeChaosInject, Target: fmt.Sprintf("{podName: %s, namespace: %s}", pod.Name, pod.Namespace), Reason: fmt.Sprintf("failed to delete the target pod: %s", err.Error())}
225258
}
226259
}
227260

228261
switch chaosDetails.Randomness {
229262
case true:
230263
if err := common.RandomInterval(experimentsDetails.ChaosInterval); err != nil {
264+
span.SetStatus(codes.Error, "could not get random chaos interval")
231265
return stacktrace.Propagate(err, "could not get random chaos interval")
232266
}
233267
default:
@@ -248,6 +282,8 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
248282
Namespace: parent.Namespace,
249283
}
250284
if err = status.CheckUnTerminatedPodStatusesByWorkloadName(target, experimentsDetails.Timeout, experimentsDetails.Delay, clients); err != nil {
285+
span.SetStatus(codes.Error, "could not check pod statuses by workload names")
286+
span.RecordError(err)
251287
return stacktrace.Propagate(err, "could not check pod statuses by workload names")
252288
}
253289
}

experiments/generic/pod-delete/experiment/pod-delete.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,14 @@ import (
1717
"github.com/litmuschaos/litmus-go/pkg/types"
1818
"github.com/litmuschaos/litmus-go/pkg/utils/common"
1919
"github.com/sirupsen/logrus"
20+
"go.opentelemetry.io/otel/codes"
21+
"go.opentelemetry.io/otel/trace"
2022
)
2123

2224
// PodDelete inject the pod-delete chaos
2325
func PodDelete(ctx context.Context, clients clients.ClientSets) {
26+
span := trace.SpanFromContext(ctx)
27+
2428
experimentsDetails := experimentTypes.ExperimentDetails{}
2529
resultDetails := types.ResultDetails{}
2630
eventsDetails := types.EventDetails{}
@@ -40,6 +44,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
4044
// Get values from chaosengine. Bail out upon error, as we haven't entered exp business logic yet
4145
if err := types.GetValuesFromChaosEngine(&chaosDetails, clients, &resultDetails); err != nil {
4246
log.Errorf("Unable to initialize the probes, err: %v", err)
47+
span.SetStatus(codes.Error, "Unable to initialize the probes")
48+
span.RecordError(err)
4349
return
4450
}
4551
}
@@ -49,13 +55,17 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
4955
if err := result.ChaosResult(&chaosDetails, clients, &resultDetails, "SOT"); err != nil {
5056
log.Errorf("Unable to create the chaosresult, err: %v", err)
5157
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
58+
span.SetStatus(codes.Error, "Unable to create the chaosresult")
59+
span.RecordError(err)
5260
return
5361
}
5462

5563
// Set the chaos result uid
5664
if err := result.SetResultUID(&resultDetails, clients, &chaosDetails); err != nil {
5765
log.Errorf("Unable to set the result uid, err: %v", err)
5866
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
67+
span.SetStatus(codes.Error, "Unable to set the result uid")
68+
span.RecordError(err)
5969
return
6070
}
6171

@@ -85,6 +95,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
8595
log.Errorf("failed to create %v event inside chaosengine", types.PreChaosCheck)
8696
}
8797
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
98+
span.SetStatus(codes.Error, "Application status check failed")
99+
span.RecordError(err)
88100
return
89101
}
90102
}
@@ -104,6 +116,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
104116
log.Errorf("failed to create %v event inside chaosengine", types.PreChaosCheck)
105117
}
106118
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
119+
span.SetStatus(codes.Error, "Probe Failed")
120+
span.RecordError(err)
107121
return
108122
}
109123
msg = common.GetStatusMessage(chaosDetails.DefaultHealthCheck, "AUT: Running", "Successful")
@@ -117,6 +131,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
117131
if err := litmusLIB.PreparePodDelete(ctx, &experimentsDetails, clients, &resultDetails, &eventsDetails, &chaosDetails); err != nil {
118132
log.Errorf("Chaos injection failed, err: %v", err)
119133
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
134+
span.SetStatus(codes.Error, "Chaos injection failed")
135+
span.RecordError(err)
120136
return
121137
}
122138

@@ -132,6 +148,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
132148
types.SetEngineEventAttributes(&eventsDetails, types.PostChaosCheck, "AUT: Not Running", "Warning", &chaosDetails)
133149
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
134150
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
151+
span.SetStatus(codes.Error, "Application status check failed")
152+
span.RecordError(err)
135153
return
136154
}
137155
}
@@ -150,6 +168,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
150168
log.Errorf("failed to create %v event inside chaosengine", types.PostChaosCheck)
151169
}
152170
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
171+
span.SetStatus(codes.Error, "Probes Failed")
172+
span.RecordError(err)
153173
return
154174
}
155175
msg = common.GetStatusMessage(chaosDetails.DefaultHealthCheck, "AUT: Running", "Successful")
@@ -165,6 +185,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
165185
if err := result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT"); err != nil {
166186
log.Errorf("Unable to update the chaosresult, err: %v", err)
167187
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
188+
span.SetStatus(codes.Error, "Unable to update the chaosresult")
189+
span.RecordError(err)
168190
return
169191
}
170192

pkg/probe/probe.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/palantir/stacktrace"
1919
"github.com/sirupsen/logrus"
2020
"go.opentelemetry.io/otel"
21+
"go.opentelemetry.io/otel/codes"
2122
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2223
)
2324

@@ -32,6 +33,8 @@ func RunProbes(ctx context.Context, chaosDetails *types.ChaosDetails, clients cl
3233
// get the probes details from the chaosengine
3334
probes, err := getProbesFromChaosEngine(chaosDetails, clients)
3435
if err != nil {
36+
span.SetStatus(codes.Error, "getProbesFromChaosEngine failed")
37+
span.RecordError(err)
3538
return err
3639
}
3740

@@ -42,6 +45,8 @@ func RunProbes(ctx context.Context, chaosDetails *types.ChaosDetails, clients cl
4245
switch strings.ToLower(probe.Mode) {
4346
case "sot", "edge", "continuous":
4447
if err := execute(probe, chaosDetails, clients, resultDetails, phase); err != nil {
48+
span.SetStatus(codes.Error, fmt.Sprintf("%s mode %s probe execute failed", probe.Mode, probe.Name))
49+
span.RecordError(err)
4550
return err
4651
}
4752
}
@@ -51,6 +56,8 @@ func RunProbes(ctx context.Context, chaosDetails *types.ChaosDetails, clients cl
5156
for _, probe := range probes {
5257
if strings.ToLower(probe.Mode) == "onchaos" {
5358
if err := execute(probe, chaosDetails, clients, resultDetails, phase); err != nil {
59+
span.SetStatus(codes.Error, fmt.Sprintf("%s mode %s probe execute failed", probe.Mode, probe.Name))
60+
span.RecordError(err)
5461
return err
5562
}
5663
}
@@ -72,13 +79,19 @@ func RunProbes(ctx context.Context, chaosDetails *types.ChaosDetails, clients cl
7279
}
7380
}
7481
if len(probeError) != 0 {
75-
return cerrors.PreserveError{ErrString: fmt.Sprintf("[%s]", strings.Join(probeError, ","))}
82+
errString := fmt.Sprintf("[%s]", strings.Join(probeError, ","))
83+
span.SetStatus(codes.Error, errString)
84+
err := cerrors.PreserveError{ErrString: errString}
85+
span.RecordError(err)
86+
return err
7687
}
7788
// executes the eot and edge modes
7889
for _, probe := range probes {
7990
switch strings.ToLower(probe.Mode) {
8091
case "eot", "edge":
8192
if err := execute(probe, chaosDetails, clients, resultDetails, phase); err != nil {
93+
span.SetStatus(codes.Error, fmt.Sprintf("%s mode %s probe execute failed", probe.Mode, probe.Name))
94+
span.RecordError(err)
8295
return err
8396
}
8497
}

0 commit comments

Comments
 (0)