Commit 9769b8b

feat: distributed tracing span error

Signed-off-by: Jaeyeon Park <[email protected]>

1 parent caae228

98 files changed: +2274 / -110 lines

(Large commits have some content hidden by default; only a subset of the 98 changed files is reproduced below.)

bin/experiment/experiment.go (5 additions, 0 deletions)

@@ -4,6 +4,7 @@ import (
     "context"
     "errors"
     "flag"
+    "fmt"
     "os"
 
     // Uncomment to load all auth plugins
@@ -68,6 +69,7 @@ import (
     "github.com/litmuschaos/litmus-go/pkg/telemetry"
     "github.com/sirupsen/logrus"
     "go.opentelemetry.io/otel"
+    "go.opentelemetry.io/otel/codes"
 )
 
 func init() {
@@ -106,6 +108,8 @@ func main() {
     //Getting kubeConfig and Generate ClientSets
     if err := clients.GenerateClientSetFromKubeConfig(); err != nil {
         log.Errorf("Unable to Get the kubeconfig, err: %v", err)
+        span.SetStatus(codes.Error, "Unable to Get the kubeconfig")
+        span.RecordError(err)
         return
     }
 
@@ -211,6 +215,7 @@ func main() {
         k6Loadgen.Experiment(ctx, clients)
     default:
         log.Errorf("Unsupported -name %v, please provide the correct value of -name args", *experimentName)
+        span.SetStatus(codes.Error, fmt.Sprintf("Unsupported -name %v", *experimentName))
         return
     }
 }
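The span used in main() is not defined in these hunks; presumably it is created earlier in the function from the already-imported otel tracer (for example via otel.Tracer(...).Start(ctx, ...)). For illustration only, a hypothetical helper that recovers the active span from a context; the package and function names below are not from the repository:

package tracingutil // hypothetical package, for illustration only

import (
    "context"

    "go.opentelemetry.io/otel/trace"
)

// ActiveSpan returns the span carried by ctx. If no span has been started,
// trace.SpanFromContext returns a no-op span, so SetStatus and RecordError
// calls on the result are always safe.
func ActiveSpan(ctx context.Context) trace.Span {
    return trace.SpanFromContext(ctx)
}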

chaoslib/litmus/aws-ssm-chaos/lib/ssm-chaos.go (17 additions, 0 deletions)

@@ -17,6 +17,7 @@ import (
     "github.com/litmuschaos/litmus-go/pkg/utils/common"
     "github.com/palantir/stacktrace"
     "go.opentelemetry.io/otel"
+    "go.opentelemetry.io/otel/codes"
 )
 
 // InjectChaosInSerialMode will inject the aws ssm chaos in serial mode that is one after other
@@ -51,6 +52,8 @@ func InjectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
         ec2IDList := strings.Fields(ec2ID)
         commandId, err := ssm.SendSSMCommand(experimentsDetails, ec2IDList)
         if err != nil {
+            span.SetStatus(codes.Error, "failed to send ssm command")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "failed to send ssm command")
         }
         //prepare commands for abort recovery
@@ -59,20 +62,26 @@
         //wait for the ssm command to get in running state
         log.Info("[Wait]: Waiting for the ssm command to get in InProgress state")
         if err := ssm.WaitForCommandStatus("InProgress", commandId, ec2ID, experimentsDetails.Region, experimentsDetails.ChaosDuration+experimentsDetails.Timeout, experimentsDetails.Delay); err != nil {
+            span.SetStatus(codes.Error, "failed to start ssm command")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "failed to start ssm command")
         }
         common.SetTargets(ec2ID, "injected", "EC2", chaosDetails)
 
         // run the probes during chaos
         if len(resultDetails.ProbeDetails) != 0 && i == 0 {
             if err = probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
+                span.SetStatus(codes.Error, "failed to run probes")
+                span.RecordError(err)
                 return stacktrace.Propagate(err, "failed to run probes")
             }
         }
 
         //wait for the ssm command to get succeeded in the given chaos duration
         log.Info("[Wait]: Waiting for the ssm command to get completed")
         if err := ssm.WaitForCommandStatus("Success", commandId, ec2ID, experimentsDetails.Region, experimentsDetails.ChaosDuration+experimentsDetails.Timeout, experimentsDetails.Delay); err != nil {
+            span.SetStatus(codes.Error, "failed to send ssm command")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "failed to send ssm command")
         }
         common.SetTargets(ec2ID, "reverted", "EC2", chaosDetails)
@@ -117,6 +126,8 @@ func InjectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
     log.Info("[Chaos]: Starting the ssm command")
     commandId, err := ssm.SendSSMCommand(experimentsDetails, instanceIDList)
     if err != nil {
+        span.SetStatus(codes.Error, "failed to send ssm command")
+        span.RecordError(err)
         return stacktrace.Propagate(err, "failed to send ssm command")
     }
     //prepare commands for abort recovery
@@ -126,13 +137,17 @@ func InjectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
         //wait for the ssm command to get in running state
         log.Info("[Wait]: Waiting for the ssm command to get in InProgress state")
         if err := ssm.WaitForCommandStatus("InProgress", commandId, ec2ID, experimentsDetails.Region, experimentsDetails.ChaosDuration+experimentsDetails.Timeout, experimentsDetails.Delay); err != nil {
+            span.SetStatus(codes.Error, "failed to start ssm command")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "failed to start ssm command")
         }
     }
 
     // run the probes during chaos
     if len(resultDetails.ProbeDetails) != 0 {
         if err = probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
+            span.SetStatus(codes.Error, "failed to run probes")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "failed to run probes")
         }
     }
@@ -141,6 +156,8 @@ func InjectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
     //wait for the ssm command to get succeeded in the given chaos duration
     log.Info("[Wait]: Waiting for the ssm command to get completed")
     if err := ssm.WaitForCommandStatus("Success", commandId, ec2ID, experimentsDetails.Region, experimentsDetails.ChaosDuration+experimentsDetails.Timeout, experimentsDetails.Delay); err != nil {
+        span.SetStatus(codes.Error, "failed to send ssm command")
+        span.RecordError(err)
         return stacktrace.Propagate(err, "failed to send ssm command")
     }
 }

chaoslib/litmus/aws-ssm-chaos/lib/ssm/aws-ssm-chaos-by-id.go (17 additions, 2 deletions)

@@ -19,6 +19,7 @@ import (
     "github.com/litmuschaos/litmus-go/pkg/utils/common"
     "github.com/palantir/stacktrace"
     "go.opentelemetry.io/otel"
+    "go.opentelemetry.io/otel/codes"
 )
 
 var (
@@ -49,6 +50,8 @@ func PrepareAWSSSMChaosByID(ctx context.Context, experimentsDetails *experimentT
 
     //create and upload the ssm document on the given aws service monitoring docs
     if err = ssm.CreateAndUploadDocument(experimentsDetails.DocumentName, experimentsDetails.DocumentType, experimentsDetails.DocumentFormat, experimentsDetails.DocumentPath, experimentsDetails.Region); err != nil {
+        span.SetStatus(codes.Error, "could not create and upload the ssm document")
+        span.RecordError(err)
         return stacktrace.Propagate(err, "could not create and upload the ssm document")
     }
     experimentsDetails.IsDocsUploaded = true
@@ -60,25 +63,37 @@
     //get the instance id or list of instance ids
     instanceIDList := strings.Split(experimentsDetails.EC2InstanceID, ",")
     if experimentsDetails.EC2InstanceID == "" || len(instanceIDList) == 0 {
-        return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "no instance id found for chaos injection"}
+        span.SetStatus(codes.Error, "no instance id found for chaos injection")
+        err := cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "no instance id found for chaos injection"}
+        span.RecordError(err)
+        return err
     }
 
     switch strings.ToLower(experimentsDetails.Sequence) {
     case "serial":
         if err = lib.InjectChaosInSerialMode(ctx, experimentsDetails, instanceIDList, clients, resultDetails, eventsDetails, chaosDetails, inject); err != nil {
+            span.SetStatus(codes.Error, "could not run chaos in serial mode")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "could not run chaos in serial mode")
         }
     case "parallel":
         if err = lib.InjectChaosInParallelMode(ctx, experimentsDetails, instanceIDList, clients, resultDetails, eventsDetails, chaosDetails, inject); err != nil {
+            span.SetStatus(codes.Error, "could not run chaos in parallel mode")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "could not run chaos in parallel mode")
         }
     default:
-        return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: fmt.Sprintf("'%s' sequence is not supported", experimentsDetails.Sequence)}
+        span.SetStatus(codes.Error, "sequence is not supported")
+        err := cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: fmt.Sprintf("'%s' sequence is not supported", experimentsDetails.Sequence)}
+        span.RecordError(err)
+        return err
     }
 
     //Delete the ssm document on the given aws service monitoring docs
     err = ssm.SSMDeleteDocument(experimentsDetails.DocumentName, experimentsDetails.Region)
     if err != nil {
+        span.SetStatus(codes.Error, "failed to delete ssm doc")
+        span.RecordError(err)
         return stacktrace.Propagate(err, "failed to delete ssm doc")
     }
 

chaoslib/litmus/aws-ssm-chaos/lib/ssm/aws-ssm-chaos-by-tag.go (17 additions, 2 deletions)

@@ -19,6 +19,7 @@ import (
     "github.com/litmuschaos/litmus-go/pkg/utils/common"
     "github.com/palantir/stacktrace"
     "go.opentelemetry.io/otel"
+    "go.opentelemetry.io/otel/codes"
 )
 
 // PrepareAWSSSMChaosByTag contains the prepration and injection steps for the experiment
@@ -44,6 +45,8 @@ func PrepareAWSSSMChaosByTag(ctx context.Context, experimentsDetails *experiment
 
     //create and upload the ssm document on the given aws service monitoring docs
     if err = ssm.CreateAndUploadDocument(experimentsDetails.DocumentName, experimentsDetails.DocumentType, experimentsDetails.DocumentFormat, experimentsDetails.DocumentPath, experimentsDetails.Region); err != nil {
+        span.SetStatus(codes.Error, "could not create and upload the ssm document")
+        span.RecordError(err)
         return stacktrace.Propagate(err, "could not create and upload the ssm document")
     }
     experimentsDetails.IsDocsUploaded = true
@@ -55,25 +58,37 @@
     log.Infof("[Chaos]:Number of Instance targeted: %v", len(instanceIDList))
 
     if len(instanceIDList) == 0 {
-        return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "no instance id found for chaos injection"}
+        span.SetStatus(codes.Error, "no instance id found for chaos injection")
+        err := cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "no instance id found for chaos injection"}
+        span.RecordError(err)
+        return err
     }
 
     switch strings.ToLower(experimentsDetails.Sequence) {
     case "serial":
         if err = lib.InjectChaosInSerialMode(ctx, experimentsDetails, instanceIDList, clients, resultDetails, eventsDetails, chaosDetails, inject); err != nil {
+            span.SetStatus(codes.Error, "could not run chaos in serial mode")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "could not run chaos in serial mode")
         }
     case "parallel":
         if err = lib.InjectChaosInParallelMode(ctx, experimentsDetails, instanceIDList, clients, resultDetails, eventsDetails, chaosDetails, inject); err != nil {
+            span.SetStatus(codes.Error, "could not run chaos in parallel mode")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "could not run chaos in parallel mode")
         }
     default:
-        return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: fmt.Sprintf("'%s' sequence is not supported", experimentsDetails.Sequence)}
+        span.SetStatus(codes.Error, "sequence is not supported")
+        err := cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: fmt.Sprintf("'%s' sequence is not supported", experimentsDetails.Sequence)}
+        span.RecordError(err)
+        return err
     }
 
     //Delete the ssm document on the given aws service monitoring docs
     err = ssm.SSMDeleteDocument(experimentsDetails.DocumentName, experimentsDetails.Region)
     if err != nil {
+        span.SetStatus(codes.Error, "failed to delete ssm doc")
+        span.RecordError(err)
         return stacktrace.Propagate(err, "failed to delete ssm doc")
     }
 

chaoslib/litmus/azure-disk-loss/lib/azure-disk-loss.go (37 additions, 2 deletions)

@@ -24,6 +24,7 @@ import (
     "github.com/litmuschaos/litmus-go/pkg/utils/retry"
     "github.com/palantir/stacktrace"
     "go.opentelemetry.io/otel"
+    "go.opentelemetry.io/otel/codes"
 )
 
 var (
@@ -55,11 +56,16 @@ func PrepareChaos(ctx context.Context, experimentsDetails *experimentTypes.Exper
     //get the disk name or list of disk names
     diskNameList := strings.Split(experimentsDetails.VirtualDiskNames, ",")
     if experimentsDetails.VirtualDiskNames == "" || len(diskNameList) == 0 {
-        return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "no volume names found to detach"}
+        span.SetStatus(codes.Error, "no volume names found to detach")
+        err := cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "no volume names found to detach"}
+        span.RecordError(err)
+        return err
     }
     instanceNamesWithDiskNames, err := diskStatus.GetInstanceNameForDisks(diskNameList, experimentsDetails.SubscriptionID, experimentsDetails.ResourceGroup)
 
     if err != nil {
+        span.SetStatus(codes.Error, "error fetching attached instances for disks")
+        span.RecordError(err)
         return stacktrace.Propagate(err, "error fetching attached instances for disks")
     }
 
@@ -69,6 +75,8 @@
     for instanceName := range instanceNamesWithDiskNames {
         attachedDisksWithInstance[instanceName], err = diskStatus.GetInstanceDiskList(experimentsDetails.SubscriptionID, experimentsDetails.ResourceGroup, experimentsDetails.ScaleSet, instanceName)
         if err != nil {
+            span.SetStatus(codes.Error, "error fetching virtual disks")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "error fetching virtual disks")
         }
     }
@@ -85,14 +93,21 @@
     switch strings.ToLower(experimentsDetails.Sequence) {
     case "serial":
         if err = injectChaosInSerialMode(ctx, experimentsDetails, instanceNamesWithDiskNames, attachedDisksWithInstance, clients, resultDetails, eventsDetails, chaosDetails); err != nil {
+            span.SetStatus(codes.Error, "could not run chaos in serial mode")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "could not run chaos in serial mode")
         }
     case "parallel":
         if err = injectChaosInParallelMode(ctx, experimentsDetails, instanceNamesWithDiskNames, attachedDisksWithInstance, clients, resultDetails, eventsDetails, chaosDetails); err != nil {
+            span.SetStatus(codes.Error, "could not run chaos in parallel mode")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "could not run chaos in parallel mode")
         }
     default:
-        return cerrors.Error{ErrorCode: cerrors.ErrorTypeGeneric, Reason: fmt.Sprintf("'%s' sequence is not supported", experimentsDetails.Sequence)}
+        span.SetStatus(codes.Error, "sequence is not supported")
+        err := cerrors.Error{ErrorCode: cerrors.ErrorTypeGeneric, Reason: fmt.Sprintf("'%s' sequence is not supported", experimentsDetails.Sequence)}
+        span.RecordError(err)
+        return err
     }
 
     //Waiting for the ramp time after chaos injection
@@ -125,6 +140,8 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
     log.Info("[Chaos]: Detaching the virtual disks from the instances")
     for instanceName, diskNameList := range instanceNamesWithDiskNames {
         if err = diskStatus.DetachDisks(experimentsDetails.SubscriptionID, experimentsDetails.ResourceGroup, instanceName, experimentsDetails.ScaleSet, diskNameList); err != nil {
+            span.SetStatus(codes.Error, "failed to detach disks")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "failed to detach disks")
         }
     }
@@ -133,6 +150,8 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
         for _, diskName := range diskNameList {
             log.Infof("[Wait]: Waiting for Disk '%v' to detach", diskName)
             if err := diskStatus.WaitForDiskToDetach(experimentsDetails, diskName); err != nil {
+                span.SetStatus(codes.Error, "disk detachment check failed")
+                span.RecordError(err)
                 return stacktrace.Propagate(err, "disk detachment check failed")
             }
         }
@@ -147,6 +166,8 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
     // run the probes during chaos
     if len(resultDetails.ProbeDetails) != 0 {
         if err := probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
+            span.SetStatus(codes.Error, "failed to run probes")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "failed to run probes")
         }
     }
@@ -159,6 +180,8 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
     log.Info("[Chaos]: Attaching the Virtual disks back to the instances")
     for instanceName, diskNameList := range attachedDisksWithInstance {
         if err = diskStatus.AttachDisk(experimentsDetails.SubscriptionID, experimentsDetails.ResourceGroup, instanceName, experimentsDetails.ScaleSet, diskNameList); err != nil {
+            span.SetStatus(codes.Error, "virtual disk attachment failed")
+            span.RecordError(err)
             return stacktrace.Propagate(err, "virtual disk attachment failed")
         }
 
@@ -167,6 +190,8 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
         for _, diskName := range diskNameList {
             log.Infof("[Wait]: Waiting for Disk '%v' to attach", diskName)
             if err := diskStatus.WaitForDiskToAttach(experimentsDetails, diskName); err != nil {
+                span.SetStatus(codes.Error, "disk attachment check failed")
+                span.RecordError(err)
                 return stacktrace.Propagate(err, "disk attachment check failed")
             }
         }
@@ -209,12 +234,16 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
             // Detaching the virtual disks
             log.Infof("[Chaos]: Detaching %v from the instance", diskName)
             if err = diskStatus.DetachDisks(experimentsDetails.SubscriptionID, experimentsDetails.ResourceGroup, instanceName, experimentsDetails.ScaleSet, diskNameToList); err != nil {
+                span.SetStatus(codes.Error, "failed to detach disks")
+                span.RecordError(err)
                 return stacktrace.Propagate(err, "failed to detach disks")
             }
 
             // Waiting for disk to be detached
             log.Infof("[Wait]: Waiting for Disk '%v' to detach", diskName)
             if err := diskStatus.WaitForDiskToDetach(experimentsDetails, diskName); err != nil {
+                span.SetStatus(codes.Error, "disk detachment check failed")
+                span.RecordError(err)
                 return stacktrace.Propagate(err, "disk detachment check failed")
             }
 
@@ -224,6 +253,8 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
             // the OnChaos probes execution will start in the first iteration and keep running for the entire chaos duration
             if len(resultDetails.ProbeDetails) != 0 && i == 0 {
                 if err := probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
+                    span.SetStatus(codes.Error, "failed to run probes")
+                    span.RecordError(err)
                     return stacktrace.Propagate(err, "failed to run probes")
                 }
             }
@@ -235,12 +266,16 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
             //Attaching the virtual disks to the instance
             log.Infof("[Chaos]: Attaching %v back to the instance", diskName)
             if err = diskStatus.AttachDisk(experimentsDetails.SubscriptionID, experimentsDetails.ResourceGroup, instanceName, experimentsDetails.ScaleSet, attachedDisksWithInstance[instanceName]); err != nil {
+                span.SetStatus(codes.Error, "disk attachment failed")
+                span.RecordError(err)
                 return stacktrace.Propagate(err, "disk attachment failed")
             }
 
             // Waiting for disk to be attached
             log.Infof("[Wait]: Waiting for Disk '%v' to attach", diskName)
             if err := diskStatus.WaitForDiskToAttach(experimentsDetails, diskName); err != nil {
+                span.SetStatus(codes.Error, "disk attachment check failed")
+                span.RecordError(err)
                 return stacktrace.Propagate(err, "disk attachment check failed")
             }
 