Skip to content

Commit 8153dcc

Browse files
committed
First move to new result return type for investigations
1 parent 3b86544 commit 8153dcc

File tree

6 files changed

+98
-64
lines changed

6 files changed

+98
-64
lines changed

cadctl/cmd/investigate/investigate.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,9 @@ func run(_ *cobra.Command, _ []string) error {
126126
investigationResources := &investigation.Resources{Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient}
127127

128128
logging.Infof("Starting investigation for %s", alertInvestigation.Name)
129-
return alertInvestigation.Run(investigationResources)
129+
result, err := alertInvestigation.Run(investigationResources)
130+
updateMetrics(&result)
131+
return err
130132
}
131133

132134
// GetOCMClient will retrieve the OcmClient from the 'ocm' package
@@ -170,3 +172,15 @@ func clusterRequiresInvestigation(cluster *cmv1.Cluster, pdClient *pagerduty.Sdk
170172
}
171173
return true, nil
172174
}
175+
176+
// updateMetrics translates the steps recorded on an investigation result into
// metric increments. For each of ServiceLogSent, ServiceLogPrepared and
// LimitedSupportSet whose Performed flag is set, it calls metrics.Inc on the
// matching metric with the investigation name as the first label value,
// followed by that step's own Labels.
//
// The label list is built by appending to a fresh one-element slice, so the
// step's Labels slice is never modified. result is dereferenced without a nil
// check; callers must pass a non-nil pointer.
func updateMetrics(result *investigation.InvestigationResult) {
177+
if result.ServiceLogSent.Performed {
178+
metrics.Inc(metrics.ServicelogSent, append([]string{result.InvestigationName}, result.ServiceLogSent.Labels...)...)
179+
}
180+
if result.ServiceLogPrepared.Performed {
181+
metrics.Inc(metrics.ServicelogPrepared, append([]string{result.InvestigationName}, result.ServiceLogPrepared.Labels...)...)
182+
}
183+
if result.LimitedSupportSet.Performed {
184+
metrics.Inc(metrics.LimitedSupportSet, append([]string{result.InvestigationName}, result.LimitedSupportSet.Labels...)...)
185+
}
186+
}

pkg/investigations/chgm/chgm.go

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import (
1010
"github.com/openshift/configuration-anomaly-detection/pkg/aws"
1111
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
1212
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
13-
"github.com/openshift/configuration-anomaly-detection/pkg/metrics"
1413
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
1514
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
1615
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
@@ -40,21 +39,23 @@ var (
4039
)
4140

4241
// Investigate runs the investigation for a triggered chgm pagerduty event
43-
func Investigate(r *investigation.Resources) error {
42+
func Investigate(r *investigation.Resources) (investigation.InvestigationResult, error) {
43+
result := investigation.InvestigationResult{InvestigationName: investigationName}
4444
notes := notewriter.New("CHGM", logging.RawLogger)
4545

4646
// 1. Check if the user stopped instances
4747
res, err := investigateStoppedInstances(r.Cluster, r.ClusterDeployment, r.AwsClient, r.OcmClient)
4848
if err != nil {
49-
return r.PdClient.EscalateIncidentWithNote(fmt.Sprintf("InvestigateInstances failed: %s\n", err.Error()))
49+
return result, r.PdClient.EscalateIncidentWithNote(fmt.Sprintf("InvestigateInstances failed: %s\n", err.Error()))
5050
}
5151
logging.Debugf("the investigation returned: [infras running: %d] - [masters running: %d]", res.RunningInstances.Infra, res.RunningInstances.Master)
5252

5353
if !res.UserAuthorized {
5454
logging.Infof("Instances were stopped by unauthorized user: %s / arn: %s", res.User.UserName, res.User.IssuerUserName)
55-
return utils.WithRetries(func() error {
55+
return result, utils.WithRetries(func() error {
5656
err := postChgmSLAndSilence(r.Cluster.ID(), r.OcmClient, r.PdClient)
57-
metrics.Inc(metrics.ServicelogSent, investigationName)
57+
// XXX: metrics.Inc(metrics.ServicelogSent, investigationName)
58+
result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil}
5859

5960
return err
6061
})
@@ -89,21 +90,23 @@ func Investigate(r *investigation.Resources) error {
8990
if strings.Contains(failureReason, "nosnch.in") {
9091
err := r.OcmClient.PostLimitedSupportReason(&egressLS, r.Cluster.ID())
9192
if err != nil {
92-
return err
93+
return result, err
9394
}
9495

95-
metrics.Inc(metrics.LimitedSupportSet, investigationName, "EgressBlocked")
96+
// XXX: metrics.Inc(metrics.LimitedSupportSet, investigationName, "EgressBlocked")
97+
result.LimitedSupportSet = investigation.InvestigationStep{Performed: true, Labels: []string{"EgressBlocked"}}
9698

9799
notes.AppendAutomation("Egress `nosnch.in` blocked, sent limited support.")
98-
return r.PdClient.SilenceIncidentWithNote(notes.String())
100+
return result, r.PdClient.SilenceIncidentWithNote(notes.String())
99101
}
100102

101103
err := r.OcmClient.PostServiceLog(r.Cluster.ID(), createEgressSL(failureReason))
102104
if err != nil {
103-
return err
105+
return result, err
104106
}
105107

106-
metrics.Inc(metrics.ServicelogSent, investigationName)
108+
// XXX: metrics.Inc(metrics.ServicelogSent, investigationName)
109+
result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil}
107110

108111
notes.AppendWarning("NetworkVerifier found unreachable targets and sent the SL, but deadmanssnitch is not blocked! \n⚠️ Please investigate this cluster.\nUnreachable: \n%s", failureReason)
109112
case networkverifier.Success:
@@ -112,7 +115,7 @@ func Investigate(r *investigation.Resources) error {
112115
}
113116

114117
// Found no issues that CAD can handle by itself - forward notes to SRE.
115-
return r.PdClient.EscalateIncidentWithNote(notes.String())
118+
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
116119
}
117120

118121
// investigateHibernation checks if the cluster was recently woken up from
@@ -440,4 +443,4 @@ func postChgmSLAndSilence(clusterID string, ocmCli ocm.Client, pdCli pagerduty.C
440443
}
441444

442445
return pdCli.SilenceIncidentWithNote("Customer stopped instances. Sent SL and silencing alert.")
443-
}
446+
}

pkg/investigations/chgm/chgm_test.go

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ var _ = Describe("chgm", func() {
107107
r.OcmClient.(*ocmmock.MockClient).EXPECT().PostServiceLog(gomock.Eq(cluster.ID()), gomock.Eq(&chgmSL)).Return(nil)
108108
r.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any())
109109

110-
gotErr := Investigate(r)
110+
_, gotErr := Investigate(r)
111111

112112
Expect(gotErr).NotTo(HaveOccurred())
113113
})
@@ -126,7 +126,7 @@ var _ = Describe("chgm", func() {
126126
r.OcmClient.(*ocmmock.MockClient).EXPECT().PostServiceLog(gomock.Eq(cluster.ID()), gomock.Eq(&chgmSL)).Return(nil)
127127
r.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any())
128128

129-
gotErr := Investigate(r)
129+
_, gotErr := Investigate(r)
130130

131131
Expect(gotErr).NotTo(HaveOccurred())
132132
})
@@ -136,7 +136,7 @@ var _ = Describe("chgm", func() {
136136
r.AwsClient.(*awsmock.MockClient).EXPECT().ListNonRunningInstances(gomock.Any()).Return(nil, fakeErr)
137137
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
138138

139-
gotErr := Investigate(r)
139+
_, gotErr := Investigate(r)
140140

141141
Expect(gotErr).NotTo(HaveOccurred())
142142
})
@@ -151,7 +151,7 @@ var _ = Describe("chgm", func() {
151151
r.AwsClient.(*awsmock.MockClient).EXPECT().GetSubnetID(gomock.Eq(infraID)).Return([]string{"string1", "string2"}, nil)
152152
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
153153
r.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil)
154-
gotErr := Investigate(r)
154+
_, gotErr := Investigate(r)
155155
// Assert
156156
Expect(gotErr).ToNot(HaveOccurred())
157157
})
@@ -165,7 +165,7 @@ var _ = Describe("chgm", func() {
165165

166166
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
167167

168-
gotErr := Investigate(r)
168+
_, gotErr := Investigate(r)
169169
Expect(gotErr).ToNot(HaveOccurred())
170170
})
171171
})
@@ -179,7 +179,7 @@ var _ = Describe("chgm", func() {
179179
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
180180

181181
// Act
182-
gotErr := Investigate(r)
182+
_, gotErr := Investigate(r)
183183
Expect(gotErr).ToNot(HaveOccurred())
184184
})
185185
})
@@ -193,7 +193,7 @@ var _ = Describe("chgm", func() {
193193
r.OcmClient.(*ocmmock.MockClient).EXPECT().PostServiceLog(gomock.Eq(cluster.ID()), gomock.Eq(&chgmSL)).Return(nil)
194194
r.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil)
195195
// Act
196-
gotErr := Investigate(r)
196+
_, gotErr := Investigate(r)
197197
// Assert
198198
Expect(gotErr).NotTo(HaveOccurred())
199199
})
@@ -212,7 +212,7 @@ var _ = Describe("chgm", func() {
212212
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
213213
r.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil)
214214

215-
gotErr := Investigate(r)
215+
_, gotErr := Investigate(r)
216216
Expect(gotErr).NotTo(HaveOccurred())
217217
})
218218
})
@@ -229,7 +229,7 @@ var _ = Describe("chgm", func() {
229229
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
230230
r.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil)
231231

232-
gotErr := Investigate(r)
232+
_, gotErr := Investigate(r)
233233
Expect(gotErr).NotTo(HaveOccurred())
234234
})
235235
})
@@ -247,7 +247,7 @@ var _ = Describe("chgm", func() {
247247
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
248248
r.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Any(), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil)
249249

250-
gotErr := Investigate(r)
250+
_, gotErr := Investigate(r)
251251
Expect(gotErr).NotTo(HaveOccurred())
252252
})
253253
})
@@ -265,7 +265,7 @@ var _ = Describe("chgm", func() {
265265
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
266266
r.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil)
267267

268-
gotErr := Investigate(r)
268+
_, gotErr := Investigate(r)
269269
Expect(gotErr).NotTo(HaveOccurred())
270270
})
271271
})
@@ -282,7 +282,7 @@ var _ = Describe("chgm", func() {
282282
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
283283
r.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil)
284284

285-
gotErr := Investigate(r)
285+
_, gotErr := Investigate(r)
286286
Expect(gotErr).NotTo(HaveOccurred())
287287
})
288288
})
@@ -299,7 +299,7 @@ var _ = Describe("chgm", func() {
299299
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
300300
r.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil)
301301

302-
gotErr := Investigate(r)
302+
_, gotErr := Investigate(r)
303303
Expect(gotErr).NotTo(HaveOccurred())
304304
})
305305
})
@@ -316,7 +316,7 @@ var _ = Describe("chgm", func() {
316316
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
317317
r.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil)
318318

319-
gotErr := Investigate(r)
319+
_, gotErr := Investigate(r)
320320
Expect(gotErr).NotTo(HaveOccurred())
321321
})
322322
})
@@ -333,7 +333,7 @@ var _ = Describe("chgm", func() {
333333
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
334334
r.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil)
335335

336-
gotErr := Investigate(r)
336+
_, gotErr := Investigate(r)
337337
Expect(gotErr).NotTo(HaveOccurred())
338338
})
339339
})
@@ -350,7 +350,7 @@ var _ = Describe("chgm", func() {
350350
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
351351
r.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil)
352352

353-
gotErr := Investigate(r)
353+
_, gotErr := Investigate(r)
354354
Expect(gotErr).NotTo(HaveOccurred())
355355
})
356356
})
@@ -367,7 +367,7 @@ var _ = Describe("chgm", func() {
367367
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
368368
r.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil)
369369

370-
gotErr := Investigate(r)
370+
_, gotErr := Investigate(r)
371371
Expect(gotErr).NotTo(HaveOccurred())
372372
})
373373
})
@@ -381,7 +381,7 @@ var _ = Describe("chgm", func() {
381381
r.OcmClient.(*ocmmock.MockClient).EXPECT().PostServiceLog(gomock.Eq(cluster.ID()), gomock.Eq(&chgmSL)).Return(nil)
382382
r.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil)
383383

384-
gotErr := Investigate(r)
384+
_, gotErr := Investigate(r)
385385
Expect(gotErr).NotTo(HaveOccurred())
386386
})
387387
})
@@ -395,7 +395,7 @@ var _ = Describe("chgm", func() {
395395
r.OcmClient.(*ocmmock.MockClient).EXPECT().PostServiceLog(gomock.Eq(cluster.ID()), gomock.Eq(&chgmSL)).Return(nil)
396396
r.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil)
397397

398-
gotErr := Investigate(r)
398+
_, gotErr := Investigate(r)
399399
Expect(gotErr).NotTo(HaveOccurred())
400400
})
401401
})
@@ -409,7 +409,7 @@ var _ = Describe("chgm", func() {
409409
r.OcmClient.(*ocmmock.MockClient).EXPECT().PostServiceLog(gomock.Eq(cluster.ID()), gomock.Eq(&chgmSL)).Return(nil)
410410
r.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil)
411411

412-
gotErr := Investigate(r)
412+
_, gotErr := Investigate(r)
413413
Expect(gotErr).NotTo(HaveOccurred())
414414
})
415415
})
@@ -422,7 +422,7 @@ var _ = Describe("chgm", func() {
422422
r.OcmClient.(*ocmmock.MockClient).EXPECT().PostServiceLog(gomock.Eq(cluster.ID()), gomock.Eq(&chgmSL)).Return(nil)
423423
r.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil)
424424

425-
gotErr := Investigate(r)
425+
_, gotErr := Investigate(r)
426426
Expect(gotErr).NotTo(HaveOccurred())
427427
})
428428
})
@@ -437,7 +437,7 @@ var _ = Describe("chgm", func() {
437437
r.OcmClient.(*ocmmock.MockClient).EXPECT().PostServiceLog(gomock.Eq(cluster.ID()), gomock.Eq(&chgmSL)).Return(nil)
438438
r.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil)
439439

440-
gotErr := Investigate(r)
440+
_, gotErr := Investigate(r)
441441
Expect(gotErr).NotTo(HaveOccurred())
442442
})
443443
})
@@ -452,7 +452,7 @@ var _ = Describe("chgm", func() {
452452
r.OcmClient.(*ocmmock.MockClient).EXPECT().PostServiceLog(gomock.Eq(cluster.ID()), gomock.Eq(&chgmSL)).Return(nil)
453453
r.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil)
454454

455-
gotErr := Investigate(r)
455+
_, gotErr := Investigate(r)
456456
Expect(gotErr).NotTo(HaveOccurred())
457457
})
458458
})
@@ -467,7 +467,7 @@ var _ = Describe("chgm", func() {
467467
r.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil)
468468
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
469469

470-
gotErr := Investigate(r)
470+
_, gotErr := Investigate(r)
471471
Expect(gotErr).NotTo(HaveOccurred())
472472
})
473473
})
@@ -482,7 +482,7 @@ var _ = Describe("chgm", func() {
482482
r.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{}, nil)
483483
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
484484

485-
gotErr := Investigate(r)
485+
_, gotErr := Investigate(r)
486486
Expect(gotErr).NotTo(HaveOccurred())
487487
})
488488
})
@@ -497,7 +497,7 @@ var _ = Describe("chgm", func() {
497497
r.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil)
498498
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
499499

500-
gotErr := Investigate(r)
500+
_, gotErr := Investigate(r)
501501
Expect(gotErr).NotTo(HaveOccurred())
502502
})
503503
})
@@ -512,7 +512,7 @@ var _ = Describe("chgm", func() {
512512
r.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil)
513513
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
514514

515-
gotErr := Investigate(r)
515+
_, gotErr := Investigate(r)
516516
Expect(gotErr).NotTo(HaveOccurred())
517517
})
518518
})
@@ -527,9 +527,9 @@ var _ = Describe("chgm", func() {
527527
r.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil)
528528
r.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil)
529529

530-
gotErr := Investigate(r)
530+
_, gotErr := Investigate(r)
531531
Expect(gotErr).NotTo(HaveOccurred())
532532
})
533533
})
534534
})
535-
})
535+
})

0 commit comments

Comments
 (0)