Skip to content

Commit 48ce0bd

Browse files
committed
Allow user to select whether they want the watchdog CloudWatch alert to be sent via a Lambda function.
1 parent 6426420 commit 48ce0bd

File tree

2 files changed

+89
-50
lines changed

2 files changed

+89
-50
lines changed

Monitoring/monitor-ontap-services/cloudformation.yaml

Lines changed: 85 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,17 @@ Metadata:
1818
- secretPasswordKey
1919
- checkInterval
2020
- createWatchdogAlarm
21+
- implementWatchdogAsLambda
22+
- watchdogRoleArn
23+
- LambdaRoleArn
24+
- SchedulerRoleArn
2125
- createSecretsManagerEndpoint
2226
- createSNSEndpoint
2327
- createCloudWatchLogsEndpoint
2428
- createS3Endpoint
2529
- routeTableIds
2630
- vpcId
2731
- endpointSecurityGroupIds
28-
- LambdaRoleArn
29-
- SchedulerRoleArn
30-
- watchdogRoleArn
3132
- Label:
3233
default: "Alert Parameters"
3334
Parameters:
@@ -94,13 +95,19 @@ Parameters:
9495
Default: "password"
9596

9697
createWatchdogAlarm:
97-
Description: "Create a CloudWatch alarm to monitor the Lambda function. It will alert you if the function fails to run successfully."
98+
Description: "Create a CloudWatch alarm to monitor the Lambda function. It will alert you if the monitoring Lambda function fails to run successfully."
9899
Type: String
99100
Default: "true"
100101
AllowedValues: ["true", "false"]
101102

103+
implementWatchdogAsLambda:
104+
Description: "Use a Lambda function to publish to the SNS topic so it can reside in a different region. Only needed if you are creating the CloudWatch alarm and the SNS topic is in a different region."
105+
Type: String
106+
Default: "false"
107+
AllowedValues: ["true", "false"]
108+
102109
watchdogRoleArn:
103-
Description: "The ARN of the role to use for the Lambda function that will publish messages to the SNS topic if the monitoring function doesn't run properly. This is only needed if you are having the CloudWatch alarm created and if you want to provide an existing role, otherwise an appropriate one will be created for you."
110+
Description: "The ARN of the role to assign to the Lambda function that will publish messages to the SNS topic if the monitoring function doesn't run properly. This is only needed if you are having the CloudWatch alarm created, implemented as a Lambda function and you want to provide an existing role, otherwise, if needed, an appropriate role will be created for you."
104111
Type: String
105112
Default: ""
106113

@@ -192,7 +199,7 @@ Parameters:
192199
Description: "Alert when a SnapMirror update hasn't transferred any new data in the specified seconds. Set to 0 to disable this alert."
193200
Type: Number
194201
Default: 600
195-
MinValue: 60
202+
MinValue: 0
196203

197204
snapMirrorHealthAlert:
198205
Description: "Alert when the SnapMirror relationship is not healthy."
@@ -201,34 +208,40 @@ Parameters:
201208
AllowedValues: ["true", "false"]
202209

203210
fileSystemUtilizationWarnAlert:
204-
Description: "Alert when the file system utilization exceeds this threshold in percentage."
211+
Description: "Alert when the file system utilization exceeds this threshold in percentage. Set to 0 to disable this alert."
205212
Type: Number
206213
Default: 80
214+
MinValue: 0
207215

208216
fileSystemUtilizationCriticalAlert:
209-
Description: "Alert when the file system utilization exceeds this threshold in percentage."
217+
Description: "Alert when the file system utilization exceeds this threshold in percentage. Set to 0 to disable this alert."
210218
Type: Number
211219
Default: 90
220+
MinValue: 0
212221

213222
volumeUtilizationWarnAlert:
214-
Description: "Alert when a volume utilization exceeds this threshold in percentage."
223+
Description: "Alert when a volume utilization exceeds this threshold in percentage. Set to 0 to disable this alert."
215224
Type: Number
216225
Default: 90
226+
MinValue: 0
217227

218228
volumeUtilizationCriticalAlert:
219-
Description: "Alert when a volume utilization exceeds this threshold in percentage."
229+
Description: "Alert when a volume utilization exceeds this threshold in percentage. Set to 0 to disable this alert."
220230
Type: Number
221231
Default: 95
232+
MinValue: 0
222233

223234
volumeFileUtilizationWarnAlert:
224-
Description: "Alert when a volume inode utilization exceeds this threshold in percentage."
235+
Description: "Alert when a volume inode utilization exceeds this threshold in percentage. Set to 0 to disable this alert."
225236
Type: Number
226237
Default: 90
238+
MinValue: 0
227239

228240
volumeFileUtilizationCriticalAlert:
229-
Description: "Alert when a volume inode utilization exceeds this threshold in percentage."
241+
Description: "Alert when a volume inode utilization exceeds this threshold in percentage. Set to 0 to disable this alert."
230242
Type: Number
231243
Default: 95
244+
MinValue: 0
232245

233246
volumeOfflineAlert:
234247
Description: "Alert when a volume goes offline."
@@ -237,19 +250,22 @@ Parameters:
237250
Default: "true"
238251

239252
softQuotaUtilizationAlert:
240-
Description: "Alert when a soft quota exceeds this threshold in percentage."
253+
Description: "Alert when a soft quota exceeds this threshold in percentage. Set to 0 to disable this alert."
241254
Type: Number
242255
Default: 100
256+
MinValue: 0
243257

244258
hardQuotaUtilizationAlert:
245-
Description: "Alert when a hard quota exceeds this threshold in percentage."
259+
Description: "Alert when a hard quota exceeds this threshold in percentage. Set to 0 to disable this alert."
246260
Type: Number
247261
Default: 80
262+
MinValue: 0
248263

249264
inodesQuotaUtilizationAlert:
250-
Description: "Alert when an inode quota exceeds this threshold in percentage."
265+
Description: "Alert when an inode quota exceeds this threshold in percentage. Set to 0 to disable this alert."
251266
Type: Number
252267
Default: 80
268+
MinValue: 0
253269

254270
vserverStateAlert:
255271
Description: "Alert when a vserver goes offline."
@@ -274,11 +290,12 @@ Conditions:
274290
CreateSNSEndpoint: !Equals [!Ref createSNSEndpoint, "true"]
275291
CreateS3Endpoint: !Equals [!Ref createS3Endpoint, "true"]
276292
CreateCloudWatchLogsEndpoint: !Equals [!Ref createCloudWatchLogsEndpoint, "true"]
277-
CreateWatchdogAlarm: !Equals [!Ref createWatchdogAlarm, "true"]
293+
CreateWatchdogAlarmAsLambda: !And [!Equals [!Ref createWatchdogAlarm, "true"], !Equals [!Ref implementWatchdogAsLambda, "true"]]
294+
CreateWatchdogAlarmAsCloudWatch: !And [!Equals [!Ref createWatchdogAlarm, "true"], !Equals [!Ref implementWatchdogAsLambda, "false"]]
295+
CreateWatchdogRole: !And [!Equals [!Ref watchdogRoleArn, ""], !Equals [!Ref implementWatchdogAsLambda, "true"]]
278296
CreateLambdaRoleWithCW: !And [!Equals [!Ref LambdaRoleArn, ""], !Not [!Equals [!Ref cloudWatchLogGroupArn, ""]]]
279297
CreateLambdaRoleWithoutCW: !And [!Equals [!Ref LambdaRoleArn, ""], !Equals [!Ref cloudWatchLogGroupArn, ""]]
280298
CreateSchedulerRole: !Equals [!Ref SchedulerRoleArn, ""]
281-
CreateWatchdogRole: !Equals [!Ref watchdogRoleArn, ""]
282299

283300
Resources:
284301
SecretManagerEndpoint:
@@ -323,7 +340,7 @@ Resources:
323340
VpcEndpointType: 'Gateway'
324341
RouteTableIds: !Ref routeTableIds
325342
#
326-
# Allow the Watchdog Lambda function to publish to the SNS topic.
343+
# Role used by the watchdog Lambda function to publish to the SNS topic.
327344
LambdaRoleWatchdog:
328345
Type: "AWS::IAM::Role"
329346
Condition: CreateWatchdogRole
@@ -351,17 +368,17 @@ Resources:
351368
# This allows the Watchdog CloudWatch alarm to invoke the Lambda function.
352369
resourceBasedPermission:
353370
Type: "AWS::Lambda::Permission"
354-
Condition: CreateWatchdogAlarm
371+
Condition: CreateWatchdogAlarmAsLambda
355372
Properties:
356373
Action: "lambda:InvokeFunction"
357374
FunctionName: !Sub "monitor-ontap-services-watchdog-${AWS::StackName}"
358375
Principal: "lambda.alarms.cloudwatch.amazonaws.com"
359-
SourceArn: !GetAtt watchdogAlarm.Arn
376+
SourceArn: !GetAtt watchdogAlarmToLambda.Arn
360377
#
361378
# Use a Lambda function to publish to an SNS topic so it can reside in a different region.
362379
watchdogLambdaFunction:
363380
Type: "AWS::Lambda::Function"
364-
Condition: CreateWatchdogAlarm
381+
Condition: CreateWatchdogAlarmAsLambda
365382
Properties:
366383
FunctionName: !Sub "monitor-ontap-services-watchdog-${AWS::StackName}"
367384
PackageType: "Zip"
@@ -380,36 +397,21 @@ Resources:
380397
def lambda_handler(event, context):
381398
snsTopicArn = os.environ.get('snsTopicArn')
382399
if snsTopicArn is not None:
383-
region = snsTopicArn.split(":")[3]
384-
snsClient = boto3.client('sns', region_name=region)
385-
#
386-
# This is for future development when the monitor-ontap-services
387-
# Lambda function will be able to send messages to the SNS topic.
388-
cmd = event.get("cmd")
389-
#
390-
# If the cmd is None, then assume a CloudWatch alarm triggered this function.
391-
if cmd is None:
392-
message = f'Error! Lambda function {event["alarmData"]["alarmName"].replace("-watchdog-", "")} failed to execute properly.'
393-
snsClient.publish(
394-
TopicArn = snsTopicArn,
395-
Subject = 'Error! Monitoring ONTAP services has failed to execute',
396-
Message = message
397-
)
398-
elif cmd == "sendSns":
399-
message = event.get("message")
400-
subject = event.get("subject")
401-
snsClient.publish(
402-
TopicArn = snsTopicArn,
403-
Subject = subject,
404-
Message = message
405-
)
400+
snsClient = boto3.client('sns', region_name=snsTopicArn.split(":")[3])
401+
snsClient.publish(
402+
TopicArn = snsTopicArn,
403+
Subject = 'Error! Monitoring ONTAP services has failed to execute',
404+
Message = f'Error! Lambda function {event["alarmData"]["alarmName"].replace("-watchdog-", "")} failed to execute properly.'
405+
)
406406
#
407407
# This is the CloudWatch alarm that will trigger when the monitor-ontap-services
408408
# Lambda function fails to run successfully. It will invoke the watchdogLambdaFunction
409409
# to send a message to the SNS topic.
410-
watchdogAlarm:
410+
# Only this alarm, or the watchdogAlarmToSNS, will be created depending on the
411+
# implementWatchdogAsLambda parameter.
412+
watchdogAlarmToLambda:
411413
Type: "AWS::CloudWatch::Alarm"
412-
Condition: CreateWatchdogAlarm
414+
Condition: CreateWatchdogAlarmAsLambda
413415
Properties:
414416
AlarmName: !Sub "monitor-ontap-services-watchdog-${AWS::StackName}"
415417
AlarmDescription: !Sub "Watchdog alarm for the monitor-ontap-services-${AWS::StackName} Lambda function."
@@ -426,7 +428,35 @@ Resources:
426428
ComparisonOperator: "GreaterThanThreshold"
427429
AlarmActions:
428430
- !GetAtt watchdogLambdaFunction.Arn
429-
431+
#
432+
# This is the CloudWatch alarm that will trigger when the monitor-ontap-services
433+
# Lambda function fails to run successfully. It will send an SNS message to the SNS topic.
434+
# Only this alarm, or the watchdogAlarmToLambda, will be created depending on the
435+
# implementWatchdogAsLambda parameter.
436+
watchdogAlarmToSNS:
437+
Type: "AWS::CloudWatch::Alarm"
438+
Condition: CreateWatchdogAlarmAsCloudWatch
439+
Properties:
440+
AlarmName: !Sub "monitor-ontap-services-watchdog-${AWS::StackName}"
441+
AlarmDescription: !Sub "Watchdog alarm for the monitor-ontap-services-${AWS::StackName} Lambda function."
442+
Namespace: "AWS/Lambda"
443+
MetricName: "Errors"
444+
Dimensions:
445+
- Name: "FunctionName"
446+
Value: !Sub "monitor-ontap-services-${AWS::StackName}"
447+
Statistic: "Maximum"
448+
Period: 300
449+
EvaluationPeriods: 1
450+
TreatMissingData: "ignore"
451+
Threshold: 0.5
452+
ComparisonOperator: "GreaterThanThreshold"
453+
AlarmActions:
454+
- !Ref snsTopicArn
455+
#
456+
# This is the role that will be assigned to the monitoring Lambda function if
457+
# the user doesn't want to send events to a CloudWatch log group. It will not
458+
# be created if the user has provided a role ARN. Either this role, or the LambdaRoleWithCW,
459+
# will be created depending on whether the cloudWatchLogGroupArn parameter is set.
430460
LambdaRoleWithoutCW:
431461
Type: "AWS::IAM::Role"
432462
Condition: CreateLambdaRoleWithoutCW
@@ -460,7 +490,11 @@ Resources:
460490
- !Ref snsTopicArn
461491
- !Sub "arn:aws:s3:::${s3BucketName}"
462492
- !Sub "arn:aws:s3:::${s3BucketName}/*"
463-
493+
#
494+
# This is the role that will be assigned to the monitoring Lambda function if
495+
# the user wants to send events to a CloudWatch log group. It will not
496+
# be created if the user has provided a role ARN. Either this role, or the LambdaRoleWithoutCW,
497+
# will be created depending on whether the cloudWatchLogGroupArn parameter is set.
464498
LambdaRoleWithCW:
465499
Type: "AWS::IAM::Role"
466500
Condition: CreateLambdaRoleWithCW
@@ -498,7 +532,9 @@ Resources:
498532
- !Sub "arn:aws:s3:::${s3BucketName}"
499533
- !Sub "arn:aws:s3:::${s3BucketName}/*"
500534
- !Ref cloudWatchLogGroupArn
501-
535+
#
536+
# This is the role that will be assigned to the EventBridge scheduler if the user
536+
# doesn't provide a role ARN.
502538
SchedulerRole:
503539
Type: "AWS::IAM::Role"
504540
Condition: CreateSchedulerRole

Monitoring/monitor-ontap-services/monitor_ontap_services.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1293,7 +1293,10 @@ def buildDefaultMatchingConditions():
12931293
conditions["services"][getServiceIndex("systemHealth", conditions)]["rules"].append({"networkInterfaces": False})
12941294
elif name == "initialEmsEventsAlert":
12951295
if value == "true":
1296-
conditions["services"][getServiceIndex("ems", conditions)]["rules"].append({"name": "", "severity": "error|alert|emergency", "message": ""})
1296+
if os.environ.get("initialEmsExtendedAlerts") == "true":
1297+
conditions["services"][getServiceIndex("ems", conditions)]["rules"].append({"name": "", "severity": "informational|notice|error|alert|emergency", "message": ""})
1298+
else:
1299+
conditions["services"][getServiceIndex("ems", conditions)]["rules"].append({"name": "", "severity": "error|alert|emergency", "message": ""})
12971300
elif name == "initialSnapMirrorHealthAlert":
12981301
if value == "true":
12991302
conditions["services"][getServiceIndex("snapmirror", conditions)]["rules"].append({"Healthy": False}) # This is what it matches on, so it is interesting when the health is false.

0 commit comments

Comments
 (0)