Skip to content

Commit d334112

Browse files
committed
Added a -F option to limit alarms to just one file system. Also, changed the rate limiting algorithm used by boto3.
1 parent f34fc47 commit d334112

File tree

1 file changed

+71
-33
lines changed

1 file changed

+71
-33
lines changed

Monitoring/auto-add-cw-alarms/auto_add_cw_alarms.py

Lines changed: 71 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
#
2727
################################################################################
2828
#
29-
# The following variable effect the behavior of the script. They can be
29+
# The following variables affect the behavior of the script. They can be
3030
# either set here, overridden via the command line options, or
3131
# overridden by environment variables.
3232
#
@@ -58,9 +58,9 @@
5858
#
5959
################################################################################
6060
# You can't change the following variables from the command line or environment
61-
# variables since changing them after the program has run once, would cause
61+
# variables since changing them after the program has run once would cause
6262
# all existing CloudWatch alarms to be abandoned, and all new alarms to be
63-
# created. So it is not recommended to change these variables unless you know
63+
# created. So, it is not recommended to change these variables unless you know
6464
# what you are doing.
6565
################################################################################
6666
#
@@ -78,6 +78,7 @@
7878
################################################################################
7979

8080
import botocore
81+
from botocore.config import Config
8182
import boto3
8283
import os
8384
import getopt
@@ -217,13 +218,14 @@ def getAlarmThresholdTagValue(fsx, arn):
217218
except botocore.exceptions.ClientError as e:
218219
if e.response['Error']['Code'] == 'ResourceNotFound':
219220
return(100) # Return 100 so we don't try to create an alarm.
220-
elif e.response['Error']['Code'] == 'TooManyRequestsException':
221+
elif e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException':
221222
sleep = sleep * 2
222223
if sleep > 5:
223224
raise e
224-
print(f"Sleeping for {sleep} seconds.")
225+
print(f"Warning: Rate Limit fault while getting tags. Sleeping for {sleep} seconds.")
225226
time.sleep(sleep)
226227
else:
228+
print(f"boto3 client error: {json.dumps(e.response)}")
227229
raise e
228230

229231
################################################################################
@@ -248,6 +250,19 @@ def getSSDAlarmThresholdTagValue(tags):
248250
return(tag['Value'])
249251
return(defaultSSDThreshold)
250252

253+
################################################################################
254+
# This function returns the file system id that the passed in alarm is
255+
# associated with.
256+
################################################################################
257+
def getFileSystemId(alarm):
    """Return the file system ID that the passed in alarm is associated with.

    The alarm's 'Metrics' list is searched for the entry whose 'Id' is
    "m1"; the value of that entry's 'FileSystemId' dimension is returned.

    Args:
        alarm: An alarm dictionary (presumably an entry from the CloudWatch
            describe_alarms response -- confirm against getAlarms()).

    Returns:
        The file system ID string, or None if the alarm has no "m1" metric
        carrying a 'FileSystemId' dimension.
    """
    # Every level is read with .get() so that an alarm without a 'Metrics'
    # list, or an "m1" entry without a 'MetricStat', yields None instead of
    # raising KeyError.
    for metric in alarm.get('Metrics') or []:
        if metric.get('Id') != 'm1':
            continue
        metricDef = (metric.get('MetricStat') or {}).get('Metric') or {}
        for dim in metricDef.get('Dimensions') or []:
            if dim.get('Name') == 'FileSystemId':
                return dim.get('Value')
    return None
265+
251266
################################################################################
252267
# This function will return all the file systems in the region. It will handle the
253268
# case where there are more file systems than can be returned in a single call.
@@ -265,13 +280,14 @@ def getFss(fsx):
265280
sleep=.125
266281
break
267282
except botocore.exceptions.ClientError as e:
268-
if e.response['Error']['Code'] == 'TooManyRequestsException':
283+
if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException':
269284
sleep = sleep * 2 # Exponential backoff.
270285
if sleep > 5:
271286
raise e
272-
print(f"Sleeping for {sleep} seconds for initial file systems.")
287+
print(f"Warning: Rate Limit fault while getting initial file system list. Sleeping for {sleep} seconds.")
273288
time.sleep(sleep)
274289
else:
290+
print(f"boto3 client error: {json.dumps(e.response)}")
275291
raise e
276292

277293
while nextToken:
@@ -281,13 +297,14 @@ def getFss(fsx):
281297
nextToken = response.get('NextToken')
282298
sleep=.125
283299
except botocore.exceptions.ClientError as e:
284-
if e.response['Error']['Code'] == 'TooManyRequestsException':
300+
if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException':
285301
sleep = sleep * 2 # Exponential backoff.
286302
if sleep > 5:
287303
raise e
288-
print(f"Sleeping for {sleep} seconds for additional file systems.")
304+
print(f"Warning: Rate Limit fault while getting additional file systems. Sleeping for {sleep} seconds.")
289305
time.sleep(sleep)
290306
else:
307+
print(f"boto3 client error: {json.dumps(e.response)}")
291308
raise e
292309
return fss
293310

@@ -297,7 +314,7 @@ def getFss(fsx):
297314
# It will also handle the case where we get a rate limit exception.
298315
################################################################################
299316
def getVolumes(fsx):
300-
317+
#
301318
# The initial amount of time to sleep if there is a rate limit exception.
302319
sleep=.125
303320
while True:
@@ -308,13 +325,14 @@ def getVolumes(fsx):
308325
sleep=.125
309326
break
310327
except botocore.exceptions.ClientError as e:
311-
if e.response['Error']['Code'] == 'TooManyRequestsException':
328+
if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException':
312329
sleep = sleep * 2 # Exponential backoff.
313330
if sleep > 5:
314331
raise e
315-
print(f"Sleeping for {sleep} seconds for initial volumes.")
332+
print(f"Warning: Rate Limit fault while getting the initial list of volumes. Sleeping for {sleep} seconds.")
316333
time.sleep(sleep)
317334
else:
335+
print(f"boto3 client error: {json.dumps(e.response)}")
318336
raise e
319337

320338
while nextToken:
@@ -324,13 +342,14 @@ def getVolumes(fsx):
324342
nextToken = response.get('NextToken')
325343
sleep=.125
326344
except botocore.exceptions.ClientError as e:
327-
if e.response['Error']['Code'] == 'TooManyRequestsException':
345+
if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException':
328346
sleep = sleep * 2 # Exponential backoff.
329347
if sleep > 5:
330348
raise e
331-
print(f"Sleeping for {sleep} seconds for additional volumes.")
349+
print(f"Warning: Rate Limit fault while getting additional volumes. Sleeping for {sleep} seconds.")
332350
time.sleep(sleep)
333351
else:
352+
print(f"boto3 client error: {json.dumps(e.response)}")
334353
raise e
335354

336355
return volumes
@@ -352,13 +371,14 @@ def getAlarms(cw):
352371
sleep=.125
353372
break
354373
except botocore.exceptions.ClientError as e:
355-
if e.response['Error']['Code'] == 'TooManyRequestsException':
374+
if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException':
356375
sleep = sleep * 2
357376
if sleep > 5:
358377
raise e
359-
print(f"Sleeping for {sleep} seconds for initial alarms.")
378+
print(f"Warning: Rate Limit fault while getting the initial list of alarms. Sleeping for {sleep} seconds.")
360379
time.sleep(sleep)
361380
else:
381+
print(f"boto3 client error: {json.dumps(e.response)}")
362382
raise e
363383

364384
while nextToken:
@@ -368,13 +388,14 @@ def getAlarms(cw):
368388
nextToken = response.get('NextToken')
369389
sleep=.125
370390
except botocore.exceptions.ClientError as e:
371-
if e.response['Error']['Code'] == 'TooManyRequestsException':
391+
if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException':
372392
sleep = sleep * 2 # Exponential backoff.
373393
if sleep > 5:
374394
raise e
375-
print(f"Sleeping for {sleep} seconds for additional alarms.")
395+
print(f"Warning: Rate Limit fault while getting additional alarms. Sleeping for {sleep} seconds.")
376396
time.sleep(sleep)
377397
else:
398+
print(f"boto3 client error: {json.dumps(e.response)}")
378399
raise e
379400

380401
return alarms
@@ -385,9 +406,9 @@ def getAlarms(cw):
385406
# have a CloudWatch alarm, and if not, add one.
386407
################################################################################
387408
def lambda_handler(event, context):
388-
global customerId, regions, SNStopic, accountId
409+
global customerId, regions, SNStopic, accountId, onlyFilesystemId
389410
#
390-
# If the customer ID is set, reformat to be used in the alarm description.
411+
# If the customer ID is set, reformat it to be used in the alarm description.
391412
if customerId != '':
392413
customerId = f", CustomerID: {customerId}"
393414

@@ -396,9 +417,17 @@ def lambda_handler(event, context):
396417

397418
if len(accountId) == 0:
398419
raise Exception("You must specify an accountId to run this program.")
420+
#
421+
# Configure boto3 to use the more advanced "adaptive" retry method.
422+
boto3Config = Config(
423+
retries = {
424+
'max_attempts': 5,
425+
'mode': 'adaptive'
426+
}
427+
)
399428

400429
if len(regions) == 0: # pylint: disable=E0601
401-
ec2Client = boto3.client('ec2')
430+
ec2Client = boto3.client('ec2', config=boto3Config)
402431
ec2Regions = ec2Client.describe_regions()['Regions']
403432
for region in ec2Regions:
404433
regions += [region['RegionName']]
@@ -407,8 +436,8 @@ def lambda_handler(event, context):
407436
for region in regions:
408437
if region in fsxRegions:
409438
print(f'Scanning {region}')
410-
fsx = boto3.client('fsx', region_name=region)
411-
cw = boto3.client('cloudwatch', region_name=region)
439+
fsx = boto3.client('fsx', region_name=region, config=boto3Config)
440+
cw = boto3.client('cloudwatch', region_name=region, config=boto3Config)
412441
#
413442
# Get all the file systems, volumes and alarms in the region.
414443
fss = getFss(fsx)
@@ -425,7 +454,8 @@ def lambda_handler(event, context):
425454
alarmName = alarmPrefixCPU + fsId
426455
alarmDescription = f"CPU utilization alarm for file system {fsName}{customerId} in region {region}."
427456

428-
if(not contains_alarm(alarmName, alarms)):
457+
if(not contains_alarm(alarmName, alarms) and onlyFilesystemId == None or
458+
not contains_alarm(alarmName, alarms) and onlyFilesystemId != None and onlyFilesystemId == fsId):
429459
print(f'Adding CPU Alarm for {fs["FileSystemId"]}')
430460
add_cpu_alarm(cw, fsId, alarmName, alarmDescription, threshold, region)
431461
#
@@ -434,8 +464,9 @@ def lambda_handler(event, context):
434464
alarmName = alarm['AlarmName']
435465
if(alarmName[:len(alarmPrefixCPU)] == alarmPrefixCPU):
436466
fsId = alarmName[len(alarmPrefixCPU):]
437-
if(not contains_fs(fsId, fss)):
438-
print("Deleteing alarm: " + alarmName + " in region " + region)
467+
if(not contains_fs(fsId, fss) and onlyFilesystemId == None or
468+
not contains_fs(fsId, fss) and onlyFilesystemId != None and onlyFilesystemId == fsId):
469+
print("Deleting alarm: " + alarmName + " in region " + region)
439470
delete_alarm(cw, alarmName)
440471
#
441472
# Scan for filesystems without SSD Utilization Alarm.
@@ -448,7 +479,8 @@ def lambda_handler(event, context):
448479
alarmName = alarmPrefixSSD + fsId
449480
alarmDescription = f"SSD utilization alarm for file system {fsName}{customerId} in region {region}."
450481

451-
if(not contains_alarm(alarmName, alarms)):
482+
if(not contains_alarm(alarmName, alarms) and onlyFilesystemId == None or
483+
not contains_alarm(alarmName, alarms) and onlyFilesystemId != None and onlyFilesystemId == fsId):
452484
print(f'Adding SSD Alarm for {fsId}')
453485
add_ssd_alarm(cw, fs['FileSystemId'], alarmName, alarmDescription, threshold, region)
454486
#
@@ -457,7 +489,8 @@ def lambda_handler(event, context):
457489
alarmName = alarm['AlarmName']
458490
if(alarmName[:len(alarmPrefixSSD)] == alarmPrefixSSD):
459491
fsId = alarmName[len(alarmPrefixSSD):]
460-
if(not contains_fs(fsId, fss)):
492+
if(not contains_fs(fsId, fss) and onlyFilesystemId == None or
493+
not contains_fs(fsId, fss) and onlyFilesystemId != None and onlyFilesystemId == fsId):
461494
print("Deleteing alarm: " + alarmName + " in region " + region)
462495
delete_alarm(cw, alarmName)
463496
#
@@ -475,7 +508,8 @@ def lambda_handler(event, context):
475508
alarmName = alarmPrefixVolume + volumeId
476509
fsName = fsId.replace('fs-', 'FsxId')
477510
alarmDescription = f"Volume utilization alarm for volumeId {volumeId}{customerId}, File System Name: {fsName}, Volume Name: {volumeName} in region {region}."
478-
if(not contains_alarm(alarmName, alarms)):
511+
if(not contains_alarm(alarmName, alarms) and onlyFilesystemId == None or
512+
not contains_alarm(alarmName, alarms) and onlyFilesystemId != None and onlyFilesystemId == fsId):
479513
print(f'Adding volume utilization alarm for {volumeName} in region {region}.')
480514
add_volume_alarm(cw, volumeId, alarmName, alarmDescription, fsId, threshold, region)
481515
#
@@ -484,7 +518,8 @@ def lambda_handler(event, context):
484518
alarmName = alarm['AlarmName']
485519
if(alarmName[:len(alarmPrefixVolume)] == alarmPrefixVolume):
486520
volumeId = alarmName[len(alarmPrefixVolume):]
487-
if(not contains_volume(volumeId, volumes)):
521+
if(not contains_volume(volumeId, volumes) and onlyFilesystemId == None or
522+
not contains_volume(volumeId, volumes) and onlyFilesystemId != None and onlyFilesystemId == getFileSystemId(alarm)):
488523
print("Deleteing alarm: " + alarmName + " in region " + region)
489524
delete_alarm(cw, alarmName)
490525

@@ -494,7 +529,7 @@ def lambda_handler(event, context):
494529
# This function is used to print out the usage of the script.
495530
################################################################################
496531
def usage():
    """Print the command line usage of the script to standard output."""
    # Each optional argument is wrapped in exactly one pair of brackets so
    # that the synopsis is balanced and readable.
    print('Usage: add_cw_alarm [-h|--help] [-d|--dryRun] [-c|--customerID customerID] [-a|--accountID aws_account_id] [-s|--SNSTopic SNS_Topic_Name] [-r|--region region] [-C|--CPUThreshold threshold] [-S|--SSDThreshold threshold] [-V|--VolumeThreshold threshold] [-F|--FileSystemID FileSystemID]')
498533

499534
################################################################################
500535
# Main logic starts here.
@@ -508,16 +543,17 @@ def usage():
508543
customerID = os.environ.get('customerId', '')
509544
accountId = os.environ.get('accountId', '')
510545
SNStopic = os.environ.get('SNStopic', '')
546+
onlyFilesystemId = None
511547
defaultCPUThreshold = int(os.environ.get('defaultCPUThreshold', defaultCPUThreshold))
512548
defaultSSDThreshold = int(os.environ.get('defaultSSDThreshold', defaultSSDThreshold))
513549
defaultVolumeThreshold = int(os.environ.get('defaultVolumeThreshold', defaultVolumeThreshold))
514550
#
515551
# Check to see if we are being run from a command line or a Lambda function.
516552
if os.environ.get('AWS_LAMBDA_FUNCTION_NAME') == None:
517553
argumentList = sys.argv[1:]
518-
options = "hc:a:s:dr:C:S:V:"
554+
options = "hc:a:s:dr:C:S:V:F:"
519555

520-
longOptions = ["help", "customerID=", "accountID=", "SNSTopic=", "dryRun", "region=", "CPUThreshold=", "SSDThreshold=", "VolumeThreshold="]
556+
longOptions = ["help", "customerID=", "accountID=", "SNSTopic=", "dryRun", "region=", "CPUThreshold=", "SSDThreshold=", "VolumeThreshold=", "FileSystemID="]
521557
skip = False
522558
try:
523559
arguments, values = getopt.getopt(argumentList, options, longOptions)
@@ -542,6 +578,8 @@ def usage():
542578
dryRun = True
543579
elif currentArgument in ("-r", "--region"):
544580
regions += [currentValue]
581+
elif currentArgument in ("-F", "--FileSystemID"):
582+
onlyFilesystemId = currentValue
545583

546584
except getopt.error as err:
547585
print(str(err))

0 commit comments

Comments
 (0)