2626#
2727################################################################################
2828#
29- # The following variable effect the behavior of the script. They can be
29+ # The following variables affect the behavior of the script. They can
3030# either be set here, overridden via the command line options, or
3131# overridden by environment variables.
3232#
5858#
5959################################################################################
6060# You can't change the following variables from the command line or environment
61- # variables since changing them after the program has run once, would cause
61+ # variables since changing them after the program has run once would cause
6262# all existing CloudWatch alarms to be abandoned, and all new alarms to be
63- # created. So it is not recommended to change these variables unless you know
63+ # created. So, it is not recommended to change these variables unless you know
6464# what you are doing.
6565################################################################################
6666#
7878################################################################################
7979
8080import botocore
81+ from botocore .config import Config
8182import boto3
8283import os
8384import getopt
@@ -217,13 +218,14 @@ def getAlarmThresholdTagValue(fsx, arn):
217218 except botocore .exceptions .ClientError as e :
218219 if e .response ['Error' ]['Code' ] == 'ResourceNotFound' :
219220 return (100 ) # Return 100 so we don't try to create an alarm.
220- elif e .response ['Error' ]['Code' ] == 'TooManyRequestsException' :
221+ elif e .response ['Error' ]['Code' ] == 'TooManyRequestsException' or e . response [ 'Error' ][ 'Code' ] == 'ThrottlingException' :
221222 sleep = sleep * 2
222223 if sleep > 5 :
223224 raise e
224- print (f"Sleeping for { sleep } seconds." )
225+ print (f"Warning: Rate Limit fault while getting tags. Sleeping for { sleep } seconds." )
225226 time .sleep (sleep )
226227 else :
228+ print (f"boto3 client error: { json .dumps (e .response )} " )
227229 raise e
228230
229231################################################################################
@@ -248,6 +250,19 @@ def getSSDAlarmThresholdTagValue(tags):
248250 return (tag ['Value' ])
249251 return (defaultSSDThreshold )
250252
################################################################################
# This function returns the file system id that the passed in alarm is
# associated with. It looks at the alarm's metric with an Id of "m1" and
# returns the value of its 'FileSystemId' dimension. None is returned if the
# alarm doesn't have such a metric or dimension.
################################################################################
def getFileSystemId(alarm):
    #
    # Use tolerant lookups so an alarm that is missing the 'Metrics' list, or
    # a metric entry that is missing 'MetricStat', results in None being
    # returned instead of a KeyError being raised.
    for metric in alarm.get('Metrics') or []:
        if metric.get("Id") != "m1":
            continue
        metricInfo = (metric.get('MetricStat') or {}).get('Metric') or {}
        for dim in metricInfo.get('Dimensions') or []:
            if dim.get('Name') == 'FileSystemId':
                return dim.get('Value')
    return None
265+
251266################################################################################
252267# This function will return all the file systems in the region. It will handle the
253268# case where there are more file systems than can be returned in a single call.
@@ -265,13 +280,14 @@ def getFss(fsx):
265280 sleep = .125
266281 break
267282 except botocore .exceptions .ClientError as e :
268- if e .response ['Error' ]['Code' ] == 'TooManyRequestsException' :
283+ if e .response ['Error' ]['Code' ] == 'TooManyRequestsException' or e . response [ 'Error' ][ 'Code' ] == 'ThrottlingException' :
269284 sleep = sleep * 2 # Exponential backoff.
270285 if sleep > 5 :
271286 raise e
272- print (f"Sleeping for { sleep } seconds for initial file systems ." )
287+ print (f"Warning: Rate Limit fault while getting initial file system list. Sleeping for { sleep } seconds ." )
273288 time .sleep (sleep )
274289 else :
290+ print (f"boto3 client error: { json .dumps (e .response )} " )
275291 raise e
276292
277293 while nextToken :
@@ -281,13 +297,14 @@ def getFss(fsx):
281297 nextToken = response .get ('NextToken' )
282298 sleep = .125
283299 except botocore .exceptions .ClientError as e :
284- if e .response ['Error' ]['Code' ] == 'TooManyRequestsException' :
300+ if e .response ['Error' ]['Code' ] == 'TooManyRequestsException' or e . response [ 'Error' ][ 'Code' ] == 'ThrottlingException' :
285301 sleep = sleep * 2 # Exponential backoff.
286302 if sleep > 5 :
287303 raise e
288- print (f"Sleeping for { sleep } seconds for additional file systems." )
304+ print (f"Warning: Rate Limit fault while getting additional file systems. Sleeping for { sleep } seconds ." )
289305 time .sleep (sleep )
290306 else :
307+ print (f"boto3 client error: { json .dumps (e .response )} " )
291308 raise e
292309 return fss
293310
@@ -297,7 +314,7 @@ def getFss(fsx):
297314# It will also handle the case where we get a rate limit exception.
298315################################################################################
299316def getVolumes (fsx ):
300-
317+ #
301318 # The initial amount of time to sleep if there is a rate limit exception.
302319 sleep = .125
303320 while True :
@@ -308,13 +325,14 @@ def getVolumes(fsx):
308325 sleep = .125
309326 break
310327 except botocore .exceptions .ClientError as e :
311- if e .response ['Error' ]['Code' ] == 'TooManyRequestsException' :
328+ if e .response ['Error' ]['Code' ] == 'TooManyRequestsException' or e . response [ 'Error' ][ 'Code' ] == 'ThrottlingException' :
312329 sleep = sleep * 2 # Exponential backoff.
313330 if sleep > 5 :
314331 raise e
315- print (f"Sleeping for { sleep } seconds for initial volumes ." )
332+ print (f"Warning: Rate Limit fault while getting the initial list of volumes. Sleeping for { sleep } seconds." )
316333 time .sleep (sleep )
317334 else :
335+ print (f"boto3 client error: { json .dumps (e .response )} " )
318336 raise e
319337
320338 while nextToken :
@@ -324,13 +342,14 @@ def getVolumes(fsx):
324342 nextToken = response .get ('NextToken' )
325343 sleep = .125
326344 except botocore .exceptions .ClientError as e :
327- if e .response ['Error' ]['Code' ] == 'TooManyRequestsException' :
345+ if e .response ['Error' ]['Code' ] == 'TooManyRequestsException' or e . response [ 'Error' ][ 'Code' ] == 'ThrottlingException' :
328346 sleep = sleep * 2 # Exponential backoff.
329347 if sleep > 5 :
330348 raise e
331- print (f"Sleeping for { sleep } seconds for additional volumes ." )
349+ print (f"Warning: Rate Limit fault while getting additional volumes. Sleeping for { sleep } seconds." )
332350 time .sleep (sleep )
333351 else :
352+ print (f"boto3 client error: { json .dumps (e .response )} " )
334353 raise e
335354
336355 return volumes
@@ -352,13 +371,14 @@ def getAlarms(cw):
352371 sleep = .125
353372 break
354373 except botocore .exceptions .ClientError as e :
355- if e .response ['Error' ]['Code' ] == 'TooManyRequestsException' :
374+ if e .response ['Error' ]['Code' ] == 'TooManyRequestsException' or e . response [ 'Error' ][ 'Code' ] == 'ThrottlingException' :
356375 sleep = sleep * 2
357376 if sleep > 5 :
358377 raise e
359- print (f"Sleeping for { sleep } seconds for initial alarms ." )
378+ print (f"Warning: Rate Limit fault while getting the initial list of alarms. Sleeping for { sleep } seconds." )
360379 time .sleep (sleep )
361380 else :
381+ print (f"boto3 client error: { json .dumps (e .response )} " )
362382 raise e
363383
364384 while nextToken :
@@ -368,13 +388,14 @@ def getAlarms(cw):
368388 nextToken = response .get ('NextToken' )
369389 sleep = .125
370390 except botocore .exceptions .ClientError as e :
371- if e .response ['Error' ]['Code' ] == 'TooManyRequestsException' :
391+ if e .response ['Error' ]['Code' ] == 'TooManyRequestsException' or e . response [ 'Error' ][ 'Code' ] == 'ThrottlingException' :
372392 sleep = sleep * 2 # Exponential backoff.
373393 if sleep > 5 :
374394 raise e
375- print (f"Sleeping for { sleep } seconds for additional alarms ." )
395+ print (f"Warning: Rate Limit fault while getting additional alarms. Sleeping for { sleep } seconds." )
376396 time .sleep (sleep )
377397 else :
398+ print (f"boto3 client error: { json .dumps (e .response )} " )
378399 raise e
379400
380401 return alarms
@@ -385,9 +406,9 @@ def getAlarms(cw):
385406# have a CloudWatch alarm, and if not, add one.
386407################################################################################
387408def lambda_handler (event , context ):
388- global customerId , regions , SNStopic , accountId
409+ global customerId , regions , SNStopic , accountId , onlyFilesystemId
389410 #
390- # If the customer ID is set, reformat to be used in the alarm description.
411+ # If the customer ID is set, reformat it to be used in the alarm description.
391412 if customerId != '' :
392413 customerId = f", CustomerID: { customerId } "
393414
@@ -396,9 +417,17 @@ def lambda_handler(event, context):
396417
397418 if len (accountId ) == 0 :
398419 raise Exception ("You must specify an accountId to run this program." )
420+ #
421+ # Configure boto3 to use the more advanced "adaptive" retry method.
422+ boto3Config = Config (
423+ retries = {
424+ 'max_attempts' : 5 ,
425+ 'mode' : 'adaptive'
426+ }
427+ )
399428
400429 if len (regions ) == 0 : # pylint: disable=E0601
401- ec2Client = boto3 .client ('ec2' )
430+ ec2Client = boto3 .client ('ec2' , config = boto3Config )
402431 ec2Regions = ec2Client .describe_regions ()['Regions' ]
403432 for region in ec2Regions :
404433 regions += [region ['RegionName' ]]
@@ -407,8 +436,8 @@ def lambda_handler(event, context):
407436 for region in regions :
408437 if region in fsxRegions :
409438 print (f'Scanning { region } ' )
410- fsx = boto3 .client ('fsx' , region_name = region )
411- cw = boto3 .client ('cloudwatch' , region_name = region )
439+ fsx = boto3 .client ('fsx' , region_name = region , config = boto3Config )
440+ cw = boto3 .client ('cloudwatch' , region_name = region , config = boto3Config )
412441 #
413442 # Get all the file systems, volumes and alarm in the region.
414443 fss = getFss (fsx )
@@ -425,7 +454,8 @@ def lambda_handler(event, context):
425454 alarmName = alarmPrefixCPU + fsId
426455 alarmDescription = f"CPU utilization alarm for file system { fsName } { customerId } in region { region } ."
427456
428- if (not contains_alarm (alarmName , alarms )):
457+ if (not contains_alarm (alarmName , alarms ) and onlyFilesystemId == None or
458+ not contains_alarm (alarmName , alarms ) and onlyFilesystemId != None and onlyFilesystemId == fsId ):
429459 print (f'Adding CPU Alarm for { fs ["FileSystemId" ]} ' )
430460 add_cpu_alarm (cw , fsId , alarmName , alarmDescription , threshold , region )
431461 #
@@ -434,8 +464,9 @@ def lambda_handler(event, context):
434464 alarmName = alarm ['AlarmName' ]
435465 if (alarmName [:len (alarmPrefixCPU )] == alarmPrefixCPU ):
436466 fsId = alarmName [len (alarmPrefixCPU ):]
437- if (not contains_fs (fsId , fss )):
438- print ("Deleteing alarm: " + alarmName + " in region " + region )
467+ if (not contains_fs (fsId , fss ) and onlyFilesystemId == None or
468+ not contains_fs (fsId , fss ) and onlyFilesystemId != None and onlyFilesystemId == fsId ):
469+ print ("Deleting alarm: " + alarmName + " in region " + region )
439470 delete_alarm (cw , alarmName )
440471 #
441472 # Scan for filesystems without SSD Utilization Alarm.
@@ -448,7 +479,8 @@ def lambda_handler(event, context):
448479 alarmName = alarmPrefixSSD + fsId
449480 alarmDescription = f"SSD utilization alarm for file system { fsName } { customerId } in region { region } ."
450481
451- if (not contains_alarm (alarmName , alarms )):
482+ if (not contains_alarm (alarmName , alarms ) and onlyFilesystemId == None or
483+ not contains_alarm (alarmName , alarms ) and onlyFilesystemId != None and onlyFilesystemId == fsId ):
452484 print (f'Adding SSD Alarm for { fsId } ' )
453485 add_ssd_alarm (cw , fs ['FileSystemId' ], alarmName , alarmDescription , threshold , region )
454486 #
@@ -457,7 +489,8 @@ def lambda_handler(event, context):
457489 alarmName = alarm ['AlarmName' ]
458490 if (alarmName [:len (alarmPrefixSSD )] == alarmPrefixSSD ):
459491 fsId = alarmName [len (alarmPrefixSSD ):]
460- if (not contains_fs (fsId , fss )):
492+ if (not contains_fs (fsId , fss ) and onlyFilesystemId == None or
493+ not contains_fs (fsId , fss ) and onlyFilesystemId != None and onlyFilesystemId == fsId ):
461494 print ("Deleteing alarm: " + alarmName + " in region " + region )
462495 delete_alarm (cw , alarmName )
463496 #
@@ -475,7 +508,8 @@ def lambda_handler(event, context):
475508 alarmName = alarmPrefixVolume + volumeId
476509 fsName = fsId .replace ('fs-' , 'FsxId' )
477510 alarmDescription = f"Volume utilization alarm for volumeId { volumeId } { customerId } , File System Name: { fsName } , Volume Name: { volumeName } in region { region } ."
478- if (not contains_alarm (alarmName , alarms )):
511+ if (not contains_alarm (alarmName , alarms ) and onlyFilesystemId == None or
512+ not contains_alarm (alarmName , alarms ) and onlyFilesystemId != None and onlyFilesystemId == fsId ):
479513 print (f'Adding volume utilization alarm for { volumeName } in region { region } .' )
480514 add_volume_alarm (cw , volumeId , alarmName , alarmDescription , fsId , threshold , region )
481515 #
@@ -484,7 +518,8 @@ def lambda_handler(event, context):
484518 alarmName = alarm ['AlarmName' ]
485519 if (alarmName [:len (alarmPrefixVolume )] == alarmPrefixVolume ):
486520 volumeId = alarmName [len (alarmPrefixVolume ):]
487- if (not contains_volume (volumeId , volumes )):
521+ if (not contains_volume (volumeId , volumes ) and onlyFilesystemId == None or
522+ not contains_volume (volumeId , volumes ) and onlyFilesystemId != None and onlyFilesystemId == getFileSystemId (alarm )):
488523 print ("Deleteing alarm: " + alarmName + " in region " + region )
489524 delete_alarm (cw , alarmName )
490525
@@ -494,7 +529,7 @@ def lambda_handler(event, context):
494529# This function is used to print out the usage of the script.
495530################################################################################
def usage():
    #
    # Each optional argument is wrapped in a single, balanced pair of square
    # brackets. The options listed here mirror the short and long options
    # handed to getopt in the main logic, including -F|--FileSystemID.
    print('Usage: add_cw_alarm [-h|--help] [-d|--dryRun] [-c|--customerID customerID] [-a|--accountID aws_account_id] [-s|--SNSTopic SNS_Topic_Name] [-r|--region region] [-C|--CPUThreshold threshold] [-S|--SSDThreshold threshold] [-V|--VolumeThreshold threshold] [-F|--FileSystemID FileSystemID]')
498533
499534################################################################################
500535# Main logic starts here.
@@ -508,16 +543,17 @@ def usage():
508543customerID = os .environ .get ('customerId' , '' )
509544accountId = os .environ .get ('accountId' , '' )
510545SNStopic = os .environ .get ('SNStopic' , '' )
546+ onlyFilesystemId = None
511547defaultCPUThreshold = int (os .environ .get ('defaultCPUThreshold' , defaultCPUThreshold ))
512548defaultSSDThreshold = int (os .environ .get ('defaultSSDThreshold' , defaultSSDThreshold ))
513549defaultVolumeThreshold = int (os .environ .get ('defaultVolumeThreshold' , defaultVolumeThreshold ))
514550#
515551# Check to see if we are being run from the command line or as a Lambda function.
516552if os .environ .get ('AWS_LAMBDA_FUNCTION_NAME' ) == None :
517553 argumentList = sys .argv [1 :]
518- options = "hc:a:s:dr:C:S:V:"
554+ options = "hc:a:s:dr:C:S:V:F: "
519555
520- longOptions = ["help" , "customerID=" , "accountID=" , "SNSTopic=" , "dryRun" , "region=" , "CPUThreshold=" , "SSDThreshold=" , "VolumeThreshold=" ]
556+ longOptions = ["help" , "customerID=" , "accountID=" , "SNSTopic=" , "dryRun" , "region=" , "CPUThreshold=" , "SSDThreshold=" , "VolumeThreshold=" , "FileSystemID=" ]
521557 skip = False
522558 try :
523559 arguments , values = getopt .getopt (argumentList , options , longOptions )
@@ -542,6 +578,8 @@ def usage():
542578 dryRun = True
543579 elif currentArgument in ("-r" , "--region" ):
544580 regions += [currentValue ]
581+ elif currentArgument in ("-F" , "--FileSystemID" ):
582+ onlyFilesystemId = currentValue
545583
546584 except getopt .error as err :
547585 print (str (err ))
0 commit comments