2626#
2727################################################################################
2828#
29+ # The following variables affect the behavior of the script. They can
30+ # either be set here, overridden via the command line options, or
31+ # overridden by environment variables.
32+ #
2933# Define which SNS topic you want "volume full" message to be sent to.
3034SNStopic = ''
3135#
5155# Setting it to 100 will disable the creation of the alarm.
5256defaultVolumeThreshold = 80
5357#
58+ #
59+ ################################################################################
60+ # You can't change the following variables from the command line or environment
61+ # variables since changing them after the program has run once, would cause
62+ # all existing CloudWatch alarms to be abandoned, and all new alarms to be
63+ # created. So it is not recommended to change these variables unless you know
64+ # what you are doing.
65+ ################################################################################
66+ #
5467# Define the prefix for the volume utilization alarm name for the CloudWatch alarms.
5568alarmPrefixVolume = "Volume_Utilization_for_volume_"
5669#
@@ -183,22 +196,34 @@ def contains_fs(fsId, fss):
183196# threshold set above.
184197################################################################################
def getAlarmThresholdTagValue(fsx, arn):
    """Return the alarm threshold configured for the resource identified by arn.

    The resource's tags are fetched with list_tags_for_resource and searched
    for a key that equals "alarm_threshold" (compared case-insensitively).

    Args:
        fsx: client object exposing list_tags_for_resource().
        arn: ARN of the resource whose tags are inspected.

    Returns:
        The tag's value exactly as returned by the API when the tag exists,
        the module-level defaultVolumeThreshold when it does not, or 100 when
        the API reports 'ResourceNotFound' (100 disables alarm creation).

    Raises:
        botocore.exceptions.ClientError: for any other error code, or when
            rate limiting persists after the backoff delay would exceed
            5 seconds.
    """
    #
    # If there are a lot of volumes, we could get hit by the AWS rate limit,
    # so we sleep for a short period of time and then retry. The delay is
    # doubled before every sleep (so the first sleep is .25 seconds) and once
    # it would exceed 5 seconds the exception is raised instead.
    sleep = .125
    while True:
        #
        # Only the API call is in the try block. It can fail because the
        # volume was deleted between the time we got the list of volumes and
        # the time we try to get its tags.
        try:
            tags = fsx.list_tags_for_resource(ResourceARN=arn)
        except botocore.exceptions.ClientError as e:
            code = e.response['Error']['Code']
            if code == 'ResourceNotFound':
                return 100  # Return 100 so we don't try to create an alarm.
            if code != 'TooManyRequestsException':
                raise
            sleep = sleep * 2
            if sleep > 5:
                raise
            print(f"Sleeping for {sleep} seconds.")
            time.sleep(sleep)
            continue

        for tag in tags['Tags']:
            if tag['Key'].lower() == "alarm_threshold":
                return tag['Value']
        return defaultVolumeThreshold
202227
203228################################################################################
204229# This function returns the value assigned to the "CPU_alarm_threshold" tag
@@ -222,13 +247,102 @@ def getSSDAlarmThresholdTagValue(tags):
222247 return (tag ['Value' ])
223248 return (defaultSSDThreshold )
224249
250+ ################################################################################
251+ # This function will return all the file systems in the region. It will handle the
252+ # case where there are more file systems than can be returned in a single call.
253+ # It will also handle the case where we get a rate limit exception.
254+ ################################################################################
def getFss(fsx):
    """Return every file system reported by describe_file_systems.

    Pages are followed through 'NextToken' until none is returned. Every
    request, including the very first one, is retried with exponential
    backoff when it fails with the 'TooManyRequestsException' error code.

    Args:
        fsx: client object exposing describe_file_systems()
            (presumably a boto3 FSx client; confirm against the caller).

    Returns:
        list: the 'FileSystems' entries of all pages, in page order.

    Raises:
        botocore.exceptions.ClientError: for any other error code, or when
            rate limiting persists after the backoff delay would exceed
            5 seconds.
    """
    # The initial amount of time to sleep if there is a rate limit exception.
    sleep = .125
    fss = []
    nextToken = None
    while True:
        try:
            # The first request carries no token; later ones continue
            # from the previous page.
            if nextToken:
                response = fsx.describe_file_systems(NextToken=nextToken)
            else:
                response = fsx.describe_file_systems()
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] != 'TooManyRequestsException':
                raise
            sleep = sleep * 2  # Exponential backoff.
            if sleep > 5:
                raise
            print(f"Sleeping for {sleep} seconds for file systems.")
            time.sleep(sleep)
            continue

        fss += response['FileSystems']
        nextToken = response.get('NextToken')
        if not nextToken:
            return fss
        # A page succeeded, so start the backoff over for the next one.
        sleep = .125
278+
279+ ################################################################################
280+ # This function will return all the volumes in the region. It will handle the
281+ # case where there are more volumes than can be returned in a single call.
282+ # It will also handle the case where we get a rate limit exception.
283+ ################################################################################
def getVolumes(fsx):
    """Return every volume reported by describe_volumes.

    Pages are followed through 'NextToken' until none is returned. Every
    request, including the very first one, is retried with exponential
    backoff when it fails with the 'TooManyRequestsException' error code.

    Args:
        fsx: client object exposing describe_volumes()
            (presumably a boto3 FSx client; confirm against the caller).

    Returns:
        list: the 'Volumes' entries of all pages, in page order.

    Raises:
        botocore.exceptions.ClientError: for any other error code, or when
            rate limiting persists after the backoff delay would exceed
            5 seconds.
    """
    # The initial amount of time to sleep if there is a rate limit exception.
    sleep = .125
    volumes = []
    nextToken = None
    while True:
        try:
            # The first request carries no token; later ones continue
            # from the previous page.
            if nextToken:
                response = fsx.describe_volumes(NextToken=nextToken)
            else:
                response = fsx.describe_volumes()
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] != 'TooManyRequestsException':
                raise
            sleep = sleep * 2  # Exponential backoff.
            if sleep > 5:
                raise
            print(f"Sleeping for {sleep} seconds for volumes.")
            time.sleep(sleep)
            continue

        volumes += response['Volumes']
        nextToken = response.get('NextToken')
        if not nextToken:
            return volumes
        # A page succeeded, so start the backoff over for the next one.
        sleep = .125
308+
309+ ################################################################################
310+ # This function will return all the alarms in the region. It will handle the
311+ # case where there are more alarms than can be returned in a single call.
312+ # It will also handle the case where we get a rate limit exception.
313+ ################################################################################
def getAlarms(cw):
    """Return every metric alarm reported by describe_alarms.

    Pages are followed through 'NextToken' until none is returned. Every
    request, including the very first one, is retried with exponential
    backoff when it fails with the 'TooManyRequestsException' error code.
    Only the 'MetricAlarms' entry of each page is collected.

    Args:
        cw: client object exposing describe_alarms(); lambda_handler passes
            a boto3 'cloudwatch' client.

    Returns:
        list: the 'MetricAlarms' entries of all pages, in page order.

    Raises:
        botocore.exceptions.ClientError: for any other error code, or when
            rate limiting persists after the backoff delay would exceed
            5 seconds.
    """
    # The initial amount of time to sleep if there is a rate limit exception.
    sleep = .125
    alarms = []
    nextToken = None
    while True:
        try:
            # The first request carries no token; later ones continue
            # from the previous page.
            if nextToken:
                response = cw.describe_alarms(NextToken=nextToken)
            else:
                response = cw.describe_alarms()
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] != 'TooManyRequestsException':
                raise
            sleep = sleep * 2  # Exponential backoff.
            if sleep > 5:
                raise
            print(f"Sleeping for {sleep} seconds for alarms.")
            time.sleep(sleep)
            continue

        alarms += response['MetricAlarms']
        nextToken = response.get('NextToken')
        if not nextToken:
            return alarms
        # A page succeeded, so start the backoff over for the next one.
        sleep = .125
338+
225339################################################################################
226340# This is the main logic of the program. It loops on all the regions then all
227341# the fsx volumes within the region, checking to see if any of them already
228342# have a CloudWatch alarm, and if not, add one.
229343################################################################################
230344def lambda_handler (event , context ):
231- global customerId , regions
345+ global customerId , regions , SNStopic , accountId
232346 #
233347 # If the customer ID is set, reformat to be used in the alarm description.
234348 if customerId != '' :
@@ -254,23 +368,9 @@ def lambda_handler(event, context):
254368 cw = boto3 .client ('cloudwatch' , region_name = region )
255369 #
256370 # Get all the file systems, volumes and alarm in the region.
257- response = fsx .describe_file_systems ()
258- fss = response ['FileSystems' ]
259- while response .get ('NextToken' ):
260- response = fsx .describe_file_systems (NextToken = response ['NextToken' ])
261- fss += response ['FileSystems' ]
262-
263- response = fsx .describe_volumes ()
264- volumes = response ['Volumes' ]
265- while response .get ('NextToken' ):
266- response = fsx .describe_volumes (NextToken = response ['NextToken' ])
267- volumes += response ['Volumes' ]
268-
269- response = cw .describe_alarms ()
270- alarms = response ['MetricAlarms' ]
271- while response .get ('NextToken' ):
272- response = cw .describe_alarms (NextToken = response ['NextToken' ])
273- alarms += response ['MetricAlarms' ]
371+ fss = getFss (fsx )
372+ volumes = getVolumes (fsx )
373+ alarms = getAlarms (cw )
274374 #
275375 # Scan for filesystems without CPU Utilization Alarm.
276376 for fs in fss :
@@ -361,6 +461,14 @@ def usage():
361461regions = []
362462dryRun = False
363463#
# Check to see if there are any environment variables set. A variable that
# is not set in the environment leaves the value already configured in this
# file untouched, instead of resetting it to an empty string.
#
# NOTE: the name must be 'customerId' (not 'customerID'), since that is the
# global that lambda_handler() reads.
customerId = os.environ.get('customerId', globals().get('customerId', ''))
accountId = os.environ.get('accountId', globals().get('accountId', ''))
SNStopic = os.environ.get('SNStopic', SNStopic)
defaultCPUThreshold = int(os.environ.get('defaultCPUThreshold', defaultCPUThreshold))
defaultSSDThreshold = int(os.environ.get('defaultSSDThreshold', defaultSSDThreshold))
defaultVolumeThreshold = int(os.environ.get('defaultVolumeThreshold', defaultVolumeThreshold))
471+ #
364472# Check to see if we are being run from a command line or a Lambda function.
365473if os .environ .get ('AWS_LAMBDA_FUNCTION_NAME' ) == None :
366474 argumentList = sys .argv [1 :]
0 commit comments