Skip to content

Commit c91d286

Browse files
committed
Added code to deal with AWS rate limiting.
1 parent 87ce7cf commit c91d286

File tree

1 file changed

+138
-30
lines changed

1 file changed

+138
-30
lines changed

Monitoring/auto-add-cw-alarms/auto_add_cw_alarms.py

Lines changed: 138 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
#
2727
################################################################################
2828
#
29+
# The following variables affect the behavior of the script. They can
30+
# either be set here, overridden via the command line options, or
31+
# overridden by environment variables.
32+
#
2933
# Define which SNS topic you want "volume full" message to be sent to.
3034
SNStopic=''
3135
#
@@ -51,6 +55,15 @@
5155
# Setting it to 100 will disable the creation of the alarm.
5256
defaultVolumeThreshold=80
5357
#
58+
#
59+
################################################################################
60+
# You can't change the following variables from the command line or environment
61+
# variables since changing them after the program has run once, would cause
62+
# all existing CloudWatch alarms to be abandoned, and all new alarms to be
63+
# created. So it is not recommended to change these variables unless you know
64+
# what you are doing.
65+
################################################################################
66+
#
5467
# Define the prefix for the volume utilization alarm name for the CloudWatch alarms.
5568
alarmPrefixVolume="Volume_Utilization_for_volume_"
5669
#
@@ -183,22 +196,34 @@ def contains_fs(fsId, fss):
183196
# threshold set above.
184197
################################################################################
185198
def getAlarmThresholdTagValue(fsx, arn):
    """Return the value of the "alarm_threshold" tag on the resource *arn*.

    fsx is expected to be a boto3 FSx client (anything that provides
    list_tags_for_resource(ResourceARN=...) works). The tag key is matched
    case-insensitively and the tag value is returned exactly as stored.

    Returns defaultVolumeThreshold if the resource has no such tag, and 100
    if the resource no longer exists so that no alarm is created for it.

    Raises botocore.exceptions.ClientError for any other API error, or when
    the API is still throttling after the back-off limit is reached.
    """
    #
    # Error codes that mean "you are calling too fast". AWS services do not
    # all use the same code for throttling, so accept the common ones instead
    # of only 'TooManyRequestsException'.
    throttleCodes = ('TooManyRequestsException', 'ThrottlingException',
                     'Throttling', 'RequestLimitExceeded')
    #
    # If there are a lot of volumes, we could get hit by the AWS rate limit,
    # so we will sleep for a short period of time and then retry. We will
    # double the sleep time each time we get a rate limit exception until
    # we get to 5 seconds, then we will just raise the exception.
    sleep = .125
    while True:
        #
        # This is put into a try block because it is possible that the volume
        # is deleted between the time we get the list of volumes and the time
        # we try to get the tags for the volume.
        try:
            tags = fsx.list_tags_for_resource(ResourceARN=arn)
            for tag in tags['Tags']:
                if tag['Key'].lower() == "alarm_threshold":
                    return tag['Value']
            return defaultVolumeThreshold
        except botocore.exceptions.ClientError as e:
            code = e.response['Error']['Code']
            if code == 'ResourceNotFound':
                return 100  # Return 100 so we don't try to create an alarm.
            if code not in throttleCodes:
                raise
            sleep = sleep * 2  # Exponential backoff.
            if sleep > 5:
                raise
            print(f"Sleeping for {sleep} seconds.")
            time.sleep(sleep)
202227

203228
################################################################################
204229
# This function returns the value assigned to the "CPU_alarm_threshold" tag
@@ -222,13 +247,102 @@ def getSSDAlarmThresholdTagValue(tags):
222247
return(tag['Value'])
223248
return(defaultSSDThreshold)
224249

250+
################################################################################
251+
# This function will return all the file systems in the region. It will handle the
252+
# case where there are more file systems than can be returned in a single call.
253+
# It will also handle the case where we get a rate limit exception.
254+
################################################################################
255+
def getFss(fsx):
    """Return every entry of 'FileSystems' from fsx.describe_file_systems().

    fsx is expected to be a boto3 FSx client. All pages are fetched by
    following 'NextToken' and concatenated into a single new list.

    Every request, including the first one, is retried with an exponential
    back-off when the API reports throttling.

    Raises botocore.exceptions.ClientError for any non-throttling error, or
    when the API is still throttling after the back-off limit is reached.
    """
    #
    # Error codes that mean "you are calling too fast". AWS services do not
    # all use the same code for throttling, so accept the common ones.
    throttleCodes = ('TooManyRequestsException', 'ThrottlingException',
                     'Throttling', 'RequestLimitExceeded')
    #
    # The back-off starts from this value and is doubled before every sleep.
    sleep = .125
    fss = []
    nextToken = None
    while True:
        #
        # The first request is made without a NextToken argument.
        kwargs = {'NextToken': nextToken} if nextToken else {}
        try:
            response = fsx.describe_file_systems(**kwargs)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] not in throttleCodes:
                raise
            sleep = sleep * 2  # Exponential backoff.
            if sleep > 5:
                raise
            print(f"Sleeping for {sleep} seconds for file systems.")
            time.sleep(sleep)
            continue  # Retry the same page.
        fss.extend(response['FileSystems'])
        sleep = .125  # A successful call resets the back-off.
        nextToken = response.get('NextToken')
        if not nextToken:
            return fss
278+
279+
################################################################################
280+
# This function will return all the volumes in the region. It will handle the
281+
# case where there are more volumes than can be returned in a single call.
282+
# It will also handle the case where we get a rate limit exception.
283+
################################################################################
284+
def getVolumes(fsx):
    """Return every entry of 'Volumes' from fsx.describe_volumes().

    fsx is expected to be a boto3 FSx client. All pages are fetched by
    following 'NextToken' and concatenated into a single new list.

    Every request, including the first one, is retried with an exponential
    back-off when the API reports throttling.

    Raises botocore.exceptions.ClientError for any non-throttling error, or
    when the API is still throttling after the back-off limit is reached.
    """
    #
    # Error codes that mean "you are calling too fast". AWS services do not
    # all use the same code for throttling, so accept the common ones.
    throttleCodes = ('TooManyRequestsException', 'ThrottlingException',
                     'Throttling', 'RequestLimitExceeded')
    #
    # The back-off starts from this value and is doubled before every sleep.
    sleep = .125
    volumes = []
    nextToken = None
    while True:
        #
        # The first request is made without a NextToken argument.
        kwargs = {'NextToken': nextToken} if nextToken else {}
        try:
            response = fsx.describe_volumes(**kwargs)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] not in throttleCodes:
                raise
            sleep = sleep * 2  # Exponential backoff.
            if sleep > 5:
                raise
            print(f"Sleeping for {sleep} seconds for volumes.")
            time.sleep(sleep)
            continue  # Retry the same page.
        volumes.extend(response['Volumes'])
        sleep = .125  # A successful call resets the back-off.
        nextToken = response.get('NextToken')
        if not nextToken:
            return volumes
308+
309+
################################################################################
310+
# This function will return all the alarms in the region. It will handle the
311+
# case where there are more alarms than can be returned in a single call.
312+
# It will also handle the case where we get a rate limit exception.
313+
################################################################################
314+
def getAlarms(cw):
    """Return every entry of 'MetricAlarms' from cw.describe_alarms().

    cw is expected to be a boto3 CloudWatch client. All pages are fetched by
    following 'NextToken' and concatenated into a single new list.

    Every request, including the first one, is retried with an exponential
    back-off when the API reports throttling.

    Raises botocore.exceptions.ClientError for any non-throttling error, or
    when the API is still throttling after the back-off limit is reached.
    """
    #
    # Error codes that mean "you are calling too fast". AWS services do not
    # all use the same code for throttling, so accept the common ones.
    throttleCodes = ('TooManyRequestsException', 'ThrottlingException',
                     'Throttling', 'RequestLimitExceeded')
    #
    # The back-off starts from this value and is doubled before every sleep.
    sleep = .125
    alarms = []
    nextToken = None
    while True:
        #
        # The first request is made without a NextToken argument.
        kwargs = {'NextToken': nextToken} if nextToken else {}
        try:
            response = cw.describe_alarms(**kwargs)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] not in throttleCodes:
                raise
            sleep = sleep * 2  # Exponential backoff.
            if sleep > 5:
                raise
            print(f"Sleeping for {sleep} seconds for alarms.")
            time.sleep(sleep)
            continue  # Retry the same page.
        alarms.extend(response['MetricAlarms'])
        sleep = .125  # A successful call resets the back-off.
        nextToken = response.get('NextToken')
        if not nextToken:
            return alarms
338+
225339
################################################################################
226340
# This is the main logic of the program. It loops on all the regions then all
227341
# the fsx volumes within the region, checking to see if any of them already
228342
# have a CloudWatch alarm, and if not, add one.
229343
################################################################################
230344
def lambda_handler(event, context):
231-
global customerId, regions
345+
global customerId, regions, SNStopic, accountId
232346
#
233347
# If the customer ID is set, reformat to be used in the alarm description.
234348
if customerId != '':
@@ -254,23 +368,9 @@ def lambda_handler(event, context):
254368
cw = boto3.client('cloudwatch', region_name=region)
255369
#
256370
# Get all the file systems, volumes and alarm in the region.
257-
response = fsx.describe_file_systems()
258-
fss = response['FileSystems']
259-
while response.get('NextToken'):
260-
response = fsx.describe_file_systems(NextToken=response['NextToken'])
261-
fss += response['FileSystems']
262-
263-
response = fsx.describe_volumes()
264-
volumes = response['Volumes']
265-
while response.get('NextToken'):
266-
response = fsx.describe_volumes(NextToken=response['NextToken'])
267-
volumes += response['Volumes']
268-
269-
response = cw.describe_alarms()
270-
alarms = response['MetricAlarms']
271-
while response.get('NextToken'):
272-
response = cw.describe_alarms(NextToken=response['NextToken'])
273-
alarms += response['MetricAlarms']
371+
fss = getFss(fsx)
372+
volumes = getVolumes(fsx)
373+
alarms = getAlarms(cw)
274374
#
275375
# Scan for filesystems without CPU Utilization Alarm.
276376
for fs in fss:
@@ -361,6 +461,14 @@ def usage():
361461
regions = []
362462
dryRun = False
363463
#
464+
# Check to see if any environment variables are set. An environment variable
# that is not set leaves the value configured earlier in this file untouched,
# so settings made in the file keep working.
#
# NOTE(review): the definitions of customerId and accountId are not visible
# from this part of the file, so they are looked up with globals().get() and
# fall back to '' if they have not been defined yet.
customerId = os.environ.get('customerId', globals().get('customerId', ''))
accountId = os.environ.get('accountId', globals().get('accountId', ''))
SNStopic = os.environ.get('SNStopic', SNStopic)
#
# Environment variables are always strings, so convert the thresholds to
# integers. A non-numeric value raises ValueError.
defaultCPUThreshold = int(os.environ.get('defaultCPUThreshold', defaultCPUThreshold))
defaultSSDThreshold = int(os.environ.get('defaultSSDThreshold', defaultSSDThreshold))
defaultVolumeThreshold = int(os.environ.get('defaultVolumeThreshold', defaultVolumeThreshold))
471+
#
364472
# Check to see if we are being run from a command line or a Lambda function.
365473
if os.environ.get('AWS_LAMBDA_FUNCTION_NAME') == None:
366474
argumentList = sys.argv[1:]

0 commit comments

Comments
 (0)