Added code to deal with AWS rate limiting.

kcantrel · kcantrel · commit c91d2867a083 · 2024-08-09T17:43:04.000-05:00
diff --git a/Monitoring/auto-add-cw-alarms/auto_add_cw_alarms.py b/Monitoring/auto-add-cw-alarms/auto_add_cw_alarms.py
@@ -26,6 +26,10 @@
 #
 ################################################################################
 #
+# The following variables affect the behavior of the script. They can
+# either be set here, be overridden via the command line options, or be
+# overridden by environment variables.
+#
 # Define which SNS topic you want "volume full" message to be sent to.
 SNStopic=''
 #
@@ -51,6 +55,15 @@
 # Setting it to 100 will disable the creation of the alarm.
 defaultVolumeThreshold=80
 #
+#
+################################################################################
+# You can't change the following variables from the command line or environment
+# variables, since changing them after the program has run once would cause
+# all existing CloudWatch alarms to be abandoned and a new set of alarms to be
+# created. So it is not recommended to change these variables unless you know
+# what you are doing.
+################################################################################
+#
 # Define the prefix for the volume utilization alarm name for the CloudWatch alarms.
 alarmPrefixVolume="Volume_Utilization_for_volume_"
 #
@@ -183,22 +196,34 @@ def contains_fs(fsId, fss):
 # threshold set above.
 ################################################################################
 def getAlarmThresholdTagValue(fsx, arn):
+    #
+    # If there are a lot of volumes, we could get hit by the AWS rate limit,
+    # so we will sleep for a short period of time and then retry. We will
+    # double the sleep time each time we get a rate limit exception until
+    # we get to 5 seconds, then we will just raise the exception.
+    sleep=.125
     #
     # This is put into a try block because it is possible that the volume
     # is deleted between the time we get the list of volumes and the time
     # we try to get the tags for the volume.
-    try:
-        tags = fsx.list_tags_for_resource(ResourceARN=arn)
-        for tag in tags['Tags']:
-            if(tag['Key'].lower() == "alarm_threshold"):
-                return(tag['Value'])
-    except botocore.exceptions.ClientError as e:
-        if e.response['Error']['Code'] == 'ResourceNotFound':
-            return(100) # Return 100 so we don't try to create an alarm.
-        else:
-            raise e
-
-    return(defaultVolumeThreshold)
+    while True:
+        try:
+            tags = fsx.list_tags_for_resource(ResourceARN=arn)
+            for tag in tags['Tags']:
+                if(tag['Key'].lower() == "alarm_threshold"):
+                    return(tag['Value'])
+            return(defaultVolumeThreshold)
+        except botocore.exceptions.ClientError as e:
+            if e.response['Error']['Code'] == 'ResourceNotFound':
+                return(100) # Return 100 so we don't try to create an alarm.
+            elif e.response['Error']['Code'] == 'TooManyRequestsException':
+                sleep = sleep * 2
+                if sleep > 5:
+                    raise e
+                print(f"Sleeping for {sleep} seconds.")
+                time.sleep(sleep)
+            else:
+                raise e
 
 ################################################################################
 # This function returns the value assigned to the "CPU_alarm_threshold" tag
@@ -222,13 +247,102 @@ def getSSDAlarmThresholdTagValue(tags):
             return(tag['Value'])
     return(defaultSSDThreshold)
 
+################################################################################
+# This function will return all the file systems in the region. It will handle the
+# case where there are more file systems than can be returned in a single call.
+# It will also handle the case where we get a rate limit exception.
+################################################################################
+def getFss(fsx):
+    # Page through describe_file_systems(); every call, including the first,
+    # is retried with exponential backoff when AWS rate limits the request.
+    sleep = .125        # Current backoff; doubled before each retry.
+    fss = []
+    kwargs = {}         # Holds NextToken once a page has been returned.
+    while True:
+        try:
+            response = fsx.describe_file_systems(**kwargs)
+            fss += response['FileSystems']
+            if not response.get('NextToken'):
+                return fss
+            kwargs = {'NextToken': response['NextToken']}
+            sleep = .125    # Reset the backoff after a successful call.
+        except botocore.exceptions.ClientError as e:
+            if e.response['Error']['Code'] != 'TooManyRequestsException':
+                raise
+            sleep = sleep * 2   # Exponential backoff.
+            if sleep > 5:
+                # Give up once the next sleep would exceed 5 seconds.
+                raise
+            print(f"Sleeping for {sleep} seconds for file systems.")
+            time.sleep(sleep)
+
+################################################################################
+# This function will return all the volumes in the region. It will handle the
+# case where there are more volumes than can be returned in a single call.
+# It will also handle the case where we get a rate limit exception.
+################################################################################
+def getVolumes(fsx):
+    # Page through describe_volumes(); every call, including the first, is
+    # retried with exponential backoff when AWS rate limits the request.
+    # Returns the combined 'Volumes' list from every page.
+    sleep = .125        # Current backoff; doubled before each retry.
+    volumes = []
+    kwargs = {}         # Holds NextToken once a page has been returned.
+    while True:
+        try:
+            response = fsx.describe_volumes(**kwargs)
+            volumes += response['Volumes']
+            if not response.get('NextToken'):
+                return volumes
+            kwargs = {'NextToken': response['NextToken']}
+            sleep = .125    # Reset the backoff after a successful call.
+        except botocore.exceptions.ClientError as e:
+            if e.response['Error']['Code'] != 'TooManyRequestsException':
+                raise
+            sleep = sleep * 2   # Exponential backoff.
+            if sleep > 5:
+                # Give up once the next sleep would exceed 5 seconds.
+                raise
+            print(f"Sleeping for {sleep} seconds for volumes.")
+            time.sleep(sleep)
+
+################################################################################
+# This function will return all the alarms in the region. It will handle the
+# case where there are more alarms than can be returned in a single call.
+# It will also handle the case where we get a rate limit exception.
+################################################################################
+def getAlarms(cw):
+    # Page through describe_alarms(); every call, including the first, is
+    # retried with exponential backoff when AWS throttles the request.
+    # Returns the combined 'MetricAlarms' list from every page.
+    # NOTE(review): CloudWatch is expected to throttle with 'Throttling' -- confirm.
+    throttleCodes = ('TooManyRequestsException', 'Throttling', 'ThrottlingException')
+    sleep = .125        # Current backoff; doubled before each retry.
+    alarms = []
+    kwargs = {}         # Holds NextToken once a page has been returned.
+    while True:
+        try:
+            response = cw.describe_alarms(**kwargs)
+            alarms += response['MetricAlarms']
+            if not response.get('NextToken'):
+                return alarms
+            kwargs = {'NextToken': response['NextToken']}
+            sleep = .125    # Reset the backoff after a successful call.
+        except botocore.exceptions.ClientError as e:
+            if e.response['Error']['Code'] not in throttleCodes:
+                raise
+            sleep = sleep * 2   # Exponential backoff.
+            if sleep > 5:       # Give up once the next sleep would exceed 5s.
+                raise
+            print(f"Sleeping for {sleep} seconds for alarms.")
+            time.sleep(sleep)
+
 ################################################################################
 # This is the main logic of the program. It loops on all the regions then all
 # the fsx volumes within the region, checking to see if any of them already
 # have a CloudWatch alarm, and if not, add one.
 ################################################################################
 def lambda_handler(event, context):
-    global customerId, regions
+    global customerId, regions, SNStopic, accountId
     #
     # If the customer ID is set, reformat to be used in the alarm description.
     if customerId != '':
@@ -254,23 +368,9 @@ def lambda_handler(event, context):
             cw = boto3.client('cloudwatch', region_name=region)
             #
             # Get all the file systems, volumes and alarm in the region.
-            response = fsx.describe_file_systems()
-            fss = response['FileSystems']
-            while response.get('NextToken'):
-                response = fsx.describe_file_systems(NextToken=response['NextToken'])
-                fss += response['FileSystems']
-
-            response = fsx.describe_volumes()
-            volumes = response['Volumes']
-            while response.get('NextToken'):
-                response = fsx.describe_volumes(NextToken=response['NextToken'])
-                volumes += response['Volumes']
-
-            response = cw.describe_alarms()
-            alarms = response['MetricAlarms']
-            while response.get('NextToken'):
-                response = cw.describe_alarms(NextToken=response['NextToken'])
-                alarms += response['MetricAlarms']
+            fss     = getFss(fsx)
+            volumes = getVolumes(fsx)
+            alarms  = getAlarms(cw)
             #
             # Scan for filesystems without CPU Utilization Alarm.
             for fs in fss:
@@ -361,6 +461,14 @@ def usage():
 regions = []
 dryRun = False
 #
+# Let environment variables override the settings above; keep them otherwise.
+customerId = os.environ.get('customerId', customerId)
+accountId  = os.environ.get('accountId',  accountId)
+SNStopic   = os.environ.get('SNStopic',   SNStopic)
+defaultCPUThreshold    = int(os.environ.get('defaultCPUThreshold',    defaultCPUThreshold))
+defaultSSDThreshold    = int(os.environ.get('defaultSSDThreshold',    defaultSSDThreshold))
+defaultVolumeThreshold = int(os.environ.get('defaultVolumeThreshold', defaultVolumeThreshold))
+#
 # Check to see if we are bring run from a command line or a Lmabda function.
 if os.environ.get('AWS_LAMBDA_FUNCTION_NAME') == None:
     argumentList = sys.argv[1:]