Skip to content

Commit 432de10

Browse files
authored
Fix lambda data (#167)
* fix lambda inventory as Environment breaks crawler
* rename lambda to lambda_functions
* rename lambda to lambda_functions
* lint and refactor
1 parent beb717f commit 432de10

File tree

5 files changed

+90
-98
lines changed

5 files changed

+90
-98
lines changed

data-collection/deploy/module-budgets.yaml

Lines changed: 49 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -98,39 +98,41 @@ Resources:
9898
Properties:
9999
FunctionName: !Sub '${ResourcePrefix}${CFDataName}-Lambda'
100100
Description: !Sub "Lambda function to retrieve ${CFDataName}"
101-
Runtime: python3.10
101+
Runtime: python3.12
102102
Architectures: [x86_64]
103103
Code:
104104
ZipFile: |
105-
#Author Stephanie Gooch 2021
106-
#Mohideen - Added Budgets tag collection module
105+
#Authors:
106+
# Stephanie Gooch - initial version
107+
# Mohideen - Added Budgets tag collection module
107108
import os
108109
import json
109110
import logging
110111
import datetime
111112
from json import JSONEncoder
112113
import sys
114+
115+
# update boto3 for list_tags_for_resource api
113116
from pip._internal import main
114117
main(['install', '-I', '-q', 'boto3', '--target', '/tmp/', '--no-cache-dir', '--disable-pip-version-check'])
115118
sys.path.insert(0,'/tmp/')
116119
117-
import boto3
120+
import boto3 #pylint: disable=C0413
118121
119122
BUCKET = os.environ["BUCKET_NAME"]
120123
PREFIX = os.environ["PREFIX"]
121-
ROLE_NAME = os.environ['ROLENAME']
124+
ROLE_NAME = os.environ['ROLE_NAME']
122125
TMP_FILE = "/tmp/data.json"
123-
REGIONS = ["us-east-1"]
124126
125127
logger = logging.getLogger(__name__)
126128
logger.setLevel(getattr(logging, os.environ.get('LOG_LEVEL', 'INFO').upper(), logging.INFO))
127129
128-
# subclass JSONEncoder
129130
class DateTimeEncoder(JSONEncoder):
130-
# Override the default method
131-
def default(self, obj):
132-
if isinstance(obj, (datetime.date, datetime.datetime)):
133-
return obj.isoformat()
131+
"""encoder for json with time object"""
132+
def default(self, o):
133+
if isinstance(o, (datetime.date, datetime.datetime)):
134+
return o.isoformat()
135+
return None
134136
135137
def assume_role(account_id, service, region):
136138
cred = boto3.client('sts', region_name=region).assume_role(
@@ -144,63 +146,54 @@ Resources:
144146
aws_session_token=cred['SessionToken']
145147
)
146148
147-
def lambda_handler(event, context):
149+
def lambda_handler(event, context): #pylint: disable=W0613
148150
logger.info(f"Event data {json.dumps(event)}")
149151
if 'account' not in event:
150152
raise ValueError(
151153
"Please do not trigger this Lambda manually."
152154
"Find the corresponding state machine in Step Functions and Trigger from there."
153155
)
154156
collection_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
155-
try:
156-
account = json.loads(event["account"])
157-
account_id = account["account_id"]
158-
account_name = account["account_name"]
159-
payer_id = account["payer_id"]
160-
logger.info(f"Collecting data for account: {account_id}")
161-
budgets_client = assume_role(account_id, "budgets", REGIONS[0])
162-
paginator = budgets_client.get_paginator("describe_budgets") #Paginator for a large list of accounts
163-
response_iterator = paginator.paginate(AccountId=account_id)
164-
count = 0
165-
with open(TMP_FILE, "w") as f:
166-
for budgets in response_iterator:
167-
if not 'Budgets' in budgets: continue
168-
for budget in budgets['Budgets']:
169-
count += 1
170-
budget['collection_time'] = collection_time
171-
logger.debug(budget)
172-
# Fetch tags for the budget using List tag for resource API
173-
budget_name = budget['BudgetName']
174-
resource_arn = f"arn:aws:budgets::{account_id}:budget/{budget_name}"
175-
budget_tag = budgets_client.list_tags_for_resource(
176-
ResourceARN=f"{resource_arn}"
177-
)
178-
if budget_tag['ResourceTags'] is not None:
179-
budget.update({'Account_ID': account_id, 'Account_Name': account_name, 'Tags': budget_tag['ResourceTags']})
180-
else:
181-
budget.update({'Account_ID': account_id, 'Account_Name': account_name})
182-
# Fetch CostFilters if available
183-
if 'CostFilters' not in budget or len(budget['CostFilters']) == 0 or 'PlannedBudgetLimits' not in budget:
184-
budget.update({'CostFilters': {'Filter': ['None']}})
185-
dataJSONData = json.dumps(budget, cls=DateTimeEncoder)
186-
f.write(dataJSONData)
187-
f.write("\n")
188-
logger.info(f"Budgets collected: {count}")
189-
s3_upload(account_id, payer_id)
190-
except Exception as e:
191-
logger.warning(f"Error: {type(e)} {e}")
157+
aws_partition = boto3.session.Session().get_partition_for_region(boto3.session.Session().region_name)
158+
account = json.loads(event["account"])
159+
account_id = account["account_id"]
160+
account_name = account["account_name"]
161+
payer_id = account["payer_id"]
162+
163+
logger.info(f"Collecting data for account: {account_id}")
164+
budgets_client = assume_role(account_id, "budgets", "us-east-1") # must be us-east-1
165+
count = 0
166+
with open(TMP_FILE, "w", encoding='utf-8') as f:
167+
for budget in budgets_client.get_paginator("describe_budgets").paginate(AccountId=account_id).search('Budgets'):
168+
budget['collection_time'] = collection_time
169+
170+
# Fetch tags for the budget using List tag for resource API
171+
budget_name = budget['BudgetName']
172+
budget_tags = budgets_client.list_tags_for_resource(ResourceARN=f"arn:{aws_partition}:budgets::{account_id}:budget/{budget_name}")
173+
budget.update({
174+
'Account_ID': account_id,
175+
'Account_Name': account_name,
176+
'Tags': budget_tags.get('ResourceTags') or []
177+
})
178+
179+
# Fetch CostFilters if available
180+
if 'CostFilters' not in budget or len(budget['CostFilters']) == 0 or 'PlannedBudgetLimits' not in budget:
181+
budget.update({'CostFilters': {'Filter': ['None']}})
182+
183+
f.write(json.dumps(budget, cls=DateTimeEncoder) + "\n")
184+
count += 1
185+
logger.info(f"Budgets collected: {count}")
186+
s3_upload(account_id, payer_id)
187+
192188
193189
def s3_upload(account_id, payer_id):
194190
if os.path.getsize(TMP_FILE) == 0:
195191
logger.info(f"No data in file for {PREFIX}")
196192
return
197193
key = datetime.datetime.now().strftime(f"{PREFIX}/{PREFIX}-data/payer_id={payer_id}/year=%Y/month=%m/budgets-{account_id}.json")
198-
try:
199-
res = boto3.client('s3').upload_file(TMP_FILE, BUCKET, key)
200-
logger.info(f'res={res}')
201-
logger.info(f"Budget data for {account_id} stored at s3://{BUCKET}/{key}")
202-
except Exception as exc:
203-
logger.warning(exc)
194+
boto3.client('s3').upload_file(TMP_FILE, BUCKET, key)
195+
logger.info(f"Budget data for {account_id} stored at s3://{BUCKET}/{key}")
196+
204197
Handler: 'index.lambda_handler'
205198
MemorySize: 2688
206199
Timeout: 300
@@ -209,7 +202,7 @@ Resources:
209202
Variables:
210203
BUCKET_NAME: !Ref DestinationBucket
211204
PREFIX: !Ref CFDataName
212-
ROLENAME: !Ref MultiAccountRoleName
205+
ROLE_NAME: !Ref MultiAccountRoleName
213206

214207
Metadata:
215208
cfn_nag:

data-collection/deploy/module-inventory.yaml

Lines changed: 34 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ Parameters:
5656
Description: ARN of a Lambda for Managing GlueTable
5757
AwsObjects:
5858
Type: CommaDelimitedList
59-
Default: OpensearchDomains, ElasticacheClusters, RdsDbInstances, EBS, AMI, Snapshot, Ec2Instances, VpcInstances, RdsDbSnapshots, EKSClusters, AWSLambda
59+
Default: OpensearchDomains, ElasticacheClusters, RdsDbInstances, EBS, AMI, Snapshot, Ec2Instances, VpcInstances, RdsDbSnapshots, EKSClusters, LambdaFunctions
6060
Description: Services for pulling price data
6161

6262
Mappings:
@@ -798,10 +798,10 @@ Mappings:
798798
paths: Arn,Name,CreatedAt,Version,accountid,collection_date,region
799799
SerializationLibrary: org.openx.data.jsonserde.JsonSerDe
800800
TableType: EXTERNAL_TABLE
801-
AWSLambda:
802-
path: lambda
801+
LambdaFunctions:
802+
path: lambda-functions
803803
table:
804-
- Name: inventory_lambda_data
804+
- Name: inventory_lambda_functions_data
805805
Parameters: { "classification" : "json", "compressionType": "none" }
806806
PartitionKeys:
807807
- Name: payer_id
@@ -863,7 +863,7 @@ Mappings:
863863
- Name: vpcconfig
864864
Type: struct<subnetids:array<string>,securitygroupids:array<string>,vpcid:string,ipv6allowedfordualstack:boolean>
865865
InputFormat: org.apache.hadoop.mapred.TextInputFormat
866-
Location: !Sub s3://${DestinationBucket}/inventory/inventory-lambda-data/
866+
Location: !Sub s3://${DestinationBucket}/inventory/inventory-lambda-functions-data/
867867
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
868868
SerdeInfo:
869869
Parameters:
@@ -970,9 +970,7 @@ Resources:
970970
session = assume_session(account_id, region)
971971
client = session.client(service, region_name=region)
972972
try:
973-
paginator = client.get_paginator(function_name)
974-
for obj in paginator.paginate(**(params or {})).search(obj_name):
975-
yield obj
973+
yield from client.get_paginator(function_name).paginate(**(params or {})).search(obj_name)
976974
except Exception as exc: #pylint: disable=broad-exception-caught
977975
logger.info(f'Error in scan {function_name}/{account_id}: {exc}')
978976
@@ -996,32 +994,32 @@ Resources:
996994
logger.info(f'scan {service}/{account_id}/{region}: {exc}')
997995
998996
def eks_clusters_scan(account_id, region):
999-
"""special function to scan EKS clusters"""
1000-
service = "eks"
1001-
session = assume_session(account_id, region)
1002-
client = session.client(service, region_name=region)
1003-
try:
1004-
for cluster_name in (
1005-
client.get_paginator("list_clusters")
1006-
.paginate(
1007-
PaginationConfig={
1008-
"PageSize": 100,
1009-
}
1010-
)
1011-
.search("clusters")
1012-
):
1013-
cluster = client.describe_cluster(name=cluster_name)
1014-
yield {
1015-
"Arn": cluster["cluster"]["arn"],
1016-
"Name": cluster["cluster"]["name"],
1017-
"CreatedAt": datetime.strftime(
1018-
cluster["cluster"]["createdAt"].astimezone(tz=timezone.utc), "%Y-%m-%dT%H:%M:%SZ"
1019-
),
1020-
"Version": cluster["cluster"]["version"],
1021-
}
1022-
except Exception as exc:
1023-
logger.error(f"Cannot get info from {account_id}/{region}: {type(exc)}-{exc}")
1024-
return []
997+
"""special function to scan EKS clusters"""
998+
service = "eks"
999+
session = assume_session(account_id, region)
1000+
client = session.client(service, region_name=region)
1001+
try:
1002+
for cluster_name in (
1003+
client.get_paginator("list_clusters")
1004+
.paginate(
1005+
PaginationConfig={
1006+
"PageSize": 100,
1007+
}
1008+
)
1009+
.search("clusters")
1010+
):
1011+
cluster = client.describe_cluster(name=cluster_name)
1012+
yield {
1013+
"Arn": cluster["cluster"]["arn"],
1014+
"Name": cluster["cluster"]["name"],
1015+
"CreatedAt": datetime.strftime(
1016+
cluster["cluster"]["createdAt"].astimezone(tz=timezone.utc), "%Y-%m-%dT%H:%M:%SZ"
1017+
),
1018+
"Version": cluster["cluster"]["version"],
1019+
}
1020+
except Exception as exc: #pylint: disable=W0718
1021+
logger.error(f"Cannot get info from {account_id}/{region}: {type(exc)}-{exc}")
1022+
return []
10251023
10261024
def lambda_handler(event, context): #pylint: disable=unused-argument
10271025
""" this lambda collects ami, snapshots and volumes from linked accounts
@@ -1085,7 +1083,7 @@ Resources:
10851083
service='ec2',
10861084
function_name='describe_vpcs'
10871085
),
1088-
'lambda' : partial(
1086+
'lambda-functions' : partial(
10891087
paginated_scan,
10901088
service='lambda',
10911089
function_name='list_functions',
@@ -1114,7 +1112,7 @@ Resources:
11141112
obj[f"tag_{tag['Key']}"] = tag["Value"]
11151113
obj['collection_date'] = collection_date
11161114
obj['region'] = region
1117-
if 'Environment' in obj and name == 'lambda':
1115+
if 'Environment' in obj and name == 'lambda_functions':
11181116
obj['Environment'] = to_json(obj['Environment']) # this property breaks crawler as it has a different key structure
11191117
file_.write(to_json(obj) + "\n")
11201118
logger.info(f"Collected {counter} total {name} instances")

data-collection/test/run-test-from-scratch.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# vars
55
account_id=$(aws sts get-caller-identity --query "Account" --output text )
66
bucket=cid-$account_id-test
7+
export bucket
78

89
# upload files
910
./data-collection/utils/upload.sh "$bucket"

data-collection/test/test_from_scratch.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -82,13 +82,13 @@ def test_inventory_vpc_data(athena):
8282
data = athena_query(athena=athena, sql_query='SELECT * FROM "optimization_data"."inventory_vpc_data" LIMIT 10;')
8383
assert len(data) > 0, 'inventory_vpc_data is empty'
8484

85-
def test_inventory_rds_snaphot_data(athena):
85+
def test_inventory_rds_snapshot_data(athena):
8686
data = athena_query(athena=athena, sql_query='SELECT * FROM "optimization_data"."inventory_rds_db_snapshots_data" LIMIT 10;')
8787
assert len(data) > 0, 'inventory_rds_db_snapshots_data is empty'
8888

89-
def test_inventory_lambda_data(athena):
90-
data = athena_query(athena=athena, sql_query='SELECT * FROM "optimization_data"."inventory_lambda_data" LIMIT 10;')
91-
assert len(data) > 0, 'inventory_lambda_data is empty'
89+
def test_inventory_lambda_functions_data(athena):
90+
data = athena_query(athena=athena, sql_query='SELECT * FROM "optimization_data"."inventory_lambda_functions_data" LIMIT 10;')
91+
assert len(data) > 0, 'inventory_lambda_functions_data is empty'
9292

9393
def test_rds_usage_data(athena):
9494
data = athena_query(athena=athena, sql_query='SELECT * FROM "optimization_data"."rds_usage_data" LIMIT 10;')
@@ -144,7 +144,7 @@ def test_pricing_rds_data(athena):
144144

145145
def test_pricing_lambda_data(athena):
146146
data = athena_query(athena=athena, sql_query='SELECT * FROM "optimization_data"."pricing_lambda_data" LIMIT 10;')
147-
assert len(data) > 0, 'pricing_awslambda_data is empty'
147+
assert len(data) > 0, 'pricing_lambda_data is empty'
148148

149149
def test_pricing_regionnames_data(athena):
150150
data = athena_query(athena=athena, sql_query='SELECT * FROM "optimization_data"."pricing_regionnames_data" LIMIT 10;')

data-collection/test/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ def trigger_update(account_id):
346346
f'arn:aws:states:{region}:{account_id}:stateMachine:{PREFIX}inventory-Ec2Instances-StateMachine',
347347
f'arn:aws:states:{region}:{account_id}:stateMachine:{PREFIX}inventory-VpcInstances-StateMachine',
348348
f'arn:aws:states:{region}:{account_id}:stateMachine:{PREFIX}inventory-RdsDbSnapshots-StateMachine',
349-
f'arn:aws:states:{region}:{account_id}:stateMachine:{PREFIX}inventory-AWSLambda-StateMachine',
349+
f'arn:aws:states:{region}:{account_id}:stateMachine:{PREFIX}inventory-LambdaFunctions-StateMachine',
350350
f'arn:aws:states:{region}:{account_id}:stateMachine:{PREFIX}rds-usage-StateMachine',
351351
f'arn:aws:states:{region}:{account_id}:stateMachine:{PREFIX}transit-gateway-StateMachine',
352352
f'arn:aws:states:{region}:{account_id}:stateMachine:{PREFIX}trusted-advisor-StateMachine',

0 commit comments

Comments (0)