# Region whose STS endpoint is used for the retry when the requested region's
# endpoint reports RegionDisabledException, keyed by partition. Unknown
# partitions keep the previous behaviour (us-east-1).
_STS_FALLBACK_REGIONS = {
    'aws': 'us-east-1',
    'aws-cn': 'cn-north-1',
    'aws-us-gov': 'us-gov-west-1',
}


class _AssumeRoleFailed(Exception):
    """Internal signal that STS could not provide credentials.

    Raised (instead of returning None) inside the cached function because
    lru_cache memoizes return values but never exceptions.
    """


@lru_cache(maxsize=100)
def _assume_session_cached(account_id, region):
    """Assume ROLE_NAME in account_id and return a boto3 Session.

    Raises _AssumeRoleFailed when STS cannot provide credentials, so that
    only successful sessions end up in the cache.
    """
    partition = boto3.session.Session().get_partition_for_region(region_name=region)
    role_arn = f"arn:{partition}:iam::{account_id}:role/{ROLE_NAME}"

    def request_credentials(sts_region):
        return boto3.client('sts', region_name=sts_region).assume_role(
            RoleArn=role_arn,
            RoleSessionName="data_collection"
        )['Credentials']

    try:
        credentials = request_credentials(region)
    except Exception as exc:  #pylint: disable=broad-exception-caught
        # Identify the error by its code rather than via `sts_client.exceptions`:
        # a client variable is unbound when client creation itself is what failed,
        # which turned the real error into an UnboundLocalError.
        error_code = (getattr(exc, 'response', None) or {}).get('Error', {}).get('Code')
        if error_code != 'RegionDisabledException':
            logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}")
            raise _AssumeRoleFailed(str(exc)) from exc
        logger.warning(f"STS region disabled for {region}, falling back to global STS: {exc}")
        try:
            credentials = request_credentials(_STS_FALLBACK_REGIONS.get(partition, 'us-east-1'))
        except Exception as fallback_exc:  #pylint: disable=broad-exception-caught
            logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}")
            raise _AssumeRoleFailed(str(fallback_exc)) from fallback_exc

    return boto3.session.Session(
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken']
    )


def assume_session(account_id, region):
    """Assume role in account with fallback to another STS region only if the region is disabled.

    Args:
        account_id: AWS account that owns the role to assume.
        region: region whose STS endpoint is tried first.

    Returns:
        A boto3 Session built from the temporary credentials, or None when
        STS could not provide them (the error is logged). Successful sessions
        are memoized per (account_id, region); failures are not, so a later
        call retries STS.

    NOTE(review): memoized sessions hold temporary credentials; confirm the
    execution environment does not outlive their validity.
    """
    try:
        return _assume_session_cached(account_id, region)
    except _AssumeRoleFailed:
        return None


def assume_role(account_id, service, region):
    """Return a `service` client for `region` using the assumed role, or None if STS failed."""
    session = assume_session(account_id, region)
    if not session:
        logger.warning(f"Skipping {service} client creation for account {account_id} due to STS failure.")
        return None
    return session.client(service, region_name=region)
RoleArn=f"arn:{partition}:iam::{account_id}:role/{ROLE_NAME}", + RoleSessionName="data_collection" + )['Credentials'] + except sts_client.exceptions.RegionDisabledException as region_exc: + logger.warning(f"STS region disabled for {region}, falling back to global STS: {region_exc}") + try: + global_sts = boto3.client('sts', region_name='us-east-1') + credentials = global_sts.assume_role( + RoleArn=f"arn:{partition}:iam::{account_id}:role/{ROLE_NAME}", + RoleSessionName="data_collection" + )['Credentials'] + except Exception as fallback_exc: + logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}") + return None + except Exception as exc: + logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}") + return None + + return boto3.session.Session( + aws_access_key_id=credentials['AccessKeyId'], + aws_secret_access_key=credentials['SecretAccessKey'], + aws_session_token=credentials['SessionToken'] + ) + def lambda_handler(event, context): #pylint: disable=unused-argument logger.info(f"Event data {json.dumps(event)}") if 'account' not in event: @@ -414,56 +445,61 @@ Resources: ) account = json.loads(event["account"]) payer_id = account["account_id"] - try: - result_messages = [] - error_messages = [] - for region in REGIONS: - partition = boto3.session.Session().get_partition_for_region(region_name=region) - credentials = boto3.client('sts', region_name=region).assume_role( - RoleArn=f"arn:{partition}:iam::{payer_id}:role/{ROLE_NAME}", - RoleSessionName="data_collection" - )["Credentials"] - co = boto3.client( - "compute-optimizer", - region_name=region, - aws_access_key_id=credentials['AccessKeyId'], - aws_secret_access_key=credentials['SecretAccessKey'], - aws_session_token=credentials['SessionToken'], - ) - export_funcs = { - 'ec2_instance': partial(co.export_ec2_instance_recommendations, recommendationPreferences={'cpuVendorArchitectures': ARCH}), - 'auto_scale': partial(co.export_auto_scaling_group_recommendations, 
recommendationPreferences={'cpuVendorArchitectures': ARCH}), - 'lambda': co.export_lambda_function_recommendations, - 'ebs_volume': co.export_ebs_volume_recommendations, - 'ecs_service': co.export_ecs_service_recommendations, - 'license': co.export_license_recommendations, - 'rds_database': partial(co.export_rds_database_recommendations, recommendationPreferences={'cpuVendorArchitectures': ARCH}), - 'idle': co.export_idle_recommendations, - } - bucket = BUCKET_PREFIX + '.' + region - logger.info(f"INFO: bucket={bucket}") - for name, func in export_funcs.items(): - try: - res = func( - includeMemberAccounts=INCLUDE_MEMBER_ACCOUNTS, - s3DestinationConfig={ - 'bucket': bucket, - 'keyPrefix': date.today().strftime( - f'compute_optimizer/compute_optimizer_{name}/payer_id={payer_id}/year=%Y/month=%-m' - ), - } - ) - result_messages.append(f"{region} {name} export queued. JobId: {res['jobId']}") - except co.exceptions.LimitExceededException: - result_messages.append(f"{region} {name} export is already in progress.") - except Exception as exc: #pylint: disable=broad-exception-caught - error_messages.append(f"ERROR: {region} {name} - {exc}") - if result_messages: - logger.info("Success:\n"+"\n".join(result_messages)) - if error_messages: - raise Exception(f"There were {len(error_messages)} errors, out of {len(result_messages) + len(error_messages)} exports: \n" + "\n".join(error_messages)) #pylint: disable=broad-exception-raised - except Exception as exc: #pylint: disable=broad-exception-caught - logger.error(f"Error {type(exc).__name__} with message {exc}") + + result_messages = [] + error_messages = [] + + for region in REGIONS: + logger.info(f"Processing region: {region}") + session = assume_session(payer_id, region) + if not session: + logger.warning(f"Skipping region {region} due to STS failure.") + continue + + try: + co = session.client("compute-optimizer", region_name=region) + except Exception as co_init_exc: + logger.error(f"Failed to initialise Compute Optimizer 
client in {region}: {co_init_exc}") + error_messages.append(f"{region} - Compute Optimizer client init failed: {co_init_exc}") + continue + + export_funcs = { + 'ec2_instance': partial(co.export_ec2_instance_recommendations, recommendationPreferences={'cpuVendorArchitectures': ARCH}), + 'auto_scale': partial(co.export_auto_scaling_group_recommendations, recommendationPreferences={'cpuVendorArchitectures': ARCH}), + 'lambda': co.export_lambda_function_recommendations, + 'ebs_volume': co.export_ebs_volume_recommendations, + 'ecs_service': co.export_ecs_service_recommendations, + 'license': co.export_license_recommendations, + 'rds_database': partial(co.export_rds_database_recommendations, recommendationPreferences={'cpuVendorArchitectures': ARCH}), + 'idle': co.export_idle_recommendations, + } + + bucket = BUCKET_PREFIX + '.' + region + logger.info(f"INFO: bucket={bucket}") + + for name, func in export_funcs.items(): + try: + res = func( + includeMemberAccounts=INCLUDE_MEMBER_ACCOUNTS, + s3DestinationConfig={ + 'bucket': bucket, + 'keyPrefix': date.today().strftime( + f'compute_optimizer/compute_optimizer_{name}/payer_id={payer_id}/year=%Y/month=%-m' + ), + } + ) + result_messages.append(f"{region} {name} export queued. 
# Region whose STS endpoint is used for the retry when the requested region's
# endpoint reports RegionDisabledException, keyed by partition. Unknown
# partitions keep the previous behaviour (us-east-1).
_STS_FALLBACK_REGIONS = {
    'aws': 'us-east-1',
    'aws-cn': 'cn-north-1',
    'aws-us-gov': 'us-gov-west-1',
}


class _AssumeRoleFailed(Exception):
    """Internal signal that STS could not provide credentials.

    Raised (instead of returning None) inside the cached function because
    lru_cache memoizes return values but never exceptions.
    """


@lru_cache(maxsize=100)
def _assume_session_cached(account_id, region):
    """Assume ROLE_NAME in account_id and return a boto3 Session.

    Raises _AssumeRoleFailed when STS cannot provide credentials, so that
    only successful sessions end up in the cache.
    """
    partition = boto3.session.Session().get_partition_for_region(region_name=region)
    role_arn = f"arn:{partition}:iam::{account_id}:role/{ROLE_NAME}"

    def request_credentials(sts_region):
        return boto3.client('sts', region_name=sts_region).assume_role(
            RoleArn=role_arn,
            RoleSessionName="data_collection"
        )['Credentials']

    try:
        credentials = request_credentials(region)
    except Exception as exc:  #pylint: disable=broad-exception-caught
        # Identify the error by its code rather than via `sts_client.exceptions`:
        # a client variable is unbound when client creation itself is what failed,
        # which turned the real error into an UnboundLocalError.
        error_code = (getattr(exc, 'response', None) or {}).get('Error', {}).get('Code')
        if error_code != 'RegionDisabledException':
            logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}")
            raise _AssumeRoleFailed(str(exc)) from exc
        logger.warning(f"STS region disabled for {region}, falling back to global STS: {exc}")
        try:
            credentials = request_credentials(_STS_FALLBACK_REGIONS.get(partition, 'us-east-1'))
        except Exception as fallback_exc:  #pylint: disable=broad-exception-caught
            logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}")
            raise _AssumeRoleFailed(str(fallback_exc)) from fallback_exc

    return boto3.session.Session(
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken']
    )


def assume_session(account_id, region):
    """Assume role in account with fallback to another STS region only if the region is disabled.

    Args:
        account_id: AWS account that owns the role to assume.
        region: region whose STS endpoint is tried first.

    Returns:
        A boto3 Session built from the temporary credentials, or None when
        STS could not provide them (the error is logged). Successful sessions
        are memoized per (account_id, region); failures are not, so a later
        call retries STS.

    NOTE(review): memoized sessions hold temporary credentials; confirm the
    execution environment does not outlive their validity.
    """
    try:
        return _assume_session_cached(account_id, region)
    except _AssumeRoleFailed:
        return None
# Region whose STS endpoint is used for the retry when the requested region's
# endpoint reports RegionDisabledException, keyed by partition. Unknown
# partitions keep the previous behaviour (us-east-1).
_STS_FALLBACK_REGIONS = {
    'aws': 'us-east-1',
    'aws-cn': 'cn-north-1',
    'aws-us-gov': 'us-gov-west-1',
}


class _AssumeRoleFailed(Exception):
    """Internal signal that STS could not provide credentials.

    Raised (instead of returning None) inside the cached function because
    lru_cache memoizes return values but never exceptions.
    """


@lru_cache(maxsize=100)
def _assume_session_cached(account_id, region):
    """Assume ROLENAME in account_id and return a boto3 Session.

    Raises _AssumeRoleFailed when STS cannot provide credentials, so that
    only successful sessions end up in the cache.
    """
    partition = boto3.session.Session().get_partition_for_region(region_name=region)
    role_arn = f"arn:{partition}:iam::{account_id}:role/{ROLENAME}"

    def request_credentials(sts_region):
        return boto3.client('sts', region_name=sts_region).assume_role(
            RoleArn=role_arn,
            RoleSessionName="data_collection"
        )['Credentials']

    try:
        credentials = request_credentials(region)
    except Exception as exc:  #pylint: disable=broad-exception-caught
        # Identify the error by its code rather than via `sts_client.exceptions`:
        # a client variable is unbound when client creation itself is what failed,
        # which turned the real error into an UnboundLocalError.
        error_code = (getattr(exc, 'response', None) or {}).get('Error', {}).get('Code')
        if error_code != 'RegionDisabledException':
            logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}")
            raise _AssumeRoleFailed(str(exc)) from exc
        logger.warning(f"STS region disabled for {region}, falling back to global STS: {exc}")
        try:
            credentials = request_credentials(_STS_FALLBACK_REGIONS.get(partition, 'us-east-1'))
        except Exception as fallback_exc:  #pylint: disable=broad-exception-caught
            logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}")
            raise _AssumeRoleFailed(str(fallback_exc)) from fallback_exc

    return boto3.session.Session(
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken']
    )


def assume_session(account_id, region):
    """Assume role in account with fallback to another STS region only if the region is disabled.

    Args:
        account_id: AWS account that owns the role to assume.
        region: region whose STS endpoint is tried first.

    Returns:
        A boto3 Session built from the temporary credentials, or None when
        STS could not provide them (the error is logged). Successful sessions
        are memoized per (account_id, region); failures are not, so a later
        call retries STS.

    NOTE(review): memoized sessions hold temporary credentials; confirm the
    execution environment does not outlive their validity.
    """
    try:
        return _assume_session_cached(account_id, region)
    except _AssumeRoleFailed:
        return None
boto3.session.Session().get_partition_for_region(region_name=region) - credentials = boto3.client('sts', region_name=region).assume_role( - RoleArn=f"arn:{partition}:iam::{account_id}:role/{ROLENAME}" , - RoleSessionName="data_collection" - )['Credentials'] + try: + sts_client = boto3.client('sts', region_name=region) + credentials = sts_client.assume_role( + RoleArn=f"arn:{partition}:iam::{account_id}:role/{ROLENAME}", + RoleSessionName="data_collection" + )['Credentials'] + except sts_client.exceptions.RegionDisabledException as region_exc: + logger.warning(f"STS region disabled for {region}, falling back to global STS: {region_exc}") + try: + global_sts = boto3.client('sts', region_name='us-east-1') + credentials = global_sts.assume_role( + RoleArn=f"arn:{partition}:iam::{account_id}:role/{ROLENAME}", + RoleSessionName="data_collection" + )['Credentials'] + except Exception as fallback_exc: + logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}") + return None + except Exception as exc: + logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}") + return None + return boto3.session.Session( aws_access_key_id=credentials['AccessKeyId'], aws_secret_access_key=credentials['SecretAccessKey'], @@ -1279,6 +1296,9 @@ Resources: """ paginated scan """ obj_name = obj_name or function_name.split('_')[-1].capitalize() + '[*]' session = assume_session(account_id, region) + if not session: + logger.warning(f"Skipping {service}/{function_name} in {region} due to STS failure.") + return client = session.client(service, region_name=region) try: yield from client.get_paginator(function_name).paginate(**(params or {})).search(obj_name) @@ -1289,6 +1309,9 @@ Resources: """ special treatment for opensearch_scan """ service = 'opensearch' session = assume_session(account_id, region) + if not session: + logger.warning(f"Skipping {service} in {region} due to STS failure.") + return client = session.client(service, region_name=region) try: 
# Region whose STS endpoint is used for the retry when the requested region's
# endpoint reports RegionDisabledException, keyed by partition. Unknown
# partitions keep the previous behaviour (us-east-1).
_STS_FALLBACK_REGIONS = {
    'aws': 'us-east-1',
    'aws-cn': 'cn-north-1',
    'aws-us-gov': 'us-gov-west-1',
}


class _AssumeRoleFailed(Exception):
    """Internal signal that STS could not provide credentials.

    Raised (instead of returning None) inside the cached function because
    lru_cache memoizes return values but never exceptions.
    """


@lru_cache(maxsize=100)
def _assume_session_cached(account_id, region):
    """Assume ROLE in account_id and return a boto3 Session.

    Raises _AssumeRoleFailed when STS cannot provide credentials, so that
    only successful sessions end up in the cache.
    """
    partition = boto3.session.Session().get_partition_for_region(region_name=region)
    role_arn = f"arn:{partition}:iam::{account_id}:role/{ROLE}"

    def request_credentials(sts_region):
        return boto3.client('sts', region_name=sts_region).assume_role(
            RoleArn=role_arn,
            RoleSessionName="data_collection"
        )['Credentials']

    try:
        credentials = request_credentials(region)
    except Exception as exc:  #pylint: disable=broad-exception-caught
        # Identify the error by its code rather than via `sts_client.exceptions`:
        # a client variable is unbound when client creation itself is what failed,
        # which turned the real error into an UnboundLocalError.
        error_code = (getattr(exc, 'response', None) or {}).get('Error', {}).get('Code')
        if error_code != 'RegionDisabledException':
            logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}")
            raise _AssumeRoleFailed(str(exc)) from exc
        logger.warning(f"STS region disabled for {region}, falling back to global STS: {exc}")
        try:
            credentials = request_credentials(_STS_FALLBACK_REGIONS.get(partition, 'us-east-1'))
        except Exception as fallback_exc:  #pylint: disable=broad-exception-caught
            logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}")
            raise _AssumeRoleFailed(str(fallback_exc)) from fallback_exc

    return boto3.session.Session(
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken']
    )


def assume_session(account_id, region):
    """Assume role in account with fallback to another STS region only if the region is disabled.

    Args:
        account_id: AWS account that owns the role to assume.
        region: region whose STS endpoint is tried first.

    Returns:
        A boto3 Session built from the temporary credentials, or None when
        STS could not provide them (the error is logged). Successful sessions
        are memoized per (account_id, region); failures are not, so a later
        call retries STS.

    NOTE(review): memoized sessions hold temporary credentials; confirm the
    execution environment does not outlive their validity.
    """
    try:
        return _assume_session_cached(account_id, region)
    except _AssumeRoleFailed:
        return None
# NOTE(review): the visible change to this module adds no `lru_cache` import;
# it is imported here so the decorator below always resolves.
from functools import lru_cache  #pylint: disable=wrong-import-position

# Region whose STS endpoint is used for the retry when the requested region's
# endpoint reports RegionDisabledException, keyed by partition. Unknown
# partitions keep the previous behaviour (us-east-1).
_STS_FALLBACK_REGIONS = {
    'aws': 'us-east-1',
    'aws-cn': 'cn-north-1',
    'aws-us-gov': 'us-gov-west-1',
}


class _AssumeRoleFailed(Exception):
    """Internal signal that STS could not provide credentials.

    Raised (instead of returning None) inside the cached function because
    lru_cache memoizes return values but never exceptions.
    """


@lru_cache(maxsize=100)
def _assume_session_cached(account_id, region):
    """Assume ROLE in account_id and return a boto3 Session.

    Raises _AssumeRoleFailed when STS cannot provide credentials, so that
    only successful sessions end up in the cache.
    """
    partition = boto3.session.Session().get_partition_for_region(region_name=region)
    role_arn = f"arn:{partition}:iam::{account_id}:role/{ROLE}"

    def request_credentials(sts_region):
        return boto3.client('sts', region_name=sts_region).assume_role(
            RoleArn=role_arn,
            RoleSessionName="data_collection"
        )['Credentials']

    try:
        credentials = request_credentials(region)
    except Exception as exc:  #pylint: disable=broad-exception-caught
        # Identify the error by its code rather than via `sts_client.exceptions`:
        # a client variable is unbound when client creation itself is what failed,
        # which turned the real error into an UnboundLocalError.
        error_code = (getattr(exc, 'response', None) or {}).get('Error', {}).get('Code')
        if error_code != 'RegionDisabledException':
            logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}")
            raise _AssumeRoleFailed(str(exc)) from exc
        logger.warning(f"STS region disabled for {region}, falling back to global STS: {exc}")
        try:
            credentials = request_credentials(_STS_FALLBACK_REGIONS.get(partition, 'us-east-1'))
        except Exception as fallback_exc:  #pylint: disable=broad-exception-caught
            logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}")
            raise _AssumeRoleFailed(str(fallback_exc)) from fallback_exc

    return boto3.session.Session(
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken']
    )


def assume_session(account_id, region):
    """Assume role in account with fallback to another STS region only if the region is disabled.

    Args:
        account_id: AWS account that owns the role to assume.
        region: region whose STS endpoint is tried first.

    Returns:
        A boto3 Session built from the temporary credentials, or None when
        STS could not provide them (the error is logged). Successful sessions
        are memoized per (account_id, region); failures are not, so a later
        call retries STS.

    NOTE(review): memoized sessions hold temporary credentials; confirm the
    execution environment does not outlive their validity.
    """
    try:
        return _assume_session_cached(account_id, region)
    except _AssumeRoleFailed:
        return None
# Region whose STS endpoint is used for the retry when the requested region's
# endpoint reports RegionDisabledException, keyed by partition. Unknown
# partitions keep the previous behaviour (us-east-1).
_STS_FALLBACK_REGIONS = {
    'aws': 'us-east-1',
    'aws-cn': 'cn-north-1',
    'aws-us-gov': 'us-gov-west-1',
}


class _AssumeRoleFailed(Exception):
    """Internal signal that STS could not provide credentials.

    Raised (instead of returning None) inside the cached function because
    lru_cache memoizes return values but never exceptions.
    """


@lru_cache(maxsize=100)
def _assume_session_cached(account_id, region):
    """Assume ROLE_NAME in account_id and return a boto3 Session.

    Raises _AssumeRoleFailed when STS cannot provide credentials, so that
    only successful sessions end up in the cache.
    """
    partition = boto3.session.Session().get_partition_for_region(region_name=region)
    role_arn = f"arn:{partition}:iam::{account_id}:role/{ROLE_NAME}"

    def request_credentials(sts_region):
        return boto3.client('sts', region_name=sts_region).assume_role(
            RoleArn=role_arn,
            RoleSessionName="data_collection"
        )['Credentials']

    try:
        credentials = request_credentials(region)
    except Exception as exc:  #pylint: disable=broad-exception-caught
        # Identify the error by its code rather than via `sts_client.exceptions`:
        # a client variable is unbound when client creation itself is what failed,
        # which turned the real error into an UnboundLocalError.
        error_code = (getattr(exc, 'response', None) or {}).get('Error', {}).get('Code')
        if error_code != 'RegionDisabledException':
            logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}")
            raise _AssumeRoleFailed(str(exc)) from exc
        logger.warning(f"STS region disabled for {region}, falling back to global STS: {exc}")
        try:
            credentials = request_credentials(_STS_FALLBACK_REGIONS.get(partition, 'us-east-1'))
        except Exception as fallback_exc:  #pylint: disable=broad-exception-caught
            logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}")
            raise _AssumeRoleFailed(str(fallback_exc)) from fallback_exc

    return boto3.session.Session(
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken']
    )


def assume_session(account_id, region):
    """Assume role in account with fallback to another STS region only if the region is disabled.

    Args:
        account_id: AWS account that owns the role to assume.
        region: region whose STS endpoint is tried first.

    Returns:
        A boto3 Session built from the temporary credentials, or None when
        STS could not provide them (the error is logged). Successful sessions
        are memoized per (account_id, region); failures are not, so a later
        call retries STS.

    NOTE(review): memoized sessions hold temporary credentials; confirm the
    execution environment does not outlive their validity.
    """
    try:
        return _assume_session_cached(account_id, region)
    except _AssumeRoleFailed:
        return None
# Region whose STS endpoint is used for the retry when the requested region's
# endpoint reports RegionDisabledException, keyed by partition. Unknown
# partitions keep the previous behaviour (us-east-1).
_STS_FALLBACK_REGIONS = {
    'aws': 'us-east-1',
    'aws-cn': 'cn-north-1',
    'aws-us-gov': 'us-gov-west-1',
}


class _AssumeRoleFailed(Exception):
    """Internal signal that STS could not provide credentials.

    Raised (instead of returning None) inside the cached function because
    lru_cache memoizes return values but never exceptions.
    """


@lru_cache(maxsize=100)
def _assume_session_cached(account_id, region):
    """Assume ROLE_NAME in account_id and return a boto3 Session.

    Raises _AssumeRoleFailed when STS cannot provide credentials, so that
    only successful sessions end up in the cache.
    """
    partition = boto3.session.Session().get_partition_for_region(region_name=region)
    role_arn = f"arn:{partition}:iam::{account_id}:role/{ROLE_NAME}"

    def request_credentials(sts_region):
        return boto3.client('sts', region_name=sts_region).assume_role(
            RoleArn=role_arn,
            RoleSessionName="data_collection"
        )['Credentials']

    try:
        credentials = request_credentials(region)
    except Exception as exc:  #pylint: disable=broad-exception-caught
        # Identify the error by its code rather than via `sts_client.exceptions`:
        # a client variable is unbound when client creation itself is what failed,
        # which turned the real error into an UnboundLocalError.
        error_code = (getattr(exc, 'response', None) or {}).get('Error', {}).get('Code')
        if error_code != 'RegionDisabledException':
            logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}")
            raise _AssumeRoleFailed(str(exc)) from exc
        logger.warning(f"STS region disabled for {region}, falling back to global STS: {exc}")
        try:
            credentials = request_credentials(_STS_FALLBACK_REGIONS.get(partition, 'us-east-1'))
        except Exception as fallback_exc:  #pylint: disable=broad-exception-caught
            logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}")
            raise _AssumeRoleFailed(str(fallback_exc)) from fallback_exc

    return boto3.session.Session(
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken']
    )


def assume_session(account_id, region):
    """Assume role in account with fallback to another STS region only if the region is disabled.

    Args:
        account_id: AWS account that owns the role to assume.
        region: region whose STS endpoint is tried first.

    Returns:
        A boto3 Session built from the temporary credentials, or None when
        STS could not provide them (the error is logged). Successful sessions
        are memoized per (account_id, region); failures are not, so a later
        call retries STS.

    NOTE(review): memoized sessions hold temporary credentials; confirm the
    execution environment does not outlive their validity.
    """
    try:
        return _assume_session_cached(account_id, region)
    except _AssumeRoleFailed:
        return None
# Region whose STS endpoint is used for the retry when the requested region's
# endpoint reports RegionDisabledException, keyed by partition. Unknown
# partitions keep the previous behaviour (us-east-1).
_STS_FALLBACK_REGIONS = {
    'aws': 'us-east-1',
    'aws-cn': 'cn-north-1',
    'aws-us-gov': 'us-gov-west-1',
}


class _AssumeRoleFailed(Exception):
    """Internal signal that STS could not provide credentials.

    Raised (instead of returning None) inside the cached function because
    lru_cache memoizes return values but never exceptions.
    """


@lru_cache(maxsize=100)
def _assume_session_cached(account_id, region):
    """Assume ROLE_NAME in account_id and return a boto3 Session.

    Raises _AssumeRoleFailed when STS cannot provide credentials, so that
    only successful sessions end up in the cache.
    """
    partition = boto3.session.Session().get_partition_for_region(region_name=region)
    role_arn = f"arn:{partition}:iam::{account_id}:role/{ROLE_NAME}"

    def request_credentials(sts_region):
        return boto3.client('sts', region_name=sts_region).assume_role(
            RoleArn=role_arn,
            RoleSessionName="data_collection"
        )['Credentials']

    try:
        credentials = request_credentials(region)
    except Exception as exc:  #pylint: disable=broad-exception-caught
        # Identify the error by its code rather than via `sts_client.exceptions`:
        # a client variable is unbound when client creation itself is what failed,
        # which turned the real error into an UnboundLocalError.
        error_code = (getattr(exc, 'response', None) or {}).get('Error', {}).get('Code')
        if error_code != 'RegionDisabledException':
            logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}")
            raise _AssumeRoleFailed(str(exc)) from exc
        logger.warning(f"STS region disabled for {region}, falling back to global STS: {exc}")
        try:
            credentials = request_credentials(_STS_FALLBACK_REGIONS.get(partition, 'us-east-1'))
        except Exception as fallback_exc:  #pylint: disable=broad-exception-caught
            logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}")
            raise _AssumeRoleFailed(str(fallback_exc)) from fallback_exc

    return boto3.session.Session(
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken']
    )


def assume_session(account_id, region):
    """Assume role in account with fallback to another STS region only if the region is disabled.

    Args:
        account_id: AWS account that owns the role to assume.
        region: region whose STS endpoint is tried first.

    Returns:
        A boto3 Session built from the temporary credentials, or None when
        STS could not provide them (the error is logged). Successful sessions
        are memoized per (account_id, region); failures are not, so a later
        call retries STS.

    NOTE(review): memoized sessions hold temporary credentials; confirm the
    execution environment does not outlive their validity.
    """
    try:
        return _assume_session_cached(account_id, region)
    except _AssumeRoleFailed:
        return None
# Sessions are reused until this many seconds before their credentials expire.
_SESSION_REFRESH_MARGIN_SECONDS = 300
# Upper bound on cached sessions (same bound the previous lru_cache used).
_SESSION_CACHE_MAX = 100
# (account_id, region, role_name) -> (boto3 Session, credential expiration datetime)
_SESSION_CACHE = {}


def assume_session(account_id, region, role_name):
    """Assume role_name in the account and return a boto3 Session, or None on failure.

    The regional STS endpoint is tried first. Only when it answers with
    RegionDisabledException is the call retried against STS in us-east-1.

    Successful sessions are cached per (account_id, region, role_name) and
    reused until shortly before their temporary credentials expire. Failures
    are never cached, so a transient STS error does not disable the account
    for the lifetime of a warm Lambda container.

    Args:
        account_id: Account that owns the role to assume.
        region: Region used for the STS endpoint and the partition lookup.
        role_name: Name of the IAM role to assume in that account.

    Returns:
        A boto3 Session with the assumed-role credentials, or None when the
        role could not be assumed (the error is logged).
    """
    import datetime as _dt  # local import keeps the block self-contained

    cache_key = (account_id, region, role_name)
    cached = _SESSION_CACHE.get(cache_key)
    if cached is not None:
        cached_session, expires_at = cached
        # STS returns Expiration as a timezone-aware datetime.
        seconds_left = (expires_at - _dt.datetime.now(_dt.timezone.utc)).total_seconds()
        if seconds_left > _SESSION_REFRESH_MARGIN_SECONDS:
            return cached_session
        _SESSION_CACHE.pop(cache_key, None)  # expired or about to expire

    partition = boto3.session.Session().get_partition_for_region(region_name=region)

    # Create the client outside the assume_role try-block: the except clause
    # below dereferences sts_client, which must therefore already be bound.
    try:
        sts_client = boto3.client('sts', region_name=region)
    except Exception as exc:
        logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}")
        return None

    assume_args = {
        "RoleArn": f"arn:{partition}:iam::{account_id}:role/{role_name}",
        "RoleSessionName": "data_collection",
    }
    try:
        credentials = sts_client.assume_role(**assume_args)['Credentials']
    except sts_client.exceptions.RegionDisabledException as region_exc:
        logger.warning(f"STS region disabled for {region}, falling back to global STS: {region_exc}")
        # NOTE(review): 'us-east-1' belongs to the 'aws' partition only - confirm
        # this fallback is meaningful for other partitions.
        try:
            global_sts = boto3.client('sts', region_name='us-east-1')
            credentials = global_sts.assume_role(**assume_args)['Credentials']
        except Exception as fallback_exc:
            logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}")
            return None
    except Exception as exc:
        logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}")
        return None

    session = boto3.session.Session(
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken']
    )

    expires_at = credentials.get('Expiration')
    if expires_at is not None:
        if len(_SESSION_CACHE) >= _SESSION_CACHE_MAX:
            _SESSION_CACHE.clear()  # crude bound; entries are cheap to rebuild
        _SESSION_CACHE[cache_key] = (session, expires_at)
    return session


def get_client_with_role(role_name, account_id, service, region):
    """Return a boto3 client for `service` using credentials of the assumed role.

    Args:
        role_name: Name of the IAM role to assume in the target account.
        account_id: Account that owns the role.
        service: boto3 service name of the client to create.
        region: Region for both the STS call and the resulting client.

    Returns:
        The service client, or None when the role could not be assumed.
    """
    logger.debug(f"Attempting to get '{service}' client with role '{role_name}' from account '{account_id}' in region '{region}'")
    session = assume_session(account_id, region, role_name)
    if not session:
        logger.warning(f"Skipping {service} client creation for account {account_id} due to STS failure.")
        return None

    logger.debug("Successfully assumed role, now getting client")
    client = session.client(service, region_name=region)
    logger.debug(f"Successfully created '{service}' client with role '{role_name}' from account '{account_id}' in region '{region}'")
    return client
diff --git a/data-collection/deploy/module-trusted-advisor.yaml b/data-collection/deploy/module-trusted-advisor.yaml index 7b0ff586..5738bbe2 100644 --- a/data-collection/deploy/module-trusted-advisor.yaml +++ b/data-collection/deploy/module-trusted-advisor.yaml @@ -189,10 +189,27 @@ Resources: def assume_role(account_id, service, region, role): partition = boto3.session.Session().get_partition_for_region(region_name=region) - assumed = boto3.client('sts', region_name=region).assume_role( - RoleArn=f"arn:{partition}:iam::{account_id}:role/{role}", - RoleSessionName='data_collection' - ) + try: + sts_client = boto3.client('sts', region_name=region) + assumed = sts_client.assume_role( + RoleArn=f"arn:{partition}:iam::{account_id}:role/{role}", + RoleSessionName='data_collection' + ) + except sts_client.exceptions.RegionDisabledException as region_exc: + logger.warning(f"STS region disabled for {region}, falling back to global STS: {region_exc}") + try: + global_sts = boto3.client('sts', region_name='us-east-1') + assumed = global_sts.assume_role( + RoleArn=f"arn:{partition}:iam::{account_id}:role/{role}", + RoleSessionName='data_collection' + ) + except Exception as fallback_exc: + logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}") + return None + except Exception as exc: + logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}") + return None + creds = assumed['Credentials'] return boto3.client(service, region_name=region, aws_access_key_id=creds['AccessKeyId'], @@ -207,6 +224,9 @@ Resources: def read_ta(account_id, account_name): with open(TMP_FILE, "w", encoding='utf-8') as f: support = assume_role(account_id, "support", REGIONS[0], ROLE_NAME) + if not support: + logger.warning(f"Skipping Trusted Advisor for account {account_id} due to STS failure.") + return checks = support.describe_trusted_advisor_checks(language="en")["checks"] for check in checks: #print(json.dumps(check)) @@ -243,6 +263,9 @@ Resources: """ Read 
# Sessions are reused until this many seconds before their credentials expire.
_SESSION_REFRESH_MARGIN_SECONDS = 300
# Upper bound on cached sessions (same bound the previous lru_cache used).
_SESSION_CACHE_MAX = 100
# (account_id, region) -> (boto3 Session, credential expiration datetime)
_SESSION_CACHE = {}


def assume_session(account_id, region):
    """Assume ROLE_NAME in the account and return a boto3 Session, or None on failure.

    The regional STS endpoint is tried first. Only when it answers with
    RegionDisabledException is the call retried against STS in us-east-1.

    Successful sessions are cached per (account_id, region) and reused until
    shortly before their temporary credentials expire. Failures are never
    cached, so a transient STS error does not disable the account for the
    lifetime of a warm Lambda container.

    Args:
        account_id: Account that owns the role to assume.
        region: Region used for the STS endpoint and the partition lookup.

    Returns:
        A boto3 Session with the assumed-role credentials, or None when the
        role could not be assumed (the error is logged).
    """
    import datetime as _dt  # local import keeps the block self-contained

    cache_key = (account_id, region)
    cached = _SESSION_CACHE.get(cache_key)
    if cached is not None:
        cached_session, expires_at = cached
        # STS returns Expiration as a timezone-aware datetime.
        seconds_left = (expires_at - _dt.datetime.now(_dt.timezone.utc)).total_seconds()
        if seconds_left > _SESSION_REFRESH_MARGIN_SECONDS:
            return cached_session
        _SESSION_CACHE.pop(cache_key, None)  # expired or about to expire

    partition = boto3.session.Session().get_partition_for_region(region_name=region)

    # Create the client outside the assume_role try-block: the except clause
    # below dereferences sts_client, which must therefore already be bound.
    try:
        sts_client = boto3.client('sts', region_name=region)
    except Exception as exc:
        logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}")
        return None

    assume_args = {
        "RoleArn": f"arn:{partition}:iam::{account_id}:role/{ROLE_NAME}",
        "RoleSessionName": "data_collection",
    }
    try:
        credentials = sts_client.assume_role(**assume_args)['Credentials']
    except sts_client.exceptions.RegionDisabledException as region_exc:
        logger.warning(f"STS region disabled for {region}, falling back to global STS: {region_exc}")
        # NOTE(review): 'us-east-1' belongs to the 'aws' partition only - confirm
        # this fallback is meaningful for other partitions.
        try:
            global_sts = boto3.client('sts', region_name='us-east-1')
            credentials = global_sts.assume_role(**assume_args)['Credentials']
        except Exception as fallback_exc:
            logger.error(f"Global STS fallback failed for {account_id}: {fallback_exc}")
            return None
    except Exception as exc:
        logger.error(f"STS assume_role failed for {account_id} in {region}: {exc}")
        return None

    session = boto3.session.Session(
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken']
    )

    expires_at = credentials.get('Expiration')
    if expires_at is not None:
        if len(_SESSION_CACHE) >= _SESSION_CACHE_MAX:
            _SESSION_CACHE.clear()  # crude bound; entries are cheap to rebuild
        _SESSION_CACHE[cache_key] = (session, expires_at)
    return session