Skip to content

Commit 9cdf403

Browse files
esc1144 and iakov-aws authored
Instrumentation rollback update (#362)
* Data Collection logging phase 1 * codify new SF version name * tweak account collector logging * Sync account collector with fix for manual account list * Add backoff/retry to standalone state machine * add more retry buffer * Misc cleanup * Temporarily re-indent the over indent to make the diff easier to read * Switch to pre-defined table * Fix merge issue * Merge main, normalize linked sf code file names * Interim commit * Checkpoint * beta * Pre-main merge instrumentation * Post merge main and cleanup * Fix misplaced space insertion * Refine Health detail execution logging * Add policy condition for CW logging * Update data-collection/deploy/deploy-data-collection.yaml * Tweak to output error message * Scale back Lambda-based monitoring * Re-enable testing deploy of CaseSummary * Misc cleanup for easier merge * Misc cleanup for easier merge, 2 * Health SF correction from reversion of code * Health mod cleanup * Merge for push * post-merge * fix parameter * Update data-collection/deploy/module-health-events.yaml --------- Co-authored-by: Iakov Gan <[email protected]> Co-authored-by: Iakov GAN <[email protected]>
1 parent 0fad6b8 commit 9cdf403

29 files changed

+1263
-229
lines changed

data-collection/deploy/account-collector.yaml

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,9 @@ Resources:
128128
ZipFile: |
129129
import os
130130
import json
131+
import uuid
131132
import logging
133+
from datetime import datetime
132134
from functools import partial
133135
134136
import boto3
@@ -141,6 +143,9 @@ Resources:
141143
EXCLUDED_ACCOUNT_LIST_KEY = os.environ.get('EXCLUDED_ACCOUNT_LIST_KEY')
142144
EUC_ACCOUNTS = os.environ.get('EUC_ACCOUNT_IDS', '').strip()
143145
TMP_FILE = "/tmp/data.json"
146+
START_TIME = str(datetime.now().isoformat())
147+
LINKED_ACCOUNT_LIST_KEY = os.environ.get('LINKED_ACCOUNT_LIST_KEY', 'linked-account-list.json')
148+
PAYER_ACCOUNT_LIST_KEY = os.environ.get('PAYER_ACCOUNT_LIST_KEY', 'payer-account-list.json')
144149
145150
logger = logging.getLogger(__name__)
146151
logger.setLevel(getattr(logging, os.environ.get('LOG_LEVEL', 'INFO').upper(), logging.INFO))
@@ -156,6 +161,8 @@ Resources:
156161
logger.error(message)
157162
raise Exception(message) #pylint: disable=broad-exception-raised
158163
164+
module = event.get("module", '').lower()
165+
params = event.get("params", '')
159166
functions = { # keep keys same as boto3 services
160167
'linked': iterate_linked_accounts,
161168
'euc': partial(iterate_accounts_with_filter, EUC_ACCOUNTS),
@@ -164,7 +171,7 @@ Resources:
164171
'compute-optimizer': partial(iterate_admins_accounts, 'compute-optimizer'),
165172
'backup': partial(iterate_admins_accounts, 'backup'),
166173
}
167-
account_type = event.get("Type", '').lower()
174+
account_type = event.get("type", '').lower()
168175
if account_type not in functions:
169176
raise Exception(f"Lambda event must have 'Type' parameter with value = ({list(functions.keys())})") #pylint: disable=broad-exception-raised
170177
@@ -173,6 +180,14 @@ Resources:
173180
count = 0
174181
f.write("[\n")
175182
for account in account_iterator():
183+
account['main_exe_uuid'] = event.get("main_exe_uuid", str(uuid.uuid4()))
184+
account['module'] = module
185+
account['bucket'] = BUCKET
186+
account['dc_account'] = boto3.client('sts').get_caller_identity()['Account']
187+
account['dc_region'] = boto3.session.Session().region_name
188+
account['params'] = params
189+
account['prefix'] = RESOURCE_PREFIX
190+
account['stack_version'] = event.get("stack_version", '')
176191
if count > 0:
177192
f.write(",\n")
178193
f.write(json.dumps(account))
@@ -183,6 +198,7 @@ Resources:
183198
raise Exception('No accounts found. Check the log.') #pylint: disable=broad-exception-raised
184199
185200
key = f"account-list/{account_type}-account-list.json"
201+
key = f"account-collector/{module+'-'+(params+'-' if params else '')+(LINKED_ACCOUNT_LIST_KEY if account_type == 'linked' else PAYER_ACCOUNT_LIST_KEY)}"
186202
s3 = boto3.client('s3')
187203
s3.upload_file(TMP_FILE, Bucket=BUCKET, Key=key)
188204
@@ -285,6 +301,7 @@ Resources:
285301
aws_secret_access_key=credentials['SecretAccessKey'],
286302
aws_session_token=credentials['SessionToken'],
287303
)
304+
288305
Handler: 'index.lambda_handler'
289306
MemorySize: 2688
290307
Timeout: 600
@@ -298,6 +315,7 @@ Resources:
298315
PREDEF_ACCOUNT_LIST_KEY: "account-list/account-list"
299316
EXCLUDED_ACCOUNT_LIST_KEY: "account-list/excluded-linked-account-list.csv"
300317
EUC_ACCOUNT_IDS: !Ref EUCAccountIDs
318+
301319
Metadata:
302320
cfn_nag:
303321
rules_to_suppress:
@@ -310,4 +328,4 @@ Resources:
310328
Type: AWS::Logs::LogGroup
311329
Properties:
312330
LogGroupName: !Sub "/aws/lambda/${LambdaFunction}"
313-
RetentionInDays: 60
331+
RetentionInDays: 60

data-collection/deploy/deploy-data-collection.yaml

Lines changed: 97 additions & 27 deletions
Large diffs are not rendered by default.

data-collection/deploy/module-aws-feeds.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -932,4 +932,4 @@ Resources:
932932
Type: Custom::LambdaAnalyticsExecutor
933933
Properties:
934934
ServiceToken: !Ref LambdaAnalyticsARN
935-
Name: !Ref CFDataName
935+
Name: !Ref CFDataName

data-collection/deploy/module-backup.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ Resources:
320320
DeployRegion: !Ref AWS::Region
321321
Account: !Ref AWS::AccountId
322322
Prefix: !Ref ResourcePrefix
323+
Bucket: !Ref DestinationBucket
323324
'RefreshSchedule${AwsObject}':
324325
Type: AWS::Scheduler::Schedule
325326
Properties:
@@ -338,4 +339,4 @@ Resources:
338339
Type: Custom::LambdaAnalyticsExecutor
339340
Properties:
340341
ServiceToken: !Ref LambdaAnalyticsARN
341-
Name: !Ref CFDataName
342+
Name: !Ref CFDataName

data-collection/deploy/module-budgets.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,7 @@ Resources:
290290
DeployRegion: !Ref AWS::Region
291291
Account: !Ref AWS::AccountId
292292
Prefix: !Ref ResourcePrefix
293+
Bucket: !Ref DestinationBucket
293294

294295
ModuleRefreshSchedule:
295296
Type: 'AWS::Scheduler::Schedule'

data-collection/deploy/module-compute-optimizer.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,7 @@ Resources:
501501
DeployRegion: !Ref AWS::Region
502502
Account: !Ref AWS::AccountId
503503
Prefix: !Ref ResourcePrefix
504+
Bucket: !Ref DestinationBucket
504505

505506
ModuleRefreshSchedule:
506507
Type: 'AWS::Scheduler::Schedule'

data-collection/deploy/module-cost-anomaly.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,7 @@ Resources:
425425
DeployRegion: !Ref AWS::Region
426426
Account: !Ref AWS::AccountId
427427
Prefix: !Ref ResourcePrefix
428+
Bucket: !Ref DestinationBucket
428429

429430
ModuleRefreshSchedule:
430431
Type: "AWS::Scheduler::Schedule"

data-collection/deploy/module-cost-explorer-rightsizing.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ Resources:
262262
DeployRegion: !Ref AWS::Region
263263
Account: !Ref AWS::AccountId
264264
Prefix: !Ref ResourcePrefix
265+
Bucket: !Ref DestinationBucket
265266

266267
ModuleRefreshSchedule:
267268
Type: 'AWS::Scheduler::Schedule'

data-collection/deploy/module-ecs-chargeback.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ Resources:
274274
DeployRegion: !Ref AWS::Region
275275
Account: !Ref AWS::AccountId
276276
Prefix: !Ref ResourcePrefix
277+
Bucket: !Ref DestinationBucket
277278

278279
ModuleRefreshSchedule:
279280
Type: 'AWS::Scheduler::Schedule'

data-collection/deploy/module-health-events.yaml

Lines changed: 20 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ Parameters:
5151
Type: String
5252
Description: "ARNs of KMS Keys for data buckets and/or Glue Catalog. Comma separated list, no spaces. Keep empty if data Buckets and Glue Catalog are not Encrypted with KMS. You can also set it to '*' to grant decrypt permission for all the keys."
5353
Default: ""
54+
DetailStepFunctionTemplate:
55+
Type: String
56+
Description: JSON representation of the detail retrieval StepFunction template
5457

5558
Conditions:
5659
NeedDataBucketsKms: !Not [ !Equals [ !Ref DataBucketsKmsKeysArns, "" ] ]
@@ -327,16 +330,17 @@ Resources:
327330
"""
328331
logger.info(f"Event data: {event}")
329332
account = event.get('account')
330-
batch_input = event.get('BatchInput')
331-
items = event.get('Items')
332-
if not (account or batch_input):
333+
items = event.get('items')
334+
if not (account):
333335
raise ValueError(
334336
"Please do not trigger this Lambda manually."
335337
"Find the corresponding state machine in Step Functions and Trigger from there."
336338
)
337-
is_summary_mode = batch_input == None
339+
is_summary_mode = items is None
338340
logger.info(f"Executing in {'summary' if is_summary_mode else 'detail'} mode flow")
339-
account = json.loads(account) if is_summary_mode else batch_input.get('account')
341+
main_exe_uuid = event.get("main_exe_uuid", "")
342+
stack_version = event.get("stack_version", "")
343+
account = account if isinstance(account, dict) else json.loads(account)
340344
account_id = account["account_id"]
341345
region = get_active_health_region()
342346
partition = boto3.session.Session().get_partition_for_region(region_name=region)
@@ -387,7 +391,10 @@ Resources:
387391
"bucket": BUCKET_NAME,
388392
"file": key,
389393
"account": account,
390-
"ingestion_time": int(round(ingestion_time.timestamp()))
394+
"ingestion_time": int(round(ingestion_time.timestamp())),
395+
"main_exe_uuid": main_exe_uuid,
396+
"params": "detail",
397+
"stack_version": stack_version
391398
}
392399
sf_input = json.dumps(sf_input).replace('"', '\"') #need to escape the json for SF
393400
sf.start_execution(stateMachineArn=DETAIL_SM_ARN, input=sf_input)
@@ -400,7 +407,7 @@ Resources:
400407
logger.error(f"Error: {exc}")
401408
402409
elif items:
403-
ingestion_time = datetime.fromtimestamp(int(batch_input.get('ingestion_time')))
410+
ingestion_time = datetime.fromtimestamp(int(event.get('ingestion_time')))
404411
405412
with open(TMP_FILE, "w", encoding='utf-8') as f:
406413
for item in items:
@@ -479,6 +486,7 @@ Resources:
479486
DeployRegion: !Ref AWS::Region
480487
Account: !Ref AWS::AccountId
481488
Prefix: !Ref ResourcePrefix
489+
Bucket: !Ref DestinationBucket
482490

483491
ModuleRefreshSchedule:
484492
Type: 'AWS::Scheduler::Schedule'
@@ -500,81 +508,9 @@ Resources:
500508
StateMachineName: !Sub '${ResourcePrefix}${CFDataName}-detail-StateMachine'
501509
StateMachineType: STANDARD
502510
RoleArn: !Ref StepFunctionExecutionRoleARN
503-
DefinitionString: |
504-
{
505-
"Comment": "Collects Health Events",
506-
"StartAt": "DetailProcessor Map",
507-
"States": {
508-
"DetailProcessor Map": {
509-
"Type": "Map",
510-
"ItemProcessor": {
511-
"ProcessorConfig": {
512-
"Mode": "DISTRIBUTED",
513-
"ExecutionType": "STANDARD"
514-
},
515-
"StartAt": "DetailLambda Invoke",
516-
"States": {
517-
"DetailLambda Invoke": {
518-
"Type": "Task",
519-
"Resource": "arn:${Partition}:states:::lambda:invoke",
520-
"OutputPath": "$.Payload",
521-
"Parameters": {
522-
"Payload.$": "$",
523-
"FunctionName": "${ModuleLambdaARN}"
524-
},
525-
"Retry": [
526-
{
527-
"ErrorEquals": [
528-
"Lambda.ServiceException",
529-
"Lambda.AWSLambdaException",
530-
"Lambda.SdkClientException",
531-
"Lambda.TooManyRequestsException"
532-
],
533-
"IntervalSeconds": 1,
534-
"MaxAttempts": 3,
535-
"BackoffRate": 2
536-
}
537-
],
538-
"End": true
539-
}
540-
}
541-
},
542-
"Label": "DetailProcessorMap",
543-
"MaxConcurrency": ${MaxConcurrentBatches},
544-
"ItemReader": {
545-
"Resource": "arn:${Partition}:states:::s3:getObject",
546-
"ReaderConfig": {
547-
"InputType": "CSV",
548-
"CSVHeaderLocation": "FIRST_ROW"
549-
},
550-
"Parameters": {
551-
"Bucket.$": "$.bucket",
552-
"Key.$": "$.file"
553-
}
554-
},
555-
"ItemBatcher": {
556-
"MaxItemsPerBatch": ${ItemsPerBatch},
557-
"BatchInput": {
558-
"account.$": "$.account",
559-
"ingestion_time.$": "$.ingestion_time"
560-
}
561-
},
562-
"Next": "CrawlerStepFunctionStartExecution"
563-
},
564-
"CrawlerStepFunctionStartExecution": {
565-
"Type": "Task",
566-
"Resource": "arn:${Partition}:states:::states:startExecution.sync:2",
567-
"Parameters": {
568-
"StateMachineArn": "arn:${Partition}:states:${DeployRegion}:${Account}:stateMachine:${Prefix}CrawlerExecution-StateMachine",
569-
"Input": {
570-
"crawlers": ${Crawlers}
571-
}
572-
},
573-
"End": true
574-
}
575-
},
576-
"TimeoutSeconds": 14400
577-
}
511+
DefinitionS3Location:
512+
Bucket: !Ref CodeBucket
513+
Key: !Ref DetailStepFunctionTemplate
578514
DefinitionSubstitutions:
579515
ModuleLambdaARN: !GetAtt LambdaFunction.Arn
580516
Crawlers: !Sub '["${ResourcePrefix}${CFDataName}-detail-Crawler"]'
@@ -587,6 +523,7 @@ Resources:
587523
ItemsPerBatch: 50
588524
MaxConcurrentBatches: 1
589525
Partition: !Ref AWS::Partition
526+
Bucket: !Ref DestinationBucket
590527
Metadata:
591528
cfn-lint:
592529
config:
@@ -596,4 +533,4 @@ Resources:
596533
Type: Custom::LambdaAnalyticsExecutor
597534
Properties:
598535
ServiceToken: !Ref LambdaAnalyticsARN
599-
Name: !Ref CFDataName
536+
Name: !Ref CFDataName

0 commit comments

Comments
 (0)