Skip to content

Commit 74be7f9

Browse files
author
Taniya Mathur
committed
Enhance deployment script with comprehensive failure analysis
- Add nested stack events retrieval for detailed root cause analysis
- Fix CloudFormation prompt template to remove misleading examples
- Add CloudWatch Logs policy cleanup to prevent size limit issues
- Improve debug logging for better troubleshooting visibility
1 parent 889d0df commit 74be7f9

File tree

1 file changed

+90
-15
lines changed

1 file changed

+90
-15
lines changed

scripts/codebuild_deployment.py

Lines changed: 90 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -325,10 +325,12 @@ def get_cloudformation_logs(stack_name):
325325
"""Get CloudFormation stack events for error analysis"""
326326
try:
327327
cf_client = boto3.client('cloudformation')
328+
all_failed_events = []
329+
330+
# Get events from main stack
328331
all_events = []
329332
next_token = None
330333

331-
# Paginate through all events
332334
while True:
333335
if next_token:
334336
response = cf_client.describe_stack_events(
@@ -345,20 +347,71 @@ def get_cloudformation_logs(stack_name):
345347
if not next_token:
346348
break
347349

348-
# Filter for failed events
349-
failed_events = []
350+
# Filter for failed events and extract nested stack ARNs
351+
nested_stack_arns = []
350352
for event in all_events:
351353
status = event.get('ResourceStatus', '')
352354
if 'FAILED' in status or 'ROLLBACK' in status:
353-
failed_events.append({
355+
all_failed_events.append({
356+
'stack_name': stack_name,
354357
'timestamp': event.get('Timestamp', '').isoformat() if event.get('Timestamp') else '',
355358
'resource_type': event.get('ResourceType', ''),
356359
'logical_id': event.get('LogicalResourceId', ''),
357360
'status': status,
358361
'reason': event.get('ResourceStatusReason', 'No reason provided')
359362
})
363+
364+
# Extract nested stack ARN from CREATE_FAILED events
365+
if (status == 'CREATE_FAILED' and
366+
event.get('ResourceType') == 'AWS::CloudFormation::Stack' and
367+
'Embedded stack arn:aws:cloudformation:' in event.get('ResourceStatusReason', '')):
368+
reason = event.get('ResourceStatusReason', '')
369+
start = reason.find('arn:aws:cloudformation:')
370+
end = reason.find(' was not successfully created')
371+
if start != -1 and end != -1:
372+
nested_arn = reason[start:end]
373+
nested_stack_arns.append(nested_arn)
374+
375+
# Get events from nested stacks
376+
for nested_arn in nested_stack_arns:
377+
try:
378+
nested_events = []
379+
next_token = None
380+
381+
while True:
382+
if next_token:
383+
response = cf_client.describe_stack_events(
384+
StackName=nested_arn,
385+
NextToken=next_token
386+
)
387+
else:
388+
response = cf_client.describe_stack_events(StackName=nested_arn)
389+
390+
events = response.get('StackEvents', [])
391+
nested_events.extend(events)
392+
393+
next_token = response.get('NextToken')
394+
if not next_token:
395+
break
396+
397+
# Add failed events from nested stack
398+
for event in nested_events:
399+
status = event.get('ResourceStatus', '')
400+
if 'FAILED' in status or 'ROLLBACK' in status:
401+
all_failed_events.append({
402+
'stack_name': nested_arn.split('/')[-2], # Extract stack name from ARN
403+
'timestamp': event.get('Timestamp', '').isoformat() if event.get('Timestamp') else '',
404+
'resource_type': event.get('ResourceType', ''),
405+
'logical_id': event.get('LogicalResourceId', ''),
406+
'status': status,
407+
'reason': event.get('ResourceStatusReason', 'No reason provided')
408+
})
409+
410+
except Exception:
411+
# Skip nested stacks we can't access
412+
continue
360413

361-
return failed_events
414+
return all_failed_events
362415

363416
except Exception as e:
364417
return [{'error': f"Failed to retrieve CloudFormation logs: {str(e)}"}]
@@ -439,31 +492,31 @@ def generate_deployment_summary(deployment_results, stack_prefix, template_url):
439492
# Second Bedrock call with CloudFormation logs
440493
print("🤖 Making second Bedrock call with CF logs...")
441494
cf_prompt = dedent(f"""
442-
CodeBuild logs were unclear. Analyze CloudFormation logs for root cause.
495+
Analyze CloudFormation error events to determine root cause of deployment failures.
443496
444497
Pattern Results:
445498
{json.dumps(deployment_results, indent=2)}
446499
447500
CloudFormation Error Events:
448501
{json.dumps(cf_logs, indent=2)}
449502
450-
Create detailed analysis:
503+
Search through the events and find CREATE_FAILED events. Determine the root cause based on ResourceStatusReason.
504+
505+
Provide analysis in this format:
451506
452507
🚀 DEPLOYMENT RESULTS
453508
454509
📋 Pattern Status:
455-
• Pattern 1 - BDA: FAILED - Lambda CREATE_FAILED (IAM permissions)
456-
• Pattern 2 - OCR: SUCCESS - Stack deployed successfully
510+
[Determine actual status from the data provided]
457511
458512
🔍 CloudFormation Root Cause:
459-
• Extract exact resource names and error messages
460-
• Identify specific failed resources (Lambda, IAM, S3, DynamoDB)
461-
• Focus on CREATE_FAILED, UPDATE_FAILED, ROLLBACK events
462-
• Analyze ResourceStatusReason for technical details
513+
• Find CREATE_FAILED events and extract ResourceStatusReason
514+
• Identify which specific resources failed to create
515+
• Analyze error messages for technical root cause
463516
464517
💡 Fix Commands:
465-
• Provide specific AWS CLI commands to fix issues
466-
Include IAM policy updates, resource cleanup commands
518+
• Provide specific AWS CLI commands based on actual failures found
519+
Focus on the resources that actually failed
467520
468521
Keep each bullet point under 75 characters.
469522
""")
@@ -546,6 +599,28 @@ def cleanup_single_stack(stack_name, pattern_name):
546599
except json.JSONDecodeError:
547600
print(f"[{pattern_name}] Failed to parse AppSync API IDs")
548601

602+
# Clean up CloudWatch Logs Resource Policy entries for this stack
603+
try:
604+
result = run_command("aws logs describe-resource-policies --query 'resourcePolicies[0].policyDocument' --output text", check=False)
605+
if result.returncode == 0 and result.stdout.strip():
606+
import json
607+
policy_doc = json.loads(result.stdout.strip())
608+
original_count = len(policy_doc.get('Statement', []))
609+
610+
# Remove statements that reference this stack
611+
policy_doc['Statement'] = [
612+
stmt for stmt in policy_doc.get('Statement', [])
613+
if stack_name not in stmt.get('Resource', '')
614+
]
615+
616+
new_count = len(policy_doc.get('Statement', []))
617+
if new_count < original_count:
618+
print(f"[{pattern_name}] Removing {original_count - new_count} CloudWatch Logs policy entries")
619+
updated_policy = json.dumps(policy_doc)
620+
run_command(f"aws logs put-resource-policy --policy-name AWSLogDeliveryWrite20150319 --policy-document '{updated_policy}'", check=False)
621+
except Exception as e:
622+
print(f"[{pattern_name}] Failed to clean up CloudWatch Logs policy: {e}")
623+
549624
# Clean up CloudWatch Logs Resource Policy only if stack-specific
550625
result = run_command(f"aws logs describe-resource-policies --query 'resourcePolicies[?contains(policyName, `{stack_name}`)].policyName' --output text", check=False)
551626
if result.stdout.strip():

0 commit comments

Comments
 (0)