From 90d823e84278eb2f7ee8a5a633c790f914f0a0d0 Mon Sep 17 00:00:00 2001 From: Ashish Kumar Date: Wed, 30 Oct 2024 10:19:15 -0700 Subject: [PATCH 1/6] Create README.md Initial checkin. --- neuron-problem-detector/ecs-npd-cdk/README.md | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 neuron-problem-detector/ecs-npd-cdk/README.md diff --git a/neuron-problem-detector/ecs-npd-cdk/README.md b/neuron-problem-detector/ecs-npd-cdk/README.md new file mode 100644 index 0000000..702392f --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/README.md @@ -0,0 +1,76 @@ + +# Welcome to your CDK Python project! + +This is a blank project for CDK development with Python. + +The `cdk.json` file tells the CDK Toolkit how to execute your app. + +This project is set up like a standard Python project. The initialization +process also creates a virtualenv within this project, stored under the `.venv` +directory. To create the virtualenv it assumes that there is a `python3` +(or `python` for Windows) executable in your path with access to the `venv` +package. If for any reason the automatic creation of the virtualenv fails, +you can create the virtualenv manually. + +## Pre-requisites +You will need `python3` and `cdk` utility installed on your machine. +To install `cdk` follow the instructions [here](https://docs.aws.amazon.com/cdk/v2/guide/getting_started.html) + +## Environment Setup +To manually create a virtualenv on MacOS and Linux: + +``` +$ python3 -m venv .venv +``` + +After the init process completes and the virtualenv is created, you can use the following +step to activate your virtualenv. + +``` +$ source .venv/bin/activate +``` + +If you are a Windows platform, you would activate the virtualenv like this: + +``` +% .venv\Scripts\activate.bat +``` + +Once the virtualenv is activated, you can install the required dependencies. 
+ +``` +$ pip install -r requirements.txt +``` + +## Synthesize CloudFormation template +At this point you can now synthesize the CloudFormation template for this code. + +``` +$ cdk synth +``` +It is assumed that you have authenticated successfully to connect to your AWS environment. + +Perform bootstrap function with the following command. +``` +cdk bootstrap [--profile ] +``` +Deploy the stack in your AWS environment + +``` +cdk deploy [--profile ] +``` + + +To add additional dependencies, for example other CDK libraries, just add +them to your `setup.py` file and rerun the `pip install -r requirements.txt` +command. + +## Useful commands + + * `cdk ls` list all stacks in the app + * `cdk synth` emits the synthesized CloudFormation template + * `cdk deploy` deploy this stack to your default AWS account/region + * `cdk diff` compare deployed stack with current state + * `cdk docs` open CDK documentation + +Enjoy! From 60bb71147244e5e39fa9b049afc4127a8a265488 Mon Sep 17 00:00:00 2001 From: Ashish Kumar Date: Wed, 30 Oct 2024 10:21:49 -0700 Subject: [PATCH 2/6] Init code-checkin Initialized folder and code. 
--- neuron-problem-detector/ecs-npd-cdk/app.py | 28 + .../ecs-npd-cdk/cdk.context.json | 10 + neuron-problem-detector/ecs-npd-cdk/cdk.json | 69 + .../ecs-npd-cdk/neuron.yaml | 1195 +++++++++++++++++ .../neuron_problem_detector/__init__.py | 0 .../__pycache__/__init__.cpython-311.pyc | Bin 0 -> 196 bytes ...ron_problem_detector_stack.cpython-311.pyc | Bin 0 -> 7409 bytes .../ecs_task_definition.json | 99 ++ .../neuron_problem_detector_stack.py | 174 +++ .../ecs-npd-cdk/requirements-dev.txt | 1 + .../ecs-npd-cdk/requirements.txt | 2 + .../ecs-npd-cdk/source.bat | 13 + .../ecs-npd-cdk/tests/__init__.py | 0 .../ecs-npd-cdk/tests/unit/__init__.py | 0 .../test_neuron_problem_detector_stack.py | 18 + .../ecs-npd-cdk/tools/check-ecs-exec.sh | 717 ++++++++++ 16 files changed, 2326 insertions(+) create mode 100644 neuron-problem-detector/ecs-npd-cdk/app.py create mode 100644 neuron-problem-detector/ecs-npd-cdk/cdk.context.json create mode 100644 neuron-problem-detector/ecs-npd-cdk/cdk.json create mode 100644 neuron-problem-detector/ecs-npd-cdk/neuron.yaml create mode 100644 neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__init__.py create mode 100644 neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__pycache__/__init__.cpython-311.pyc create mode 100644 neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__pycache__/neuron_problem_detector_stack.cpython-311.pyc create mode 100644 neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json create mode 100644 neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py create mode 100644 neuron-problem-detector/ecs-npd-cdk/requirements-dev.txt create mode 100644 neuron-problem-detector/ecs-npd-cdk/requirements.txt create mode 100644 neuron-problem-detector/ecs-npd-cdk/source.bat create mode 100644 neuron-problem-detector/ecs-npd-cdk/tests/__init__.py create mode 100644 neuron-problem-detector/ecs-npd-cdk/tests/unit/__init__.py create mode 
100644 neuron-problem-detector/ecs-npd-cdk/tests/unit/test_neuron_problem_detector_stack.py create mode 100644 neuron-problem-detector/ecs-npd-cdk/tools/check-ecs-exec.sh diff --git a/neuron-problem-detector/ecs-npd-cdk/app.py b/neuron-problem-detector/ecs-npd-cdk/app.py new file mode 100644 index 0000000..185bfa2 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/app.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import os + +import aws_cdk as cdk + +from neuron_problem_detector.neuron_problem_detector_stack import NeuronProblemDetectorStack + + +app = cdk.App() +NeuronProblemDetectorStack(app, "NeuronProblemDetectorStack", + # If you don't specify 'env', this stack will be environment-agnostic. + # Account/Region-dependent features and context lookups will not work, + # but a single synthesized template can be deployed anywhere. + + # Uncomment the next line to specialize this stack for the AWS Account + # and Region that are implied by the current CLI configuration. + + #env=cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), region=os.getenv('CDK_DEFAULT_REGION')), + + # Uncomment the next line if you know exactly what Account and Region you + # want to deploy the stack to. 
*/ + + # env=cdk.Environment(account='464616699298', region='us-east-1'), + + # For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html + ) + +app.synth() diff --git a/neuron-problem-detector/ecs-npd-cdk/cdk.context.json b/neuron-problem-detector/ecs-npd-cdk/cdk.context.json new file mode 100644 index 0000000..6ab8567 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/cdk.context.json @@ -0,0 +1,10 @@ +{ + "availability-zones:account=464616699298:region=us-east-1": [ + "us-east-1a", + "us-east-1b", + "us-east-1c", + "us-east-1d", + "us-east-1e", + "us-east-1f" + ] +} diff --git a/neuron-problem-detector/ecs-npd-cdk/cdk.json b/neuron-problem-detector/ecs-npd-cdk/cdk.json new file mode 100644 index 0000000..20c5a8f --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/cdk.json @@ -0,0 +1,69 @@ +{ + "app": "python3 app.py", + "watch": { + "include": [ + "**" + ], + "exclude": [ + "README.md", + "cdk*.json", + "requirements*.txt", + "source.bat", + "**/__init__.py", + "**/__pycache__", + "tests" + ] + }, + "context": { + "@aws-cdk/aws-lambda:recognizeLayerVersion": true, + "@aws-cdk/core:checkSecretUsage": true, + "@aws-cdk/core:target-partitions": [ + "aws", + "aws-cn" + ], + "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, + "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, + "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, + "@aws-cdk/aws-iam:minimizePolicies": true, + "@aws-cdk/core:validateSnapshotRemovalPolicy": true, + "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, + "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, + "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, + "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, + "@aws-cdk/core:enablePartitionLiterals": true, + "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, + "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, + 
"@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, + "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, + "@aws-cdk/aws-route53-patters:useCertificate": true, + "@aws-cdk/customresources:installLatestAwsSdkDefault": false, + "@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, + "@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, + "@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, + "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, + "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, + "@aws-cdk/aws-redshift:columnId": true, + "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, + "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, + "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true, + "@aws-cdk/aws-kms:aliasNameRef": true, + "@aws-cdk/aws-autoscaling:generateLaunchTemplateInsteadOfLaunchConfig": true, + "@aws-cdk/core:includePrefixInUniqueNameGeneration": true, + "@aws-cdk/aws-efs:denyAnonymousAccess": true, + "@aws-cdk/aws-opensearchservice:enableOpensearchMultiAzWithStandby": true, + "@aws-cdk/aws-lambda-nodejs:useLatestRuntimeVersion": true, + "@aws-cdk/aws-efs:mountTargetOrderInsensitiveLogicalId": true, + "@aws-cdk/aws-rds:auroraClusterChangeScopeOfInstanceParameterGroupWithEachParameters": true, + "@aws-cdk/aws-appsync:useArnForSourceApiAssociationIdentifier": true, + "@aws-cdk/aws-rds:preventRenderingDeprecatedCredentials": true, + "@aws-cdk/aws-codepipeline-actions:useNewDefaultBranchForCodeCommitSource": true, + "@aws-cdk/aws-cloudwatch-actions:changeLambdaPermissionLogicalIdForLambdaAction": true, + "@aws-cdk/aws-codepipeline:crossAccountKeysDefaultValueToFalse": true, + "@aws-cdk/aws-codepipeline:defaultPipelineTypeToV2": true, + "@aws-cdk/aws-kms:reduceCrossAccountRegionPolicyScope": true, + "@aws-cdk/aws-eks:nodegroupNameAttribute": true, + "@aws-cdk/aws-ec2:ebsDefaultGp3Volume": true, + 
"@aws-cdk/aws-ecs:removeDefaultDeploymentAlarm": true, + "@aws-cdk/custom-resources:logApiResponseDataPropertyTrueDefault": false + } +} diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron.yaml b/neuron-problem-detector/ecs-npd-cdk/neuron.yaml new file mode 100644 index 0000000..ec64fb8 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/neuron.yaml @@ -0,0 +1,1195 @@ +Resources: + NeuronProblemDetectorVPC5F617726: + Type: AWS::EC2::VPC + Properties: + CidrBlock: 10.0.0.0/16 + EnableDnsHostnames: true + EnableDnsSupport: true + InstanceTenancy: default + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/Resource + NeuronProblemDetectorVPCPublicSubnet1Subnet842914BF: + Type: AWS::EC2::Subnet + Properties: + AvailabilityZone: + Fn::Select: + - 0 + - Fn::GetAZs: "" + CidrBlock: 10.0.0.0/18 + MapPublicIpOnLaunch: true + Tags: + - Key: aws-cdk:subnet-name + Value: Public + - Key: aws-cdk:subnet-type + Value: Public + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1/Subnet + NeuronProblemDetectorVPCPublicSubnet1RouteTableC098CD6A: + Type: AWS::EC2::RouteTable + Properties: + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1/RouteTable + NeuronProblemDetectorVPCPublicSubnet1RouteTableAssociation9EC2AFC5: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: + Ref: NeuronProblemDetectorVPCPublicSubnet1RouteTableC098CD6A + SubnetId: + Ref: NeuronProblemDetectorVPCPublicSubnet1Subnet842914BF + Metadata: + aws:cdk:path: 
NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1/RouteTableAssociation + NeuronProblemDetectorVPCPublicSubnet1DefaultRoute5C4F3954: + Type: AWS::EC2::Route + Properties: + DestinationCidrBlock: 0.0.0.0/0 + GatewayId: + Ref: NeuronProblemDetectorVPCIGW3EC7DAA5 + RouteTableId: + Ref: NeuronProblemDetectorVPCPublicSubnet1RouteTableC098CD6A + DependsOn: + - NeuronProblemDetectorVPCVPCGW5182937C + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1/DefaultRoute + NeuronProblemDetectorVPCPublicSubnet1EIP71A9859B: + Type: AWS::EC2::EIP + Properties: + Domain: vpc + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1/EIP + NeuronProblemDetectorVPCPublicSubnet1NATGateway34AE13E8: + Type: AWS::EC2::NatGateway + Properties: + AllocationId: + Fn::GetAtt: + - NeuronProblemDetectorVPCPublicSubnet1EIP71A9859B + - AllocationId + SubnetId: + Ref: NeuronProblemDetectorVPCPublicSubnet1Subnet842914BF + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1 + DependsOn: + - NeuronProblemDetectorVPCPublicSubnet1DefaultRoute5C4F3954 + - NeuronProblemDetectorVPCPublicSubnet1RouteTableAssociation9EC2AFC5 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1/NATGateway + NeuronProblemDetectorVPCPublicSubnet2Subnet53E01F76: + Type: AWS::EC2::Subnet + Properties: + AvailabilityZone: + Fn::Select: + - 1 + - Fn::GetAZs: "" + CidrBlock: 10.0.64.0/18 + MapPublicIpOnLaunch: true + Tags: + - Key: aws-cdk:subnet-name + Value: Public + - Key: aws-cdk:subnet-type + Value: Public + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2/Subnet + 
NeuronProblemDetectorVPCPublicSubnet2RouteTable01829BCC: + Type: AWS::EC2::RouteTable + Properties: + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2/RouteTable + NeuronProblemDetectorVPCPublicSubnet2RouteTableAssociation9AFE0962: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: + Ref: NeuronProblemDetectorVPCPublicSubnet2RouteTable01829BCC + SubnetId: + Ref: NeuronProblemDetectorVPCPublicSubnet2Subnet53E01F76 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2/RouteTableAssociation + NeuronProblemDetectorVPCPublicSubnet2DefaultRoute80B8BD8F: + Type: AWS::EC2::Route + Properties: + DestinationCidrBlock: 0.0.0.0/0 + GatewayId: + Ref: NeuronProblemDetectorVPCIGW3EC7DAA5 + RouteTableId: + Ref: NeuronProblemDetectorVPCPublicSubnet2RouteTable01829BCC + DependsOn: + - NeuronProblemDetectorVPCVPCGW5182937C + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2/DefaultRoute + NeuronProblemDetectorVPCPublicSubnet2EIPEDE2DCF3: + Type: AWS::EC2::EIP + Properties: + Domain: vpc + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2/EIP + NeuronProblemDetectorVPCPublicSubnet2NATGateway475CF308: + Type: AWS::EC2::NatGateway + Properties: + AllocationId: + Fn::GetAtt: + - NeuronProblemDetectorVPCPublicSubnet2EIPEDE2DCF3 + - AllocationId + SubnetId: + Ref: NeuronProblemDetectorVPCPublicSubnet2Subnet53E01F76 + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2 + DependsOn: + - NeuronProblemDetectorVPCPublicSubnet2DefaultRoute80B8BD8F + - NeuronProblemDetectorVPCPublicSubnet2RouteTableAssociation9AFE0962 + 
Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2/NATGateway + NeuronProblemDetectorVPCPrivateSubnet1Subnet708A0901: + Type: AWS::EC2::Subnet + Properties: + AvailabilityZone: + Fn::Select: + - 0 + - Fn::GetAZs: "" + CidrBlock: 10.0.128.0/18 + MapPublicIpOnLaunch: false + Tags: + - Key: aws-cdk:subnet-name + Value: Private + - Key: aws-cdk:subnet-type + Value: Private + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet1 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet1/Subnet + NeuronProblemDetectorVPCPrivateSubnet1RouteTableC2B2760B: + Type: AWS::EC2::RouteTable + Properties: + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet1 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet1/RouteTable + NeuronProblemDetectorVPCPrivateSubnet1RouteTableAssociationE6D42BF0: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: + Ref: NeuronProblemDetectorVPCPrivateSubnet1RouteTableC2B2760B + SubnetId: + Ref: NeuronProblemDetectorVPCPrivateSubnet1Subnet708A0901 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet1/RouteTableAssociation + NeuronProblemDetectorVPCPrivateSubnet1DefaultRoute1AD8D623: + Type: AWS::EC2::Route + Properties: + DestinationCidrBlock: 0.0.0.0/0 + NatGatewayId: + Ref: NeuronProblemDetectorVPCPublicSubnet1NATGateway34AE13E8 + RouteTableId: + Ref: NeuronProblemDetectorVPCPrivateSubnet1RouteTableC2B2760B + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet1/DefaultRoute + NeuronProblemDetectorVPCPrivateSubnet2Subnet3B7C3437: + Type: AWS::EC2::Subnet + Properties: + AvailabilityZone: + Fn::Select: + - 1 + - Fn::GetAZs: "" + CidrBlock: 10.0.192.0/18 + 
MapPublicIpOnLaunch: false + Tags: + - Key: aws-cdk:subnet-name + Value: Private + - Key: aws-cdk:subnet-type + Value: Private + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet2 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet2/Subnet + NeuronProblemDetectorVPCPrivateSubnet2RouteTableD4FE42D0: + Type: AWS::EC2::RouteTable + Properties: + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet2 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet2/RouteTable + NeuronProblemDetectorVPCPrivateSubnet2RouteTableAssociationCB083593: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: + Ref: NeuronProblemDetectorVPCPrivateSubnet2RouteTableD4FE42D0 + SubnetId: + Ref: NeuronProblemDetectorVPCPrivateSubnet2Subnet3B7C3437 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet2/RouteTableAssociation + NeuronProblemDetectorVPCPrivateSubnet2DefaultRoute7B853FC0: + Type: AWS::EC2::Route + Properties: + DestinationCidrBlock: 0.0.0.0/0 + NatGatewayId: + Ref: NeuronProblemDetectorVPCPublicSubnet2NATGateway475CF308 + RouteTableId: + Ref: NeuronProblemDetectorVPCPrivateSubnet2RouteTableD4FE42D0 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet2/DefaultRoute + NeuronProblemDetectorVPCIGW3EC7DAA5: + Type: AWS::EC2::InternetGateway + Properties: + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/IGW + NeuronProblemDetectorVPCVPCGW5182937C: + Type: AWS::EC2::VPCGatewayAttachment + Properties: + InternetGatewayId: + Ref: NeuronProblemDetectorVPCIGW3EC7DAA5 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + 
aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/VPCGW + NeuronProblemDetectorVPCRestrictDefaultSecurityGroupCustomResource90BF6F18: + Type: Custom::VpcRestrictDefaultSG + Properties: + ServiceToken: + Fn::GetAtt: + - CustomVpcRestrictDefaultSGCustomResourceProviderHandlerDC833E5E + - Arn + DefaultSecurityGroupId: + Fn::GetAtt: + - NeuronProblemDetectorVPC5F617726 + - DefaultSecurityGroup + Account: + Ref: AWS::AccountId + UpdateReplacePolicy: Delete + DeletionPolicy: Delete + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/RestrictDefaultSecurityGroupCustomResource/Default + CustomVpcRestrictDefaultSGCustomResourceProviderRole26592FE0: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: "2012-10-17" + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: lambda.amazonaws.com + ManagedPolicyArns: + - Fn::Sub: arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole + Policies: + - PolicyName: Inline + PolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - ec2:AuthorizeSecurityGroupIngress + - ec2:AuthorizeSecurityGroupEgress + - ec2:RevokeSecurityGroupIngress + - ec2:RevokeSecurityGroupEgress + Resource: + - Fn::Join: + - "" + - - "arn:" + - Ref: AWS::Partition + - ":ec2:" + - Ref: AWS::Region + - ":" + - Ref: AWS::AccountId + - :security-group/ + - Fn::GetAtt: + - NeuronProblemDetectorVPC5F617726 + - DefaultSecurityGroup + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/Custom::VpcRestrictDefaultSGCustomResourceProvider/Role + CustomVpcRestrictDefaultSGCustomResourceProviderHandlerDC833E5E: + Type: AWS::Lambda::Function + Properties: + Code: + S3Bucket: + Fn::Sub: cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region} + S3Key: ee7de53d64cc9d6248fa6aa550f92358f6c907b5efd6f3298aeab1b5e7ea358a.zip + Timeout: 900 + MemorySize: 128 + Handler: __entrypoint__.handler + Role: + Fn::GetAtt: + - 
CustomVpcRestrictDefaultSGCustomResourceProviderRole26592FE0 + - Arn + Runtime: + Fn::FindInMap: + - LatestNodeRuntimeMap + - Ref: AWS::Region + - value + Description: Lambda function for removing all inbound/outbound rules from the VPC default security group + DependsOn: + - CustomVpcRestrictDefaultSGCustomResourceProviderRole26592FE0 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/Custom::VpcRestrictDefaultSGCustomResourceProvider/Handler + aws:asset:path: asset.ee7de53d64cc9d6248fa6aa550f92358f6c907b5efd6f3298aeab1b5e7ea358a + aws:asset:property: Code + NeuronProblemDetectorClusterED21CFD2: + Type: AWS::ECS::Cluster + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/Resource + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceSecurityGroupC637EF03: + Type: AWS::EC2::SecurityGroup + Properties: + GroupDescription: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/InstanceSecurityGroup + SecurityGroupEgress: + - CidrIp: 0.0.0.0/0 + Description: Allow all outbound traffic by default + IpProtocol: "-1" + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/InstanceSecurityGroup/Resource + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRole4CDFA2E5: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: ec2.amazonaws.com + Version: "2012-10-17" + ManagedPolicyArns: + - Fn::Join: + - "" + - - "arn:" + - Ref: AWS::Partition + - :iam::aws:policy/AmazonSSMManagedInstanceCore + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + Metadata: + aws:cdk:path: 
NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/InstanceRole/Resource + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRoleDefaultPolicy1F8A3A48: + Type: AWS::IAM::Policy + Properties: + PolicyDocument: + Statement: + - Action: + - ecs:DeregisterContainerInstance + - ecs:RegisterContainerInstance + - ecs:Submit* + Effect: Allow + Resource: + Fn::GetAtt: + - NeuronProblemDetectorClusterED21CFD2 + - Arn + - Action: + - ecs:Poll + - ecs:StartTelemetrySession + Condition: + ArnEquals: + ecs:cluster: + Fn::GetAtt: + - NeuronProblemDetectorClusterED21CFD2 + - Arn + Effect: Allow + Resource: "*" + - Action: + - ecr:GetAuthorizationToken + - ecs:DiscoverPollEndpoint + - logs:CreateLogStream + - logs:PutLogEvents + Effect: Allow + Resource: "*" + Version: "2012-10-17" + PolicyName: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRoleDefaultPolicy1F8A3A48 + Roles: + - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRole4CDFA2E5 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/InstanceRole/DefaultPolicy/Resource + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceProfile11E4E5E2: + Type: AWS::IAM::InstanceProfile + Properties: + Roles: + - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRole4CDFA2E5 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/InstanceProfile + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLaunchTemplateF1F92126: + Type: AWS::EC2::LaunchTemplate + Properties: + LaunchTemplateData: + IamInstanceProfile: + Arn: + Fn::GetAtt: + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceProfile11E4E5E2 + - Arn + ImageId: + Ref: SsmParameterValueawsserviceecsoptimizedamiamazonlinux2infrecommendedimageidC96584B6F00A464EAD1953AFF4B05118Parameter + InstanceType: inf2.xlarge + 
Monitoring: + Enabled: false + SecurityGroupIds: + - Fn::GetAtt: + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceSecurityGroupC637EF03 + - GroupId + TagSpecifications: + - ResourceType: instance + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LaunchTemplate + - ResourceType: volume + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LaunchTemplate + UserData: + Fn::Base64: + Fn::Join: + - "" + - - |- + #!/bin/bash + echo ECS_CLUSTER= + - Ref: NeuronProblemDetectorClusterED21CFD2 + - " >> /etc/ecs/ecs.config" + TagSpecifications: + - ResourceType: launch-template + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LaunchTemplate + DependsOn: + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRoleDefaultPolicy1F8A3A48 + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRole4CDFA2E5 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LaunchTemplate/Resource + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityASGDE9EB8FF: + Type: AWS::AutoScaling::AutoScalingGroup + Properties: + DesiredCapacity: "1" + LaunchTemplate: + LaunchTemplateId: + Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLaunchTemplateF1F92126 + Version: + Fn::GetAtt: + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLaunchTemplateF1F92126 + - LatestVersionNumber + MaxSize: "3" + MinSize: "1" + Tags: + - Key: Name + PropagateAtLaunch: true + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + VPCZoneIdentifier: + - Ref: NeuronProblemDetectorVPCPrivateSubnet1Subnet708A0901 + - Ref: NeuronProblemDetectorVPCPrivateSubnet2Subnet3B7C3437 + UpdatePolicy: + AutoScalingReplacingUpdate: + WillReplace: true + 
AutoScalingScheduledAction: + IgnoreUnmodifiedGroupSizeProperties: true + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/ASG + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: lambda.amazonaws.com + Version: "2012-10-17" + ManagedPolicyArns: + - Fn::Join: + - "" + - - "arn:" + - Ref: AWS::Partition + - :iam::aws:policy/service-role/AWSLambdaBasicExecutionRole + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/DrainECSHook/Function/ServiceRole/Resource + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRoleDefaultPolicy91C029B7: + Type: AWS::IAM::Policy + Properties: + PolicyDocument: + Statement: + - Action: + - ec2:DescribeHosts + - ec2:DescribeInstanceAttribute + - ec2:DescribeInstanceStatus + - ec2:DescribeInstances + Effect: Allow + Resource: "*" + - Action: autoscaling:CompleteLifecycleAction + Effect: Allow + Resource: + Fn::Join: + - "" + - - "arn:" + - Ref: AWS::Partition + - ":autoscaling:" + - Ref: AWS::Region + - ":" + - Ref: AWS::AccountId + - :autoScalingGroup:*:autoScalingGroupName/ + - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityASGDE9EB8FF + - Action: + - ecs:DescribeContainerInstances + - ecs:DescribeTasks + - ecs:ListTasks + - ecs:UpdateContainerInstancesState + Condition: + ArnEquals: + ecs:cluster: + Fn::GetAtt: + - NeuronProblemDetectorClusterED21CFD2 + - Arn + Effect: Allow + Resource: "*" + - Action: + - ecs:ListContainerInstances + - ecs:SubmitContainerStateChange + - ecs:SubmitTaskStateChange + Effect: Allow + Resource: + Fn::GetAtt: + - 
NeuronProblemDetectorClusterED21CFD2 + - Arn + Version: "2012-10-17" + PolicyName: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRoleDefaultPolicy91C029B7 + Roles: + - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/DrainECSHook/Function/ServiceRole/DefaultPolicy/Resource + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunction1625CD7D: + Type: AWS::Lambda::Function + Properties: + Code: + ZipFile: | + import boto3, json, os, time + + ecs = boto3.client('ecs') + autoscaling = boto3.client('autoscaling') + + + def lambda_handler(event, context): + print(json.dumps(dict(event, ResponseURL='...'))) + cluster = os.environ['CLUSTER'] + snsTopicArn = event['Records'][0]['Sns']['TopicArn'] + lifecycle_event = json.loads(event['Records'][0]['Sns']['Message']) + instance_id = lifecycle_event.get('EC2InstanceId') + if not instance_id: + print('Got event without EC2InstanceId: %s', json.dumps(dict(event, ResponseURL='...'))) + return + + instance_arn = container_instance_arn(cluster, instance_id) + print('Instance %s has container instance ARN %s' % (lifecycle_event['EC2InstanceId'], instance_arn)) + + if not instance_arn: + return + + task_arns = container_instance_task_arns(cluster, instance_arn) + + if task_arns: + print('Instance ARN %s has task ARNs %s' % (instance_arn, ', '.join(task_arns))) + + while has_tasks(cluster, instance_arn, task_arns): + time.sleep(10) + + try: + print('Terminating instance %s' % instance_id) + autoscaling.complete_lifecycle_action( + LifecycleActionResult='CONTINUE', + **pick(lifecycle_event, 'LifecycleHookName', 'LifecycleActionToken', 'AutoScalingGroupName')) + except Exception as e: + # Lifecycle action may have already completed. 
+ print(str(e)) + + + def container_instance_arn(cluster, instance_id): + """Turn an instance ID into a container instance ARN.""" + arns = ecs.list_container_instances(cluster=cluster, filter='ec2InstanceId==' + instance_id)['containerInstanceArns'] + if not arns: + return None + return arns[0] + + def container_instance_task_arns(cluster, instance_arn): + """Fetch tasks for a container instance ARN.""" + arns = ecs.list_tasks(cluster=cluster, containerInstance=instance_arn)['taskArns'] + return arns + + def has_tasks(cluster, instance_arn, task_arns): + """Return True if the instance is running tasks for the given cluster.""" + instances = ecs.describe_container_instances(cluster=cluster, containerInstances=[instance_arn])['containerInstances'] + if not instances: + return False + instance = instances[0] + + if instance['status'] == 'ACTIVE': + # Start draining, then try again later + set_container_instance_to_draining(cluster, instance_arn) + return True + + task_count = None + + if task_arns: + # Fetch details for tasks running on the container instance + tasks = ecs.describe_tasks(cluster=cluster, tasks=task_arns)['tasks'] + if tasks: + # Consider any non-stopped tasks as running + task_count = sum(task['lastStatus'] != 'STOPPED' for task in tasks) + instance['pendingTasksCount'] + + if not task_count: + # Fallback to instance task counts if detailed task information is unavailable + task_count = instance['runningTasksCount'] + instance['pendingTasksCount'] + + print('Instance %s has %s tasks' % (instance_arn, task_count)) + + return task_count > 0 + + def set_container_instance_to_draining(cluster, instance_arn): + ecs.update_container_instances_state( + cluster=cluster, + containerInstances=[instance_arn], status='DRAINING') + + + def pick(dct, *keys): + """Pick a subset of a dict.""" + return {k: v for k, v in dct.items() if k in keys} + Environment: + Variables: + CLUSTER: + Ref: NeuronProblemDetectorClusterED21CFD2 + Handler: index.lambda_handler + Role: 
+ Fn::GetAtt: + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389 + - Arn + Runtime: python3.9 + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + Timeout: 310 + DependsOn: + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRoleDefaultPolicy91C029B7 + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/DrainECSHook/Function/Resource + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionAllowInvokeNeuronProblemDetectorStackNeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicA5DCEF0A8A7A5064: + Type: AWS::Lambda::Permission + Properties: + Action: lambda:InvokeFunction + FunctionName: + Fn::GetAtt: + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunction1625CD7D + - Arn + Principal: sns.amazonaws.com + SourceArn: + Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/DrainECSHook/Function/AllowInvoke:NeuronProblemDetectorStackNeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicA5DCEF0A + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionTopicBAF651D7: + Type: AWS::SNS::Subscription + Properties: + Endpoint: + Fn::GetAtt: + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunction1625CD7D + - Arn + Protocol: lambda + TopicArn: + Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 + Metadata: + aws:cdk:path: 
NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/DrainECSHook/Function/Topic/Resource + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430: + Type: AWS::SNS::Topic + Properties: + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LifecycleHookDrainHook/Topic/Resource + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: autoscaling.amazonaws.com + Version: "2012-10-17" + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LifecycleHookDrainHook/Role/Resource + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRoleDefaultPolicy30C24756: + Type: AWS::IAM::Policy + Properties: + PolicyDocument: + Statement: + - Action: sns:Publish + Effect: Allow + Resource: + Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 + Version: "2012-10-17" + PolicyName: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRoleDefaultPolicy30C24756 + Roles: + - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LifecycleHookDrainHook/Role/DefaultPolicy/Resource + NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookC7D53AF2: + Type: AWS::AutoScaling::LifecycleHook + Properties: + 
AutoScalingGroupName: + Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityASGDE9EB8FF + DefaultResult: CONTINUE + HeartbeatTimeout: 300 + LifecycleTransition: autoscaling:EC2_INSTANCE_TERMINATING + NotificationTargetARN: + Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 + RoleARN: + Fn::GetAtt: + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48 + - Arn + DependsOn: + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRoleDefaultPolicy30C24756 + - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LifecycleHookDrainHook/Resource + NeuronProblemDetectorTaskExecutionRole563D2650: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: ecs-tasks.amazonaws.com + Version: "2012-10-17" + ManagedPolicyArns: + - Fn::Join: + - "" + - - "arn:" + - Ref: AWS::Partition + - :iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorTaskExecutionRole/Resource + NeuronProblemDetectorTaskExecutionRoleDefaultPolicy8DBFC0EE: + Type: AWS::IAM::Policy + Properties: + PolicyDocument: + Statement: + - Action: + - logs:CreateLogStream + - logs:PutLogEvents + Effect: Allow + Resource: + - Fn::GetAtt: + - NpdLogGroup39A02E3D + - Arn + - Fn::GetAtt: + - RecoveryLogGroupF6D50671 + - Arn + Version: "2012-10-17" + PolicyName: NeuronProblemDetectorTaskExecutionRoleDefaultPolicy8DBFC0EE + Roles: + - Ref: NeuronProblemDetectorTaskExecutionRole563D2650 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorTaskExecutionRole/DefaultPolicy/Resource + NeuronProblemDetectorTaskRole673752FB: + Type: AWS::IAM::Role + 
Properties: + AssumeRolePolicyDocument: + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: ecs-tasks.amazonaws.com + Version: "2012-10-17" + Policies: + - PolicyDocument: + Statement: + - Action: + - autoscaling:DescribeAutoScalingInstances + - autoscaling:SetInstanceHealth + - cloudwatch:PutMetricData + - ec2:DescribeInstances + Effect: Allow + Resource: "*" + Version: "2012-10-17" + PolicyName: node-recovery + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorTaskRole/Resource + NeuronProblemDetectorTaskRoleDefaultPolicyCFCDEF04: + Type: AWS::IAM::Policy + Properties: + PolicyDocument: + Statement: + - Action: + - logs:CreateLogStream + - logs:DescribeLogGroups + - logs:DescribeLogStreams + - logs:PutLogEvents + - ssmmessages:CreateControlChannel + - ssmmessages:CreateDataChannel + - ssmmessages:OpenControlChannel + - ssmmessages:OpenDataChannel + Effect: Allow + Resource: "*" + Version: "2012-10-17" + PolicyName: NeuronProblemDetectorTaskRoleDefaultPolicyCFCDEF04 + Roles: + - Ref: NeuronProblemDetectorTaskRole673752FB + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorTaskRole/DefaultPolicy/Resource + NeuronNpdAndRecoveryTaskDef7591F251: + Type: AWS::ECS::TaskDefinition + Properties: + ContainerDefinitions: + - Command: + - echo 
'{"plugin":"kmsg","logPath":"/dev/kmsg","lookback":"5m","bufferSize":10,"source":"kernel-monitor","conditions":[{"type":"NeuronHealth","reason":"NeuronHasNoError","message":"Neuronhasnoerror"}],"rules":[{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_SRAM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_NC_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_HBM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_DMA_ERROR","pattern":".*NEURON_HW_ERR=DMA_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_HANG_ON_COLLECTIVES","pattern":".*NEURON_HW_ERR=HANG_ON_COLLECTIVES.*"}]}' > /config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json + EntryPoint: + - /bin/sh + - -c + Essential: true + Image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19 + LinuxParameters: + Capabilities: {} + Devices: + - ContainerPath: /dev/kmsg + HostPath: /dev/kmsg + Permissions: + - read + - write + LogConfiguration: + LogDriver: awslogs + Options: + awslogs-group: + Ref: NpdLogGroup39A02E3D + awslogs-stream-prefix: ecs + awslogs-region: + Ref: AWS::Region + Name: npd + PortMappings: + - AppProtocol: http + ContainerPort: 80 + HostPort: 80 + Name: npd-80-tcp + Protocol: tcp + Privileged: true + - Command: + - python scripts/check-health.py + EntryPoint: + - /bin/sh + - -c + Environment: + - Name: ENABLE_RECOVERY + Value: "true" + Essential: true + Image: public.ecr.aws/neuron/neuron-node-recovery:1.2.0 + LogConfiguration: + LogDriver: awslogs + Options: + awslogs-group: + Ref: RecoveryLogGroupF6D50671 + awslogs-stream-prefix: ecs + 
awslogs-region: + Ref: AWS::Region + Name: recovery + ReadonlyRootFilesystem: true + Cpu: "1024" + ExecutionRoleArn: + Fn::GetAtt: + - NeuronProblemDetectorTaskExecutionRole563D2650 + - Arn + Family: neuron-npd-and-recovery + Memory: "3072" + NetworkMode: awsvpc + RequiresCompatibilities: + - EC2 + TaskRoleArn: + Fn::GetAtt: + - NeuronProblemDetectorTaskRole673752FB + - Arn + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronNpdAndRecoveryTaskDef/Resource + NpdLogGroup39A02E3D: + Type: AWS::Logs::LogGroup + Properties: + LogGroupName: /ecs/npd + RetentionInDays: 7 + UpdateReplacePolicy: Retain + DeletionPolicy: Retain + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NpdLogGroup/Resource + RecoveryLogGroupF6D50671: + Type: AWS::Logs::LogGroup + Properties: + LogGroupName: /ecs/recovery + RetentionInDays: 7 + UpdateReplacePolicy: Retain + DeletionPolicy: Retain + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/RecoveryLogGroup/Resource + NeuronNpdAndRecoveryDaemonService03BA6456: + Type: AWS::ECS::Service + Properties: + Cluster: + Ref: NeuronProblemDetectorClusterED21CFD2 + DeploymentConfiguration: + MaximumPercent: 100 + MinimumHealthyPercent: 0 + EnableECSManagedTags: false + EnableExecuteCommand: true + LaunchType: EC2 + NetworkConfiguration: + AwsvpcConfiguration: + AssignPublicIp: DISABLED + SecurityGroups: + - Fn::GetAtt: + - NeuronNpdAndRecoveryDaemonServiceSecurityGroupC5B1D29B + - GroupId + Subnets: + - Ref: NeuronProblemDetectorVPCPrivateSubnet1Subnet708A0901 + - Ref: NeuronProblemDetectorVPCPrivateSubnet2Subnet3B7C3437 + SchedulingStrategy: DAEMON + TaskDefinition: + Ref: NeuronNpdAndRecoveryTaskDef7591F251 + DependsOn: + - NeuronProblemDetectorTaskRoleDefaultPolicyCFCDEF04 + - NeuronProblemDetectorTaskRole673752FB + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronNpdAndRecoveryDaemonService/Service + NeuronNpdAndRecoveryDaemonServiceSecurityGroupC5B1D29B: + Type: AWS::EC2::SecurityGroup + Properties: + GroupDescription: 
NeuronProblemDetectorStack/NeuronNpdAndRecoveryDaemonService/SecurityGroup + SecurityGroupEgress: + - CidrIp: 0.0.0.0/0 + Description: Allow all outbound traffic by default + IpProtocol: "-1" + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + DependsOn: + - NeuronProblemDetectorTaskRoleDefaultPolicyCFCDEF04 + - NeuronProblemDetectorTaskRole673752FB + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronNpdAndRecoveryDaemonService/SecurityGroup/Resource + CDKMetadata: + Type: AWS::CDK::Metadata + Properties: + Analytics: v2:deflate64:H4sIAAAAAAAA/2VSXW/bMAz8LX1XtDkb+p5lXRcgK4Q46GtBK0zCRZYMfaQLDP/30vJcJ+0Tj2eaOh45l8X9d1ncwWuY6d1pZqiSbRlBnwRTLy3quWyfGy2We/uslkKlypAuU2Ux9tyENi5F3EJlcOInbhGC0wSRnH0v7sHDSvXhCeIjRHyFi1Cezgynxisb0TMeCwYl/7NFZK3HGm0UJerkKV4evUtN1nBDrCFZfdxi3RgY3r5lOoE6yHZpUuD3+u8j3EI4/cQ9WRrlf2ScjUAW/RW3Jpv+KfBQI/cI4kHPS/Rn0oM9A+wEpOiCBkP2INsFJ+WQvA/xiVvTHvVFG/zt3CmPcU10gqCW7cYNa8hROV5Z9m1CKxsiWI3Kuz0ZFmKgrnYg21/syTjmNVboawqBs04Ey0bxgoL21IwFN/nWNZRvJgPu7g78y9pNc42468QGg0u+N4b9dvWUsv4r/Aeahl34UMUDnGmH/gcEFHxlGPl6D7mORY/+53U6u8u76YS6xKOzX77JopDF17u/gWjmk41Uo9wM8Q0+wiv5GQMAAA== + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/CDKMetadata/Default + Condition: CDKMetadataAvailable +Mappings: + LatestNodeRuntimeMap: + af-south-1: + value: nodejs20.x + ap-east-1: + value: nodejs20.x + ap-northeast-1: + value: nodejs20.x + ap-northeast-2: + value: nodejs20.x + ap-northeast-3: + value: nodejs20.x + ap-south-1: + value: nodejs20.x + ap-south-2: + value: nodejs20.x + ap-southeast-1: + value: nodejs20.x + ap-southeast-2: + value: nodejs20.x + ap-southeast-3: + value: nodejs20.x + ap-southeast-4: + value: nodejs20.x + ap-southeast-5: + value: nodejs20.x + ap-southeast-7: + value: nodejs20.x + ca-central-1: + value: nodejs20.x + ca-west-1: + value: nodejs20.x + cn-north-1: + value: nodejs18.x + cn-northwest-1: + value: nodejs18.x + eu-central-1: + value: nodejs20.x + eu-central-2: + value: nodejs20.x + eu-north-1: + value: nodejs20.x + eu-south-1: + value: 
nodejs20.x + eu-south-2: + value: nodejs20.x + eu-west-1: + value: nodejs20.x + eu-west-2: + value: nodejs20.x + eu-west-3: + value: nodejs20.x + il-central-1: + value: nodejs20.x + me-central-1: + value: nodejs20.x + me-south-1: + value: nodejs20.x + mx-central-1: + value: nodejs20.x + sa-east-1: + value: nodejs20.x + us-east-1: + value: nodejs20.x + us-east-2: + value: nodejs20.x + us-west-1: + value: nodejs20.x + us-west-2: + value: nodejs20.x +Parameters: + SsmParameterValueawsserviceecsoptimizedamiamazonlinux2infrecommendedimageidC96584B6F00A464EAD1953AFF4B05118Parameter: + Type: AWS::SSM::Parameter::Value + Default: /aws/service/ecs/optimized-ami/amazon-linux-2/inf/recommended/image_id + BootstrapVersion: + Type: AWS::SSM::Parameter::Value + Default: /cdk-bootstrap/hnb659fds/version + Description: Version of the CDK Bootstrap resources in this environment, automatically retrieved from SSM Parameter Store. [cdk:skip] +Conditions: + CDKMetadataAvailable: + Fn::Or: + - Fn::Or: + - Fn::Equals: + - Ref: AWS::Region + - af-south-1 + - Fn::Equals: + - Ref: AWS::Region + - ap-east-1 + - Fn::Equals: + - Ref: AWS::Region + - ap-northeast-1 + - Fn::Equals: + - Ref: AWS::Region + - ap-northeast-2 + - Fn::Equals: + - Ref: AWS::Region + - ap-northeast-3 + - Fn::Equals: + - Ref: AWS::Region + - ap-south-1 + - Fn::Equals: + - Ref: AWS::Region + - ap-south-2 + - Fn::Equals: + - Ref: AWS::Region + - ap-southeast-1 + - Fn::Equals: + - Ref: AWS::Region + - ap-southeast-2 + - Fn::Equals: + - Ref: AWS::Region + - ap-southeast-3 + - Fn::Or: + - Fn::Equals: + - Ref: AWS::Region + - ap-southeast-4 + - Fn::Equals: + - Ref: AWS::Region + - ca-central-1 + - Fn::Equals: + - Ref: AWS::Region + - ca-west-1 + - Fn::Equals: + - Ref: AWS::Region + - cn-north-1 + - Fn::Equals: + - Ref: AWS::Region + - cn-northwest-1 + - Fn::Equals: + - Ref: AWS::Region + - eu-central-1 + - Fn::Equals: + - Ref: AWS::Region + - eu-central-2 + - Fn::Equals: + - Ref: AWS::Region + - eu-north-1 + - Fn::Equals: + - 
Ref: AWS::Region + - eu-south-1 + - Fn::Equals: + - Ref: AWS::Region + - eu-south-2 + - Fn::Or: + - Fn::Equals: + - Ref: AWS::Region + - eu-west-1 + - Fn::Equals: + - Ref: AWS::Region + - eu-west-2 + - Fn::Equals: + - Ref: AWS::Region + - eu-west-3 + - Fn::Equals: + - Ref: AWS::Region + - il-central-1 + - Fn::Equals: + - Ref: AWS::Region + - me-central-1 + - Fn::Equals: + - Ref: AWS::Region + - me-south-1 + - Fn::Equals: + - Ref: AWS::Region + - sa-east-1 + - Fn::Equals: + - Ref: AWS::Region + - us-east-1 + - Fn::Equals: + - Ref: AWS::Region + - us-east-2 + - Fn::Equals: + - Ref: AWS::Region + - us-west-1 + - Fn::Equals: + - Ref: AWS::Region + - us-west-2 + diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__init__.py b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__pycache__/__init__.cpython-311.pyc b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6be5a9d166e3c7a55c95865450f4b1c782e569a8 GIT binary patch literal 196 zcmZ3^%ge<81Zu%b=^*+sh=2h`DC095kTIPhg&~+hlhJP_LlF~@{~09tD_lRcIJKx) zKQXa5BQ>))Lq9LIv?xC>zMv>SDJL~IJ|(pzHMu0eNIx~XSU0aAMK?Jm8>gE1_{_Y_ rlK6PNg34bUHo5sJr8%i~MXW%3KrSif2NEBc85tQrFu;f+W}p}V1z0m7 literal 0 HcmV?d00001 diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__pycache__/neuron_problem_detector_stack.cpython-311.pyc b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__pycache__/neuron_problem_detector_stack.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b151bd27e64c558a71073269052d786e62153489 GIT binary patch literal 7409 zcmb_hO>7&-6<+=?e?|RMltleGmT8NmtjMvg)@fymj*Z%sAUScXAla^VN7PE(U3Pcr zNUQ|(P@pQ1q&>Ju3izNyp`@@5x%A#+kF-I+iva`#D01+PLS&%msc&|bH1$e+ 
zJ3nt`-n{qby^q5$!r=e~&)KEX0vD&Kf0Byzcp8Oo^HBJV;wj!yq(sNELv${?ME9~s z^e+2Edf6`qmV*w`?<|JI@N(EeIjMIj-t}9GcO%EXi=rOE$9$F}yl06@dY_SM(u+My z8p~~9AN!HYAa2SksAfT6MNE^fqFhjM8!546O;$O!C`g5*1N*bGq-si;)40Y0#>?BN zd^-V!-+}OmT6XZ%vXgfp7w<%F-o?9Dy}Q(^gZKP4wCvfXHXX}eA{dV6QCX3t1w~#d zB5@9Bh|^@n6w{+}atYxOXRXQzJoYL`D=Si-Yi<%2ZHG?~KHqNoDC$p+&p;)b1L}=u zAKs6w3bo$s*>-F@SDga2?fQ%3Q-_1nEF%E#dUO75Fa_^CaM@M6ma}R?6tI7Z4_nWZLYs=xE0~rhrK;&?vMBJG*8=h zMYLo*y#FQM03SFA?!&5&nk$1w`Ye9XNZ2aF9z(XCh}FY~?Xzkzs|UunS>?T`Rcd|n z5&PJJDckxR)xT8KKu|rWSg(w>RBJKN~iP> zTKxs*!LZ$Tb^^B55*8=yab({Ka>2)Zc7of%y&c$99EehwMjT) zweO8urPen;W}lZo!@mJ1}NCn{#|pyJ^*-@2pi`AGb;e z=k4}CxM0`7cJN?gA0pnPX4KmWZHM;GS>s#ZHmxSD9zMmV`Ls<3=dHf=3s%Y2Pm+}J ztf#EfL1rJ~_oHSEw`=JY+Ml$|{NJ>H(LQfO`%|F(H~DGdhuGi}|JMI&gM-U2Ss-lF zblTDpSp3SiXWP${JaKg&a@nJ1KHG`#6lC+6JdT~1&fi0bqsZk_mMyWIplv-PIry*W zcat=B0#pc#EVm{|h!I4#062my%{7KXO^{yHqC8Rs1@SLh!U8}YR^kw&Z2<`BRaKEu zk*W%^#FUUC3Pw@IV;l?f$dbmwN(vw*jsq^HIa-m62%kN1vv*l_Be#jTvIdhb0$-&U zacW9~W;M->}9yziD z75f#W%4G!^*@_nShwiR*&aho6kU`5UC;(l|Me2Jfx_{9b{(cK(ct@KI; z0!>LJKE+CWb2jY0I5{#xAZ@#sLwDva-d9 z!U_(9wMwietO!MLP#i&)eHp_M{KU9MvEtA0=zeBHR12U&|C+373=CYuk)|`5^oDySd;Hs4BbTv5}4oG~<0GGTR!Z{X^aTW9W1rmv(g zUaj~tD}t0!*DB5w$KC+gg}60&c>LF+rDC}tNTV~OMBLGdQTT$e0J1_ua}!3Zys-k` zFQEC72=$dR_<*t`RM6KU0NRd!6CCZWjU>`VP%e-Lt(CEzjzaAwg4%keq znLY_j3Sw262+?R~)x4Zj6k`Gr&K$USvvZAAB^eo=J0HN%auKPmQxl!C61Z6F;?2>o zSkhRKSz4UE!@Qf%&Mz+JvUg{%-_9|)#l`tWmQE$Yg0hsIK$U` zjfVpRH6*d{T|5PGB!xChvI6dWZIvym2>Q%r(&`puX_1Vx4)L{DNPu0C)^L!7F1cVN z@Cu!QrVIowNL3M-hAEJ|&PWi`NEXrHWI{%P&M?F@#EMKwL94jU!#)@V@vDHNkQd6E5Ld0>SqXp)sVkEyjVnD{AP~ia z-G&scVHk8^MQBo_A*^aLCl_&uEtNpDMvb--QDw4JUV&7bMx2s{6rM3tOyiZZC9Ro@ z>8bQ&B~jYa*1%XK*OoLj!>u81BeiDa<8)~Yx8?Fin1OgR|6XqK7uXH=FbGGjY}V9l z*wCycNz$wQ1a_tTl#p2<7_@a?fEyQ*UvUg{%ga)63sRD-F{_|2)3i#K@Q#>~77W*t zH|JQ$cG8lGN0M&PxQW?tn~~diuoHxJupajE1}p9{J&G}NAp*2ym?)XS9<>bd4$}}Wv|Bu*%?MI63k!X^__;ek~}78!CW2LAN| zPHmY^8!@@&WDZalXtvF)!$p<}Ww1AA03z&}y?uNB9*#8Nho!4v-h>r*2&(YS-dkb- 
zG~;mAh9+FO>=gFSA%X;P)O;^Au_$aLH#>(t_ZDy6&Ed9}VM36o5FiJPRdLwRr-fQ^ zBx@BL+BEiRK?)q2{YV7|n}ZVt)^fvXQZa3L91bs9=XH+Vf{yuoj=7i1{S3F0MMwYv zjg?Ap{vZe}K=qDU!!+FbndiuLH@kp?v!znA#l5C!B^=0cQEI}%&796u?*m6ZUcgbat}nPjG2cRH2a+!*0go*h^#6BeE<3lrPHxb zJiqsS?5okS&%5_RRr)ggf8{mbK);E!e_Z$@UmbYs?~ZEZ`s2v;TIBk!>zkgw2OE25 zs^@O}BU|lRc-*s4>siSxaBlhgHX%0F^M1^t0zLh6J(AE1UtAFk=4 zs2-X=?2XrYFH|RTk9%*_dT)F~Rv392efvrDZM`cC>$G>kI_({Gmpc)u`~5?KBMQhd zAyvBnc@NB0OH3N$#=eeS)??S}9@k(`ou-DzK1YZBiH9?_{tNZ6uf5}l0&+~q^C;Ec z1zXVvr>g^(^r6ufPM@1_0y!q+dBEblTphUbD$e>~XDm{8brT7J91}t$OjG_)jqd+C zn$e>(AX|3?WQ+Cx5gm5+Jea96NJ-BJ;=)y}n(g!E1>DlVQ z^}65F4FiE36SC`vZS^N=@d;yCQtzBN>>GGERO_46qdj`>pkbg3U`@S0t@jMoI|KbN z9LO;tyKT?oRDAGZT<;mt`_IDAT3@QxH}$t{ZTe=*M06di~s_Ctz=)}6$sIt`y1+ncM#l65!L{(gJip^cAW&?6e$dQXC76Vc@ zNEekzJsq5RGB~4$dhC0-CQS?_|Xy7 zkpP*1RO$Hhb5w8qfn?YwqjwGI0~hqpUVUsF9Ilr*TraQ!*$tAvZy}se@SPd*I>Azq z*8>dsz6{^Opd4bDUzORS*`ttaTI>RpXGAL;ZUj&c6^jg$bSR&Y&_u3yeyV&5R3SfL z7=+wCs|*l`S=L|O{h`wn_*|${o>zJC~1{$Q<`PqvvqltEBd$kmeq5tCAW zP6)X>G%$-oZp9Rq5OS-jY!O0!QNh8MTNNAtByIkbrWz_Rw)9uZT#Rh=-}H}7jlHe> z8LC7B>K8!jE{DTWclsU9I`#TU7v&hKQp2tPb?+~o4)Dj5>0bAd(Q|xCzp;`307XvW A&;S4c literal 0 HcmV?d00001 diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json new file mode 100644 index 0000000..ee4eeab --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json @@ -0,0 +1,99 @@ +{ + "family": "neuron-npd-and-recovery", + "containerDefinitions": [ + { + "name": "npd", + "image": "registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19", + "cpu": 0, + "portMappings": [ + { + "name": "npd-80-tcp", + "containerPort": 80, + "hostPort": 80, + "protocol": "tcp", + "appProtocol": "http" + } + ], + "essential": true, + "entryPoint": [ + "/bin/sh", + "-c" + ], + "command": [ + "echo 
'{\"plugin\":\"kmsg\",\"logPath\":\"/dev/kmsg\",\"lookback\":\"5m\",\"bufferSize\":10,\"source\":\"kernel-monitor\",\"conditions\":[{\"type\":\"NeuronHealth\",\"reason\":\"NeuronHasNoError\",\"message\":\"Neuronhasnoerror\"}],\"rules\":[{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_SRAM_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_NC_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_HBM_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_DMA_ERROR\",\"pattern\":\".*NEURON_HW_ERR=DMA_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_HANG_ON_COLLECTIVES\",\"pattern\":\".*NEURON_HW_ERR=HANG_ON_COLLECTIVES.*\"}]}' > /config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json" + ], + "environment": [], + "mountPoints": [], + "volumesFrom": [], + "linuxParameters": { + "devices": [ + { + "hostPath": "/dev/kmsg", + "containerPath": "/dev/kmsg", + "permissions": [ + "read", + "write" + ] + } + ] + }, + "privileged": true, + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": "/ecs/npd", + "awslogs-create-group": "true", + "awslogs-region": "us-west-2", + "awslogs-stream-prefix": "ecs" + }, + "secretOptions": [] + }, + "systemControls": [] + }, + { + "name": "recovery", + "image": "public.ecr.aws/neuron/neuron-node-recovery:1.1.0", + "cpu": 0, + "portMappings": [], + "essential": true, + "entryPoint": [ + "/bin/sh", + "-c" + ], + "command": [ + "python scripts/check-health.py" + ], + "environment": [ + { + "name": 
"ENABLE_RECOVERY", + "value": "true" + } + ], + "mountPoints": [], + "volumesFrom": [], + "readonlyRootFilesystem": true, + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-create-group": "true", + "awslogs-group": "/ecs/recovery", + "awslogs-region": "us-west-2", + "awslogs-stream-prefix": "ecs" + } + }, + "systemControls": [] + } + ], + "executionRoleArn": "arn:aws:iam::367244320406:role/ecsTaskExecutionRole", + "taskRoleArn": "arn:aws:iam::367244320406:role/ecsTaskExecutionRole", + "networkMode": "awsvpc", + "requiresCompatibilities": [ + "EC2" + ], + "cpu": "1024", + "memory": "3072", + "runtimePlatform": { + "cpuArchitecture": "X86_64", + "operatingSystemFamily": "LINUX" + } +} \ No newline at end of file diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py new file mode 100644 index 0000000..5d46de8 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py @@ -0,0 +1,174 @@ +from aws_cdk import ( + # Duration, + Stack, + # aws_sqs as sqs, + aws_ec2 as ec2, + aws_ecs as ecs, + aws_iam as iam, + aws_logs as logs, + aws_autoscaling as autoscaling, +) +from constructs import Construct + + +class NeuronProblemDetectorStack(Stack): + + def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: + super().__init__(scope, construct_id, **kwargs) + + vpc = ec2.Vpc(self, "NeuronProblemDetectorVPC", max_azs=2) + + ecs_cluster = ecs.Cluster(self, "NeuronProblemDetectorCluster", vpc=vpc) + + ecs_cluster.add_capacity( + id="NeruonAutoScalingGroupCapacity", + machine_image=ecs.EcsOptimizedImage.amazon_linux2( + ecs.AmiHardwareType.NEURON + ), + max_capacity=3, + min_capacity=1, + desired_capacity=1, + instance_type=ec2.InstanceType("inf2.xlarge"), + ssm_session_permissions=True, + can_containers_access_instance_role=True, + ) + + # 
Create the task execution role + task_execution_role = iam.Role( + self, + "NeuronProblemDetectorTaskExecutionRole", + assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"), + managed_policies=[ + iam.ManagedPolicy.from_aws_managed_policy_name( + "service-role/AmazonECSTaskExecutionRolePolicy" + ), + ], + ) + + iam_policy_document = iam.PolicyDocument( + statements=[ + iam.PolicyStatement( + actions=[ + "autoscaling:SetInstanceHealth", + "autoscaling:DescribeAutoScalingInstances", + ], + resources=["*"], + effect=iam.Effect.ALLOW, + ), + iam.PolicyStatement( + actions=["ec2:DescribeInstances"], + resources=["*"], + effect=iam.Effect.ALLOW, + ), + iam.PolicyStatement( + actions=["cloudwatch:PutMetricData"], + resources=["*"], + effect=iam.Effect.ALLOW + ), + ] + ) + + iam.PolicyStatement( + actions=[ + "autoscaling:SetInstanceHealth", + "autoscaling:DescribeAutoScalingInstances", + ], + resources=["*"], + effect=iam.Effect.ALLOW, + ) + + # Create a task role (if needed) + task_role = iam.Role( + self, + "NeuronProblemDetectorTaskRole", + assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"), + inline_policies={"node-recovery": iam_policy_document}, + ) + + # Create an ECS Task Definition + task_definition = ecs.TaskDefinition( + self, + "NeuronNpdAndRecoveryTaskDef", + family="neuron-npd-and-recovery", + network_mode=ecs.NetworkMode.AWS_VPC, + cpu="1024", + memory_mib="3072", + compatibility=ecs.Compatibility.EC2, + execution_role=task_execution_role, + task_role=task_role + ) + + # Create the device mapping + device_mapping = ecs.Device( + host_path="/dev/kmsg", + container_path="/dev/kmsg", + permissions=[ecs.DevicePermission.READ, ecs.DevicePermission.WRITE], + ) + + linux_parameters = ecs.LinuxParameters( + self, + "NpdLinuxParameters", + ) + + linux_parameters.add_devices(device_mapping) + + npd_container = task_definition.add_container( + "npd", + image=ecs.ContainerImage.from_registry( + 
"registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19" + ), + entry_point=["/bin/sh", "-c"], + command=[ + 'echo \'{"plugin":"kmsg","logPath":"/dev/kmsg","lookback":"5m","bufferSize":10,"source":"kernel-monitor","conditions":[{"type":"NeuronHealth","reason":"NeuronHasNoError","message":"Neuronhasnoerror"}],"rules":[{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_SRAM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_NC_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_HBM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_DMA_ERROR","pattern":".*NEURON_HW_ERR=DMA_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_HANG_ON_COLLECTIVES","pattern":".*NEURON_HW_ERR=HANG_ON_COLLECTIVES.*"}]}\' > /config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json' + ], + privileged=True, + logging=ecs.AwsLogDriver( + stream_prefix="ecs", + log_group=logs.LogGroup( + self, + "NpdLogGroup", + log_group_name="/ecs/npd", + retention=logs.RetentionDays.ONE_WEEK, + ), + ), + linux_parameters=linux_parameters, + ) + + npd_container.add_port_mappings( + ecs.PortMapping( + name="npd-80-tcp", + container_port=80, + host_port=80, + protocol=ecs.Protocol.TCP, + app_protocol=ecs.AppProtocol.http, + ) + ) + + recovery_container = task_definition.add_container( + "recovery", + image=ecs.ContainerImage.from_registry( + "public.ecr.aws/neuron/neuron-node-recovery:1.2.0" + ), + entry_point=["/bin/sh", "-c"], + command=["python scripts/check-health.py"], + environment={"ENABLE_RECOVERY": "true"}, + readonly_root_filesystem=True, + 
logging=ecs.AwsLogDriver( + stream_prefix="ecs", + log_group=logs.LogGroup( + self, + "RecoveryLogGroup", + log_group_name="/ecs/recovery", + retention=logs.RetentionDays.ONE_WEEK, + ), + ), + ) + + ec2_service = ecs.Ec2Service( + self, + "NeuronNpdAndRecoveryDaemonService", + cluster=ecs_cluster, + task_definition=task_definition, + daemon=True, + enable_execute_command=True, + ) diff --git a/neuron-problem-detector/ecs-npd-cdk/requirements-dev.txt b/neuron-problem-detector/ecs-npd-cdk/requirements-dev.txt new file mode 100644 index 0000000..9270945 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/requirements-dev.txt @@ -0,0 +1 @@ +pytest==6.2.5 diff --git a/neuron-problem-detector/ecs-npd-cdk/requirements.txt b/neuron-problem-detector/ecs-npd-cdk/requirements.txt new file mode 100644 index 0000000..d5307a6 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/requirements.txt @@ -0,0 +1,2 @@ +aws-cdk-lib==2.152.0 +constructs>=10.0.0,<11.0.0 diff --git a/neuron-problem-detector/ecs-npd-cdk/source.bat b/neuron-problem-detector/ecs-npd-cdk/source.bat new file mode 100644 index 0000000..9e1a834 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/source.bat @@ -0,0 +1,13 @@ +@echo off + +rem The sole purpose of this script is to make the command +rem +rem source .venv/bin/activate +rem +rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. +rem On Windows, this command just runs this batch file (the argument is ignored). +rem +rem Now we don't need to document a Windows command for activating a virtualenv. 
+ +echo Executing .venv\Scripts\activate.bat for you +.venv\Scripts\activate.bat diff --git a/neuron-problem-detector/ecs-npd-cdk/tests/__init__.py b/neuron-problem-detector/ecs-npd-cdk/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/neuron-problem-detector/ecs-npd-cdk/tests/unit/__init__.py b/neuron-problem-detector/ecs-npd-cdk/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/neuron-problem-detector/ecs-npd-cdk/tests/unit/test_neuron_problem_detector_stack.py b/neuron-problem-detector/ecs-npd-cdk/tests/unit/test_neuron_problem_detector_stack.py new file mode 100644 index 0000000..d90e6b1 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/tests/unit/test_neuron_problem_detector_stack.py @@ -0,0 +1,18 @@ +import aws_cdk as core +import aws_cdk.assertions as assertions + +from neuron_problem_detector.neuron_problem_detector_stack import NeuronProblemDetectorStack + +# example tests. To run these tests, uncomment this file along with the example +# resource in neuron_problem_detector/neuron_problem_detector_stack.py +def test_sqs_queue_created(): + app = core.App() + stack = NeuronProblemDetectorStack(app, "neuron-problem-detector") + template = assertions.Template.from_stack(stack) + + template.has_resource_properties("AWS::ECS::Cluster",{}) + + +# template.has_resource_properties("AWS::SQS::Queue", { +# "VisibilityTimeout": 300 +# }) diff --git a/neuron-problem-detector/ecs-npd-cdk/tools/check-ecs-exec.sh b/neuron-problem-detector/ecs-npd-cdk/tools/check-ecs-exec.sh new file mode 100644 index 0000000..2a692a7 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/tools/check-ecs-exec.sh @@ -0,0 +1,717 @@ +#!/usr/bin/env bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +# shellcheck disable=SC2059 + +CHECKER_VERSION=v0.7 + +# Script Name: check-ecs-exec.sh +# Usage : bash ./check-ecs-exec.sh + +set -euo pipefail + +## NOTE: Checks in this script are mainly based on: +## +## "Using Amazon ECS Exec for debugging - Amazon Elastic Container Service" +## https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-exec.html +## +## "NEW – Using Amazon ECS Exec to access your containers on AWS Fargate and Amazon EC2" +## https://aws.amazon.com/blogs/containers/new-using-amazon-ecs-exec-access-your-containers-fargate-ec2/ +## + +## NOTE: This script at least needs the following permissions. +## 1. If you use an IAM user with an assumed role to run the script, +## then you need to allow the "iam:ListRoles" action in addition to the following. +## 2. If you configured your ECS cluster to use KMS key for ECS Exec, +## then you need to allow the "kms:DescribeKey" action in addition to the following. +## { +## "Version": "2012-10-17", +## "Statement": [ +## { +## "Effect": "Allow", +## "Action": [ +## "iam:GetInstanceProfile", +## "iam:SimulatePrincipalPolicy", +## "ec2:DescribeSubnets", +## "ec2:DescribeVpcEndpoints", +## "ecs:DescribeClusters", +## "ecs:DescribeContainerInstances", +## "ecs:DescribeTaskDefinition", +## "ecs:DescribeTasks" +## ], +## "Resource": "*" +## } +## ] +## } + +# If you have multiple AWS CLI binaries, v1 and v2 for instance, you can choose which AWS CLI binary to use by setting the AWS_CLI_BIN env var. +# e.g. 
AWS_CLI_BIN=aws-v1 ./check-ecs-exec.sh YOUR_ECS_CLUSTER_NAME YOUR_ECS_TASK_ID +AWS_CLI_BIN=${AWS_CLI_BIN:-aws} + +# Force AWS CLI output format to json to use jq to parse its output +export AWS_DEFAULT_OUTPUT=json + +# Colors for output +COLOR_DEFAULT='\033[0m' +COLOR_RED='\033[0;31m' +COLOR_YELLOW='\033[1;33m' +COLOR_GREEN='\033[0;32m' + +# Validation for required parameters +CLUSTER_NAME=${1:-None} # A cluster name or a full ARN of the cluster +TASK_ID=${2:-None} # A task ID or a full ARN of the task +if [[ "${CLUSTER_NAME}" = "None" || "${TASK_ID}" = "None" ]]; then + printf "${COLOR_RED}Usage:\n" >&2 + printf " ./check-ecs-exec.sh YOUR_ECS_CLUSTER_NAME YOUR_ECS_TASK_ID${COLOR_DEFAULT}\n" >&2 + exit 1 +fi + +#### Functions +printSectionHeaderLine() { + printf "${COLOR_DEFAULT}-------------------------------------------------------------\n" +} +equalsOrGreaterVersion() { + required=$1 + current=$2 + if [[ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]]; then + return + fi + false +} +getRoleArnForAssumedRole() { + callerIdentityJson=$1 + ROLE_ID=$(echo "${callerIdentityJson}" | jq -r ".UserId" | cut -d: -f1) + aws iam list-roles --query "Roles[?RoleId=='${ROLE_ID}'].Arn" --output text +} +# For `iam simulate-principal-policy` +readEvalDecision() { + evalResultsJson=$1 + actionName=$2 + echo "${evalResultsJson}" | jq -r --arg ACTION_NAME "$actionName" '.EvaluationResults[] | select(.EvalActionName==$ACTION_NAME) | .EvalDecision' +} +showEvalResult() { + evalResult=$1 + actionName=$2 + printf "${COLOR_DEFAULT} ${actionName}: " + if [[ "${evalResult}" = "allowed" ]]; then + printf "${COLOR_GREEN}${evalResult}\n" + else + printf "${COLOR_RED}${evalResult}\n" + fi +} + +## 1. 
CHECK PREREQUISITES FOR check-ecs-exec.sh ########################################## +printSectionHeaderLine +printf "${COLOR_DEFAULT}Prerequisites for check-ecs-exec.sh ${CHECKER_VERSION}\n" +printSectionHeaderLine +########################################################################################## + +# Check if jq command exists +command -v jq >/dev/null 2>&1 && status="$?" || status="$?" +if [[ ! "${status}" = 0 ]]; then + printf "${COLOR_RED}Pre-flight check failed: \`jq\` command is missing${COLOR_DEFAULT}\n" >&2 + exit 1 +fi +printf "${COLOR_DEFAULT} jq | ${COLOR_GREEN}OK ${COLOR_DEFAULT}($(which jq))\n" + +# Check if aws command exists +command -v "${AWS_CLI_BIN}" >/dev/null 2>&1 && status="$?" || status="$?" +if [[ ! "${status}" = 0 ]]; then + printf "${COLOR_RED}Pre-flight check failed: \`${AWS_CLI_BIN}\` command is missing${COLOR_DEFAULT}\n" >&2 + exit 1 +fi +printf "${COLOR_DEFAULT} AWS CLI | ${COLOR_GREEN}OK ${COLOR_DEFAULT}($(which "${AWS_CLI_BIN}"))\n" + +# Find AWS region +REGION=$(${AWS_CLI_BIN} configure get region | tr -d "\r" || echo "") +export AWS_REGION=${AWS_REGION:-$REGION} +# Check region configuration in "source_profile" if the user uses MFA configurations +source_profile=$(${AWS_CLI_BIN} configure get source_profile || echo "") +if [ "${AWS_REGION}" = "" ] && [ "${source_profile}" != "" ]; then + region=$(${AWS_CLI_BIN} configure get region --profile "${source_profile}" || echo "") + export AWS_REGION="${region}" +fi +if [[ "${AWS_REGION}" = "" ]]; then + printf "${COLOR_RED}Pre-flight check failed: Missing AWS region. Use the \`aws configure set default.region\` command or set the \"AWS_REGION\" environment variable.${COLOR_DEFAULT}\n" >&2 + exit 1 +fi + +## 2. 
CHECK PREREQUISITES FOR USING ECS EXEC FEATURE VIA AWS CLI #########################
+printf "\n"
+printSectionHeaderLine
+printf "${COLOR_DEFAULT}Prerequisites for the AWS CLI to use ECS Exec\n"
+printSectionHeaderLine
+##########################################################################################
+
+# MFA
+AWS_MFA_SERIAL=${AWS_MFA_SERIAL:-$(${AWS_CLI_BIN} configure get mfa_serial || echo "")}
+ROLE_TO_BE_ASSUMED=$(${AWS_CLI_BIN} configure get role_arn || echo "")
+SOURCE_PROFILE=$(${AWS_CLI_BIN} configure get source_profile || echo "")
+# Normally we don't need to ask MFA code thanks to the AWS CLI
+# but we do need to prompt explicitly if the "AWS_MFA_SERIAL" value only exists without "role_arn" and "source_profile"
+if [ "${AWS_MFA_SERIAL}" != "" ] && [ "${ROLE_TO_BE_ASSUMED}" == "" ] && [ "${SOURCE_PROFILE}" == "" ]; then
+  # Prompt users to enter MFA code to obtain temporary credentials
+  mfa_code=""
+  while true; do
+    printf "\n"
+    printf "Type MFA code for ${AWS_MFA_SERIAL}: "
+    read -rs mfa_code
+    if [ -z "${mfa_code}" ]; then
+      printf "${COLOR_RED}MFA code cannot be empty${COLOR_DEFAULT}"
+      continue
+    fi
+    break
+  done
+
+  tmpCreds=$(${AWS_CLI_BIN} sts get-session-token --serial-number "${AWS_MFA_SERIAL}" --token-code "${mfa_code}")
+  accessKey=$( echo "${tmpCreds}" | jq -r .Credentials.AccessKeyId )
+  secretKey=$( echo "${tmpCreds}" | jq -r .Credentials.SecretAccessKey )
+  sessionToken=$( echo "${tmpCreds}" | jq -r .Credentials.SessionToken )
+  export AWS_ACCESS_KEY_ID="${accessKey}"
+  export AWS_SECRET_ACCESS_KEY="${secretKey}"
+  export AWS_SESSION_TOKEN="${sessionToken}"
+fi
+
+# Find caller identity
+callerIdentityJson=$(${AWS_CLI_BIN} sts get-caller-identity)
+ACCOUNT_ID=$(echo "${callerIdentityJson}" | jq -r ".Account")
+CALLER_IAM_ARN=$(echo "${callerIdentityJson}" | jq -r ".Arn")
+case "${CALLER_IAM_ARN}" in
+  *:user/*|*:role/*|*:group/* ) MY_IAM_ARN="${CALLER_IAM_ARN}";;
+  *:assumed-role/*) MY_IAM_ARN=$(getRoleArnForAssumedRole 
"${callerIdentityJson}");; + * ) printf "${COLOR_RED}Pre-flight check failed: The ARN \"${CALLER_IAM_ARN}\" associated with the caller(=you) is not supported. Try again either with one of an IAM user, an IAM role, or an assumed IAM role.${COLOR_DEFAULT}\n" >&2 && exit 1;; +esac +if [[ "${MY_IAM_ARN}" = "" ]]; then + printf "${COLOR_RED}Unknown error: Failed to get the role ARN of the caller(=you).${COLOR_DEFAULT}\n" >&2 + exit 1 +fi + +# Check task existence +describedTaskJson=$(${AWS_CLI_BIN} ecs describe-tasks \ + --cluster "${CLUSTER_NAME}" \ + --tasks "${TASK_ID}" \ + --output json) +existTask=$(echo "${describedTaskJson}" | jq -r ".tasks[0].taskDefinitionArn") +if [[ "${existTask}" = "null" ]]; then + printf "${COLOR_RED}Pre-flight check failed: The specified ECS task does not exist.\n\ +Make sure the parameters you have specified for cluster \"${CLUSTER_NAME}\" and task \"${TASK_ID}\" are both valid.${COLOR_DEFAULT}\n" + exit 1 +fi + +# Check whether the AWS CLI v1.19.28/v2.1.30 or later exists +executeCommandEnabled=$(echo "${describedTaskJson}" | jq -r ".tasks[0].enableExecuteCommand") +if [[ "${executeCommandEnabled}" = "null" ]]; then + printf "${COLOR_RED}Pre-flight check failed: ECS Exec requires the AWS CLI v1.19.28/v2.1.30 or later.\n\ +Please update the AWS CLI and try again?\n\ + For v2: https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html\n\ + For v1: https://docs.aws.amazon.com/cli/latest/userguide/install-cliv1.html${COLOR_DEFAULT}\n" + exit 1 +fi +awsCliVersion=$(${AWS_CLI_BIN} --version 2>&1 | tr -d "\r") +printf "${COLOR_DEFAULT} AWS CLI Version | ${COLOR_GREEN}OK ${COLOR_DEFAULT}(${awsCliVersion})\n" + +# Check whether the Session Manager plugin exists +printf "${COLOR_DEFAULT} Session Manager Plugin | " +command -v session-manager-plugin >/dev/null 2>&1 && status="$?" || status="$?" 
+if [[ "${status}" = 0 ]]; then + smpVersion=$(session-manager-plugin --version) + printf "${COLOR_GREEN}OK ${COLOR_DEFAULT}(${smpVersion})\n" +else + # https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html + printf "${COLOR_RED}Missing\n" +fi + +## 3. CHECK CLUSTER AND TASK CONFIGURATIONS ############################################## +printf "\n" +printSectionHeaderLine +printf "${COLOR_DEFAULT}Checks on ECS task and other resources\n" +printSectionHeaderLine +printf "${COLOR_DEFAULT}Region : ${AWS_REGION}\n" +printf "${COLOR_DEFAULT}Cluster: ${CLUSTER_NAME}\n" +printf "${COLOR_DEFAULT}Task : ${TASK_ID}\n" +printSectionHeaderLine +########################################################################################## + +# 1. Checks on the cluster configurations (yellow) +describedClusterJson=$(${AWS_CLI_BIN} ecs describe-clusters \ + --clusters "${CLUSTER_NAME}" \ + --include CONFIGURATIONS \ + --output json) +executeCommandConfigurationJson=$(echo "${describedClusterJson}" \ + | jq ".clusters[0].configuration.executeCommandConfiguration") + +printf "${COLOR_DEFAULT} Cluster Configuration |" + +kmsKeyId="null" +kmsKeyArn="null" +logging="null" +s3BucketName="null" +s3KeyPrefix="null" +s3Encryption="null" +cloudWatchLogGroupName="null" +cloudWatchLogEncryptionEnabled="null" +if [[ "${executeCommandConfigurationJson}" = "null" ]]; then + printf "${COLOR_YELLOW} Audit Logging Not Configured" +else + printf "\n" + + kmsKeyId=$(echo "${executeCommandConfigurationJson}" | jq -r ".kmsKeyId") + printf "${COLOR_DEFAULT} KMS Key : " + if [[ "${kmsKeyId}" = "null" ]]; then + printf "${COLOR_YELLOW}Not Configured" + else + printf "${kmsKeyId}" + kmsKeyArn=$(${AWS_CLI_BIN} kms describe-key --key-id "${kmsKeyId}" --query 'KeyMetadata.Arn' --output text) + fi + printf "\n" + + logging=$(echo "${executeCommandConfigurationJson}" | jq -r ".logging") + printf "${COLOR_DEFAULT} Audit Logging : " + if [[ "${logging}" = 
"null" ]]; then + printf "${COLOR_YELLOW}Not Configured" + elif [[ "${logging}" = "NONE" ]]; then + printf "${COLOR_YELLOW}Disabled" + else + printf "${logging}" + fi + printf "\n" + + s3BucketName=$(echo "${executeCommandConfigurationJson}" | jq -r ".logConfiguration.s3BucketName") + s3KeyPrefix=$(echo "${executeCommandConfigurationJson}" | jq -r ".logConfiguration.s3KeyPrefix") + s3Encryption=$(echo "${executeCommandConfigurationJson}" | jq -r ".logConfiguration.s3EncryptionEnabled") + printf "${COLOR_DEFAULT} S3 Bucket Name: " + if [[ "${s3BucketName}" = "null" ]]; then + printf "Not Configured" + else + printf "${s3BucketName}" + if [[ ! "${s3KeyPrefix}" = "null" ]]; then + printf ", Key Prefix: ${s3KeyPrefix}" + fi + printf ", Encryption Enabled: ${s3Encryption}" + fi + printf "\n" + + cloudWatchLogGroupName=$(echo "${executeCommandConfigurationJson}" | jq -r ".logConfiguration.cloudWatchLogGroupName") + cloudWatchLogEncryptionEnabled=$(echo "${executeCommandConfigurationJson}" | jq -r ".logConfiguration.cloudWatchEncryptionEnabled") + printf "${COLOR_DEFAULT} CW Log Group : " + if [[ "${cloudWatchLogGroupName}" = "null" ]]; then + printf "Not Configured" + else + printf "${cloudWatchLogGroupName}" + printf ", Encryption Enabled: ${cloudWatchLogEncryptionEnabled}" + fi +fi +printf "\n" + +# 2. Check whether "I" can call ecs:ExecuteCommand +printf "${COLOR_DEFAULT} Can I ExecuteCommand? | ${MY_IAM_ARN}\n" +ecsExecuteCommand="ecs:ExecuteCommand" +ecsExecEvalResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \ + --policy-source-arn "${MY_IAM_ARN}" \ + --action-names "${ecsExecuteCommand}" \ + --resource-arns "arn:aws:ecs:${AWS_REGION}:${ACCOUNT_ID}:task/${CLUSTER_NAME}/${TASK_ID}" \ + --output json \ + | jq -r ".EvaluationResults[0].EvalDecision") +showEvalResult "${ecsExecEvalResult}" "${ecsExecuteCommand}" +if [[ ! 
"${kmsKeyId}" = "null" ]]; then + kmsGenerateDataKey="kms:GenerateDataKey" + kmsGenerateDataKeyResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \ + --policy-source-arn "${MY_IAM_ARN}" \ + --action-names "${kmsGenerateDataKey}" \ + --resource-arns "${kmsKeyArn}" \ + --output json \ + | jq -r ".EvaluationResults[0].EvalDecision") + showEvalResult "${kmsGenerateDataKeyResult}" "${kmsGenerateDataKey}" +fi +## Check for ensuring "I cannot" call ssm:StartSession (yellow) +### See the "Limiting access to the Start Session action" section at https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-exec.html#ecs-exec-limit-access-start-session +ssmStartSession="ssm:StartSession" +printf "${COLOR_DEFAULT} ${ssmStartSession} denied?: " +ssmSessionEvalResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \ + --policy-source-arn "${MY_IAM_ARN}" \ + --action-names "${ssmStartSession}" \ + --resource-arns "arn:aws:ecs:${AWS_REGION}:${ACCOUNT_ID}:task/${CLUSTER_NAME}/${TASK_ID}" \ + --output json \ + | jq -r ".EvaluationResults[0].EvalDecision") +if [[ "${ssmSessionEvalResult}" = "allowed" ]]; then + printf "${COLOR_YELLOW}" +else + printf "${COLOR_GREEN}" +fi +printf "${ssmSessionEvalResult}\n" + +# 3. Check the task is in RUNNING state +printf "${COLOR_DEFAULT} Task Status | " +taskStatus=$(echo "${describedTaskJson}" | jq -r ".tasks[0].lastStatus") +stoppedReason=$(echo "${describedTaskJson}" | jq -r ".tasks[0].stoppedReason") +case "${taskStatus}" in + RUNNING ) printf "${COLOR_GREEN}${taskStatus}";; + PROVISIONING|ACTIVATING|PENDING ) printf "${COLOR_YELLOW}${taskStatus}";; + DEACTIVATING|STOPPING|DEPROVISIONING ) printf "${COLOR_RED}${taskStatus}";; + STOPPED ) printf "${COLOR_RED}${taskStatus} (${stoppedReason})";; + * ) printf "${COLOR_RED}${taskStatus}";; +esac +printf "${COLOR_DEFAULT}\n" + +# 4. 
Check the launch type, platform version, ecs-agent version +launchType=$(echo "${describedTaskJson}" | jq -r ".tasks[0].launchType") +describedContainerInstanceJson="" +printf "${COLOR_DEFAULT} Launch Type | " +if [[ "${launchType}" = "FARGATE" ]]; then # For FARGATE Launch Type + printf "${COLOR_GREEN}Fargate\n" + # Check the PV + printf "${COLOR_DEFAULT} Platform Version | " + + # Detect platform family to use correct platform version required + pf=$(echo "${describedTaskJson}" | jq -r ".tasks[0].platformFamily") + if [[ ${pf} == *"Windows"* ]]; then + requiredPV="1.0.0" #1.0.0 minimum for windows + else + requiredPV="1.4.0" #1.4.0 for others + fi + + pv=$(echo "${describedTaskJson}" | jq -r ".tasks[0].platformVersion") + if equalsOrGreaterVersion "${requiredPV}" "${pv}"; then + printf "${COLOR_GREEN}${pv}" + else + printf "${COLOR_RED}${pv} (Required: >= ${requiredPV})" + fi + printf "\n" +elif [[ "${launchType}" = "EC2" ]]; then # For EC2 Launch Type + printf "${COLOR_GREEN}EC2\n" + # Check the ECS-Agent version + containerInstanceArn=$(echo "${describedTaskJson}" | jq -r ".tasks[0].containerInstanceArn") + requiredAgentVersion="1.50.2" + describedContainerInstanceJson=$(${AWS_CLI_BIN} ecs describe-container-instances \ + --cluster "${CLUSTER_NAME}" \ + --container-instance "${containerInstanceArn}" \ + --output json) + agentVersion=$(echo "${describedContainerInstanceJson}" | jq -r ".containerInstances[0].versionInfo.agentVersion") + printf "${COLOR_DEFAULT} ECS Agent Version | " + if equalsOrGreaterVersion "${requiredAgentVersion}" "${agentVersion}"; then + printf "${COLOR_GREEN}${agentVersion}" + else + printf "${COLOR_RED}${agentVersion} (Required: >= ${requiredAgentVersion})" + fi + printf "\n" +else + printf "${COLOR_YELLOW}UNKNOWN\n" +fi + +# 5. 
Check whether the `execute-command` option is enabled for the task +printf "${COLOR_DEFAULT} Exec Enabled for Task | " +if [[ "${executeCommandEnabled}" = "true" ]]; then + printf "${COLOR_GREEN}OK" +else + printf "${COLOR_RED}NO" +fi +printf "${COLOR_DEFAULT}\n" + +# 6. Check the managed agents' status +printf "${COLOR_DEFAULT} Container-Level Checks | \n" +printf "${COLOR_DEFAULT} ----------\n" +printf "${COLOR_DEFAULT} Managed Agent Status" +if [[ "${executeCommandEnabled}" = "false" ]]; then + printf " - ${COLOR_YELLOW}SKIPPED\n" + printf "${COLOR_DEFAULT} ----------\n" +else + printf "\n" + printf "${COLOR_DEFAULT} ----------\n" + agentsStatus=$(echo "${describedTaskJson}" | jq -r ".tasks[0].containers[].managedAgents[].lastStatus") + idx=0 + for _ in $agentsStatus; do + containerName=$(echo "${describedTaskJson}" | jq -r ".tasks[0].containers[${idx}].name") + status=$(echo "${describedTaskJson}" | jq -r ".tasks[0].containers[${idx}].managedAgents[0].lastStatus") + reason=$(echo "${describedTaskJson}" | jq -r ".tasks[0].containers[${idx}].managedAgents[0].reason") + lastStartedAt=$(echo "${describedTaskJson}" | jq -r ".tasks[0].containers[${idx}].managedAgents[0].lastStartedAt") + printf " $((idx+1)). " + case "${status}" in + *STOPPED* ) printf "${COLOR_RED}STOPPED (Reason: ${reason})";; + *PENDING* ) printf "${COLOR_YELLOW}PENDING";; + * ) printf "${COLOR_GREEN}RUNNING";; + esac + printf "${COLOR_DEFAULT} for \"${containerName}\"" + if [[ "${status}" = "STOPPED" ]]; then + printf " - LastStartedAt: ${lastStartedAt}" + fi + printf "\n" + idx=$((idx+1)) + done +fi + +# 7. 
Check the "initProcessEnabled" flag added in the task definition (yellow) +taskDefArn=$(echo "${describedTaskJson}" | jq -r ".tasks[0].taskDefinitionArn") +taskDefJson=$(${AWS_CLI_BIN} ecs describe-task-definition \ + --task-definition "${taskDefArn}" \ + --output json) +taskDefFamily=$(echo "${taskDefJson}" | jq -r ".taskDefinition.family") +taskDefRevision=$(echo "${taskDefJson}" | jq -r ".taskDefinition.revision") +initEnabledList=$(echo "${taskDefJson}" | jq -r ".taskDefinition.containerDefinitions[].linuxParameters.initProcessEnabled") +idx=0 +printf "${COLOR_DEFAULT} ----------\n" +printf "${COLOR_DEFAULT} Init Process Enabled (${taskDefFamily}:${taskDefRevision})\n" +printf "${COLOR_DEFAULT} ----------\n" +for enabled in $initEnabledList; do + containerName=$(echo "${taskDefJson}" | jq -r ".taskDefinition.containerDefinitions[${idx}].name") + printf " $((idx+1)). " + case "${enabled}" in + *true* ) printf "${COLOR_GREEN}Enabled";; + *false* ) printf "${COLOR_YELLOW}Disabled";; + * ) printf "${COLOR_YELLOW}Disabled";; + esac + printf "${COLOR_DEFAULT} - \"${containerName}\"\n" + idx=$((idx+1)) +done + +# 8. Check the "readonlyRootFilesystem" flag added in the task definition (red) +readonlyRootFsList=$(echo "${taskDefJson}" | jq -r ".taskDefinition.containerDefinitions[].readonlyRootFilesystem") +idx=0 +printf "${COLOR_DEFAULT} ----------\n" +printf "${COLOR_DEFAULT} Read-Only Root Filesystem (${taskDefFamily}:${taskDefRevision})\n" +printf "${COLOR_DEFAULT} ----------\n" +for enabled in $readonlyRootFsList; do + containerName=$(echo "${taskDefJson}" | jq -r ".taskDefinition.containerDefinitions[${idx}].name") + printf " $((idx+1)). " + case "${enabled}" in + *false* ) printf "${COLOR_GREEN}Disabled";; + *true* ) printf "${COLOR_RED}ReadOnly";; + * ) printf "${COLOR_GREEN}Disabled";; + esac + printf "${COLOR_DEFAULT} - \"${containerName}\"\n" + idx=$((idx+1)) +done + +# 9. 
Check the task role permissions +overriddenTaskRole=true +taskRoleArn=$(echo "${describedTaskJson}" | jq -r ".tasks[0].overrides.taskRoleArn") +if [[ "${taskRoleArn}" = "null" ]]; then + overriddenTaskRole=false + taskRoleArn=$(echo "${taskDefJson}" | jq -r ".taskDefinition.taskRoleArn") +fi + +hasRole=true +isEC2Role=false +if [[ "${taskRoleArn}" = "null" ]]; then + ## When the task runs on EC2 without a task role then we should check the instance profile + if [[ "${launchType}" = "EC2" ]]; then + ec2InstanceId=$(echo "${describedContainerInstanceJson}" | jq -r ".containerInstances[0].ec2InstanceId") + instanceProfileArn=$(${AWS_CLI_BIN} ec2 describe-instances --instance-ids "${ec2InstanceId}" | jq -r ".Reservations[0].Instances[0].IamInstanceProfile.Arn") + if [[ "${instanceProfileArn}" = "null" ]]; then + hasRole=false + else + instanceProfileName=$(echo "${instanceProfileArn}" | sed 's/arn:aws:iam::.*:instance-profile\///g') + taskRoleArn=$(${AWS_CLI_BIN} iam get-instance-profile \ + --instance-profile-name "${instanceProfileName}" \ + | jq -r ".InstanceProfile.Roles[0].Arn") + if [[ "${taskRoleArn}" = "null" ]]; then + hasRole=false + else + isEC2Role=true + fi + fi + else + ## Fargate launch type doesn't support to use EC2 instance roles + hasRole=false + fi +fi + +if [[ ! 
"${hasRole}" = "true" ]]; then + printf "${COLOR_DEFAULT} EC2 or Task Role | ${COLOR_RED}Not Configured\n" +else + if [[ "${isEC2Role}" = "true" ]]; then + printf "${COLOR_DEFAULT} EC2 Role Permissions | " + else + printf "${COLOR_DEFAULT} Task Role Permissions | " + fi + printf "${taskRoleArn}" + if [[ "${overriddenTaskRole}" = "true" ]]; then + printf " (Overridden)" + fi + printf "\n" + ## Required Permissions + ### SSM + ssm="ssmmessages:" + ssmCreateControlChannel="${ssm}CreateControlChannel" + ssmCreateDataChannel="${ssm}CreateDataChannel" + ssmOpenControlChannel="${ssm}OpenControlChannel" + ssmOpenDataChannel="${ssm}OpenDataChannel" + + ssmEvalResultsJson=$(${AWS_CLI_BIN} iam simulate-principal-policy \ + --policy-source-arn "${taskRoleArn}" \ + --action-names "${ssmCreateControlChannel}" "${ssmCreateDataChannel}" "${ssmOpenControlChannel}" "${ssmOpenDataChannel}" \ + --output json) + ssmCreateControlChannelResult=$(readEvalDecision "${ssmEvalResultsJson}" "${ssmCreateControlChannel}") + showEvalResult "${ssmCreateControlChannelResult}" "${ssmCreateControlChannel}" + ssmCreateDataChannelResult=$(readEvalDecision "${ssmEvalResultsJson}" "${ssmCreateDataChannel}") + showEvalResult "${ssmCreateDataChannelResult}" "${ssmCreateDataChannel}" + ssmOpenControlChannelResult=$(readEvalDecision "${ssmEvalResultsJson}" "${ssmOpenControlChannel}") + showEvalResult "${ssmOpenControlChannelResult}" "${ssmOpenControlChannel}" + ssmOpenDataChannelResult=$(readEvalDecision "${ssmEvalResultsJson}" "${ssmOpenDataChannel}") + showEvalResult "${ssmOpenDataChannelResult}" "${ssmOpenDataChannel}" + + ## Optional Permissions (Might be required if audit-logging is enabled) + ### KMS + if [[ ! 
"${kmsKeyId}" = "null" ]]; then + printf "${COLOR_DEFAULT} -----\n" + kmsDecrypt="kms:Decrypt" + kmsEvalResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \ + --policy-source-arn "${taskRoleArn}" \ + --action-names "${kmsDecrypt}" \ + --resource-arns "${kmsKeyArn}" \ + --output json \ + | jq -r ".EvaluationResults[0].EvalDecision") + showEvalResult "${kmsEvalResult}" "${kmsDecrypt}" + fi + ### S3 Bucket + if [[ ! "${s3BucketName}" = "null" ]]; then + printf "${COLOR_DEFAULT} -----\n" + s3PutObject="s3:PutObject" + bucketArn="arn:aws:s3:::${s3BucketName}" + resourceArn="" + if [[ ! "${s3KeyPrefix}" = "null" ]]; then + resourceArn="${bucketArn}/${s3KeyPrefix}*" + else + resourceArn="${bucketArn}/*" + fi + s3EvalResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \ + --policy-source-arn "${taskRoleArn}" \ + --action-names "${s3PutObject}" \ + --resource-arns "${resourceArn}" \ + --output json \ + | jq -r ".EvaluationResults[0].EvalDecision") + showEvalResult "${s3EvalResult}" "${s3PutObject}" + if [[ "${s3Encryption}" = "true" ]]; then + s3GetEncryptionConfiguration="s3:GetEncryptionConfiguration" + s3EvalResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \ + --policy-source-arn "${taskRoleArn}" \ + --action-names "${s3GetEncryptionConfiguration}" \ + --resource-arns "${bucketArn}" \ + --output json \ + | jq -r ".EvaluationResults[0].EvalDecision") + showEvalResult "${s3EvalResult}" "${s3GetEncryptionConfiguration}" + fi + fi + ### CloudWatch Logs + if [[ ! 
"${cloudWatchLogGroupName}" = "null" ]]; then + printf "${COLOR_DEFAULT} -----\n" + # For Resource "*" + logsDescribeLogGroup="logs:DescribeLogGroups" + logsDescribeLogGroupEvalResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \ + --policy-source-arn "${taskRoleArn}" \ + --action-names "${logsDescribeLogGroup}" \ + --output json \ + | jq -r ".EvaluationResults[0].EvalDecision") + showEvalResult "${logsDescribeLogGroupEvalResult}" "${logsDescribeLogGroup}" + # For Resource "${cloudWatchLogGroupName}" + cwlogGroupArn="arn:aws:logs:${AWS_REGION}:${ACCOUNT_ID}:log-group:${cloudWatchLogGroupName}:*" + logsCreateLogStream="logs:CreateLogStream" + logsDescribeLogStreams="logs:DescribeLogStreams" + logsPutLogEvents="logs:PutLogEvents" + logsEvalResultsJson=$(${AWS_CLI_BIN} iam simulate-principal-policy \ + --policy-source-arn "${taskRoleArn}" \ + --action-names "${logsCreateLogStream}" "${logsDescribeLogStreams}" "${logsPutLogEvents}" \ + --resource-arns "${cwlogGroupArn}" \ + --output json) + logsCreateLogStreamResult=$(readEvalDecision "${logsEvalResultsJson}" "${logsCreateLogStream}") + showEvalResult "${logsCreateLogStreamResult}" "${logsCreateLogStream}" + logsDescribeLogStreamsResult=$(readEvalDecision "${logsEvalResultsJson}" "${logsDescribeLogStreams}") + showEvalResult "${logsDescribeLogStreamsResult}" "${logsDescribeLogStreams}" + logsPutLogEventsResult=$(readEvalDecision "${logsEvalResultsJson}" "${logsPutLogEvents}") + showEvalResult "${logsPutLogEventsResult}" "${logsPutLogEvents}" + fi +fi + +# 10. Check existing VPC Endpoints (PrivateLinks) in the task VPC. +# If there is any VPC Endpoints configured for the task VPC, we assume you would need an additional SSM PrivateLink to be configured. 
(yellow) +# TODO: In the ideal world, the script should simply check if the task can reach to the internet or not :) +requiredEndpoint="com.amazonaws.${AWS_REGION}.ssmmessages" +taskNetworkingAttachment=$(echo "${describedTaskJson}" | jq -r ".tasks[0].attachments[0]") +if [[ "${taskNetworkingAttachment}" = "null" ]]; then + ## bridge/host networking (only for EC2) + taskVpcId=$(echo "${describedContainerInstanceJson}" | jq -r ".containerInstances[0].attributes[] | select(.name==\"ecs.vpc-id\") | .value") + taskSubnetId=$(echo "${describedContainerInstanceJson}" | jq -r ".containerInstances[0].attributes[] | select(.name==\"ecs.subnet-id\") | .value") + subnetJson=$(${AWS_CLI_BIN} ec2 describe-subnets --subnet-ids "${taskSubnetId}") +else + ## awsvpc networking (for both EC2 and Fargate) + taskSubnetId=$(echo "${describedTaskJson}" | jq -r ".tasks[0].attachments[0].details[] | select(.name==\"subnetId\") | .value") + subnetJson=$(${AWS_CLI_BIN} ec2 describe-subnets --subnet-ids "${taskSubnetId}") + taskVpcId=$(echo "${subnetJson}" | jq -r ".Subnets[0].VpcId") +fi +## Obtain the ownerID of subnet's owner to check if the subnet is shared via AWS RAM (which check-ecs-exec.sh doesn't support today) +subnetOwnerId=$(echo "${subnetJson}" | jq -r ".Subnets[0].OwnerId") +printf "${COLOR_DEFAULT} VPC Endpoints | " +if [[ ! 
"${ACCOUNT_ID}" = "${subnetOwnerId}" ]]; then + ## Shared Subnets (VPC) are not supported in Amazon ECS Exec Checker + printf "${COLOR_RED}CHECK FAILED${COLOR_YELLOW}\n" + printf " Amazon ECS Exec Checker doesn't support VPC endpoint validation for AWS RAM shared VPC/subnets.\n" + printf " Check or contact your administrator to find if additional VPC endpoints are required by the following resources.\n" + printf " - Resources: ${taskVpcId} and ${taskSubnetId}\n" + printf " - VPC Endpoint: ${requiredEndpoint}${COLOR_DEFAULT}\n" +else + ## List Vpc Endpoints + vpcEndpointsJson=$(${AWS_CLI_BIN} ec2 describe-vpc-endpoints \ + --filters Name=vpc-id,Values="${taskVpcId}") + vpcEndpoints=$(echo "${vpcEndpointsJson}" | tr -d '\n' | jq -r ".VpcEndpoints[]") + if [[ "${vpcEndpoints}" = "" ]]; then + printf "${COLOR_GREEN}SKIPPED ${COLOR_DEFAULT}(${taskVpcId} - No additional VPC endpoints required)\n" + else + # Check whether an ssmmessages VPC endpoint exists + vpcEndpoints=$(echo "${vpcEndpointsJson}" | tr -d '\n' | jq -r ".VpcEndpoints[].ServiceName") + printf "\n" + ssmsessionVpcEndpointExists=false + for vpe in $vpcEndpoints; do + if [[ "${vpe}" = "${requiredEndpoint}" ]]; then + ssmsessionVpcEndpointExists=true + break + fi + done + + printf " Found existing endpoints for ${taskVpcId}:\n" + for vpe in $vpcEndpoints; do + if [[ "${vpe}" = "${requiredEndpoint}" ]]; then + printf " - ${COLOR_GREEN}${vpe}${COLOR_DEFAULT}\n" + else + printf " - ${COLOR_DEFAULT}${vpe}\n" + fi + done + if [[ "${ssmsessionVpcEndpointExists}" = "false" ]]; then + printf " SSM PrivateLink \"${COLOR_YELLOW}${requiredEndpoint}${COLOR_DEFAULT}\" not found. You must ensure your task has proper outbound internet connectivity." + fi + fi +fi + +# 11. 
Check task definition containers for environment variables AWS_ACCESS_KEY, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY
+# if AWS_ACCESS_KEY, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY are defined in a container, they will be used by the SSM service
+# if the key defined does not have required permissions, the execute-command will not work.
+containerNameList=$(echo "${taskDefJson}" | jq -r ".taskDefinition.containerDefinitions[].name")
+idx=0
+printf "${COLOR_DEFAULT} Environment Variables | (${taskDefFamily}:${taskDefRevision})\n"
+for containerName in $containerNameList; do
+ printf " ${COLOR_DEFAULT}$((idx+1)). container \"${containerName}\"\n"
+ # find AWS_ACCESS_KEY
+ printf " ${COLOR_DEFAULT}- AWS_ACCESS_KEY"
+ AWS_ACCESS_KEY_FOUND=$(echo "${taskDefJson}" | jq -r ".taskDefinition.containerDefinitions[${idx}].environment[] | select(.name==\"AWS_ACCESS_KEY\") | .name")
+ case "${AWS_ACCESS_KEY_FOUND}" in
+ *AWS_ACCESS_KEY* ) printf ": ${COLOR_YELLOW}defined${COLOR_DEFAULT}\n";;
+ * ) printf ": ${COLOR_GREEN}not defined${COLOR_DEFAULT}\n";;
+ esac
+ # find AWS_ACCESS_KEY_ID
+ printf " ${COLOR_DEFAULT}- AWS_ACCESS_KEY_ID"
+ AWS_ACCESS_KEY_ID_FOUND=$(echo "${taskDefJson}" | jq -r ".taskDefinition.containerDefinitions[${idx}].environment[] | select(.name==\"AWS_ACCESS_KEY_ID\") | .name")
+ case "${AWS_ACCESS_KEY_ID_FOUND}" in
+ *AWS_ACCESS_KEY_ID* ) printf ": ${COLOR_YELLOW}defined${COLOR_DEFAULT}\n";;
+ * ) printf ": ${COLOR_GREEN}not defined${COLOR_DEFAULT}\n";;
+ esac
+ # find AWS_SECRET_ACCESS_KEY
+ printf " ${COLOR_DEFAULT}- AWS_SECRET_ACCESS_KEY"
+ AWS_SECRET_ACCESS_KEY_FOUND=$(echo "${taskDefJson}" | jq -r ".taskDefinition.containerDefinitions[${idx}].environment[] | select(.name==\"AWS_SECRET_ACCESS_KEY\") | .name")
+ case "${AWS_SECRET_ACCESS_KEY_FOUND}" in
+ *AWS_SECRET_ACCESS_KEY* ) printf ": ${COLOR_YELLOW}defined${COLOR_DEFAULT}\n";;
+ * ) printf ": ${COLOR_GREEN}not defined${COLOR_DEFAULT}\n";;
+ esac
+ idx=$((idx+1))
+done
+
+printf "\n" From 
5d6add2058998d6a101590e8e8f106a1072fa714 Mon Sep 17 00:00:00 2001 From: Ashish Kumar Date: Fri, 1 Nov 2024 09:29:51 -0700 Subject: [PATCH 3/6] Updated README.md Updated instructions and successfully tested the code. --- neuron-problem-detector/ecs-npd-cdk/README.md | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/neuron-problem-detector/ecs-npd-cdk/README.md b/neuron-problem-detector/ecs-npd-cdk/README.md index 702392f..b7b4439 100644 --- a/neuron-problem-detector/ecs-npd-cdk/README.md +++ b/neuron-problem-detector/ecs-npd-cdk/README.md @@ -1,9 +1,12 @@ +# Overview -# Welcome to your CDK Python project! +This project contains CDK code to provision : -This is a blank project for CDK development with Python. +* An ECS Cluster and one Inf2.xlarge EC2 instance joining the cluster. +* An ECS Task Definition for Neruon Problem Detector and Recovery +* An ECS Service that run the containers as Daemon in all instances +* Related IAM roles and log groups -The `cdk.json` file tells the CDK Toolkit how to execute your app. This project is set up like a standard Python project. The initialization process also creates a virtualenv within this project, stored under the `.venv` @@ -12,9 +15,15 @@ directory. To create the virtualenv it assumes that there is a `python3` package. If for any reason the automatic creation of the virtualenv fails, you can create the virtualenv manually. +The `cdk.json` file tells the CDK Toolkit how to execute your app. + ## Pre-requisites -You will need `python3` and `cdk` utility installed on your machine. -To install `cdk` follow the instructions [here](https://docs.aws.amazon.com/cdk/v2/guide/getting_started.html) +Before you start, ensure that you have installed the latest version of the following tools on your machine: + +1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) +2. [aws cdk](https://docs.aws.amazon.com/cdk/v2/guide/getting_started.html) +3. 
[Session Manager Plugin](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html) + ## Environment Setup To manually create a virtualenv on MacOS and Linux: @@ -60,7 +69,7 @@ Deploy the stack in your AWS environment cdk deploy [--profile ] ``` - +## Optional To add additional dependencies, for example other CDK libraries, just add them to your `setup.py` file and rerun the `pip install -r requirements.txt` command. @@ -73,4 +82,4 @@ command. * `cdk diff` compare deployed stack with current state * `cdk docs` open CDK documentation -Enjoy! + From a7c38b9044b91a304fa309834ceac2de5213ab90 Mon Sep 17 00:00:00 2001 From: Ashish Kumar Date: Wed, 4 Dec 2024 09:18:38 -0800 Subject: [PATCH 4/6] Incorporated code review feedback --- .../ecs-npd-cdk/.gitignore | 1 + neuron-problem-detector/ecs-npd-cdk/README.md | 2 +- .../ecs-npd-cdk/neuron.yaml | 142 +++++++++--------- .../__pycache__/__init__.cpython-311.pyc | Bin 196 -> 0 bytes ...ron_problem_detector_stack.cpython-311.pyc | Bin 7409 -> 0 bytes .../ecs_task_definition.json | 11 +- .../neuron_problem_detector_stack.py | 54 ++++--- 7 files changed, 106 insertions(+), 104 deletions(-) create mode 100644 neuron-problem-detector/ecs-npd-cdk/.gitignore delete mode 100644 neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__pycache__/__init__.cpython-311.pyc delete mode 100644 neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__pycache__/neuron_problem_detector_stack.cpython-311.pyc diff --git a/neuron-problem-detector/ecs-npd-cdk/.gitignore b/neuron-problem-detector/ecs-npd-cdk/.gitignore new file mode 100644 index 0000000..c18dd8d --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/.gitignore @@ -0,0 +1 @@ +__pycache__/ diff --git a/neuron-problem-detector/ecs-npd-cdk/README.md b/neuron-problem-detector/ecs-npd-cdk/README.md index b7b4439..d13019e 100644 --- a/neuron-problem-detector/ecs-npd-cdk/README.md +++ 
b/neuron-problem-detector/ecs-npd-cdk/README.md @@ -3,7 +3,7 @@ This project contains CDK code to provision : * An ECS Cluster and one Inf2.xlarge EC2 instance joining the cluster. -* An ECS Task Definition for Neruon Problem Detector and Recovery +* An ECS Task Definition for Neuron Problem Detector and Recovery * An ECS Service that run the containers as Daemon in all instances * Related IAM roles and log groups diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron.yaml b/neuron-problem-detector/ecs-npd-cdk/neuron.yaml index ec64fb8..29b92a6 100644 --- a/neuron-problem-detector/ecs-npd-cdk/neuron.yaml +++ b/neuron-problem-detector/ecs-npd-cdk/neuron.yaml @@ -365,22 +365,22 @@ Resources: Type: AWS::ECS::Cluster Metadata: aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/Resource - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceSecurityGroupC637EF03: + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceSecurityGroupC637EF03: Type: AWS::EC2::SecurityGroup Properties: - GroupDescription: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/InstanceSecurityGroup + GroupDescription: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/InstanceSecurityGroup SecurityGroupEgress: - CidrIp: 0.0.0.0/0 Description: Allow all outbound traffic by default IpProtocol: "-1" Tags: - Key: Name - Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity VpcId: Ref: NeuronProblemDetectorVPC5F617726 Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/InstanceSecurityGroup/Resource - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRole4CDFA2E5: + aws:cdk:path: 
NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/InstanceSecurityGroup/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRole4CDFA2E5: Type: AWS::IAM::Role Properties: AssumeRolePolicyDocument: @@ -398,10 +398,10 @@ Resources: - :iam::aws:policy/AmazonSSMManagedInstanceCore Tags: - Key: Name - Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/InstanceRole/Resource - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRoleDefaultPolicy1F8A3A48: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/InstanceRole/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRoleDefaultPolicy1F8A3A48: Type: AWS::IAM::Policy Properties: PolicyDocument: @@ -434,26 +434,26 @@ Resources: Effect: Allow Resource: "*" Version: "2012-10-17" - PolicyName: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRoleDefaultPolicy1F8A3A48 + PolicyName: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRoleDefaultPolicy1F8A3A48 Roles: - - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRole4CDFA2E5 + - Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRole4CDFA2E5 Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/InstanceRole/DefaultPolicy/Resource - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceProfile11E4E5E2: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/InstanceRole/DefaultPolicy/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceProfile11E4E5E2: Type: 
AWS::IAM::InstanceProfile Properties: Roles: - - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRole4CDFA2E5 + - Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRole4CDFA2E5 Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/InstanceProfile - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLaunchTemplateF1F92126: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/InstanceProfile + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLaunchTemplateF1F92126: Type: AWS::EC2::LaunchTemplate Properties: LaunchTemplateData: IamInstanceProfile: Arn: Fn::GetAtt: - - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceProfile11E4E5E2 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceProfile11E4E5E2 - Arn ImageId: Ref: SsmParameterValueawsserviceecsoptimizedamiamazonlinux2infrecommendedimageidC96584B6F00A464EAD1953AFF4B05118Parameter @@ -462,17 +462,17 @@ Resources: Enabled: false SecurityGroupIds: - Fn::GetAtt: - - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceSecurityGroupC637EF03 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceSecurityGroupC637EF03 - GroupId TagSpecifications: - ResourceType: instance Tags: - Key: Name - Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LaunchTemplate + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LaunchTemplate - ResourceType: volume Tags: - Key: Name - Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LaunchTemplate + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LaunchTemplate UserData: Fn::Base64: Fn::Join: @@ -486,29 +486,29 @@ Resources: - ResourceType: launch-template Tags: - Key: Name - Value: 
NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LaunchTemplate + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LaunchTemplate DependsOn: - - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRoleDefaultPolicy1F8A3A48 - - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityInstanceRole4CDFA2E5 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRoleDefaultPolicy1F8A3A48 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRole4CDFA2E5 Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LaunchTemplate/Resource - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityASGDE9EB8FF: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LaunchTemplate/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityASGDE9EB8FF: Type: AWS::AutoScaling::AutoScalingGroup Properties: DesiredCapacity: "1" LaunchTemplate: LaunchTemplateId: - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLaunchTemplateF1F92126 + Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLaunchTemplateF1F92126 Version: Fn::GetAtt: - - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLaunchTemplateF1F92126 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLaunchTemplateF1F92126 - LatestVersionNumber MaxSize: "3" MinSize: "1" Tags: - Key: Name PropagateAtLaunch: true - Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity VPCZoneIdentifier: - Ref: NeuronProblemDetectorVPCPrivateSubnet1Subnet708A0901 - Ref: NeuronProblemDetectorVPCPrivateSubnet2Subnet3B7C3437 @@ -518,8 +518,8 @@ Resources: AutoScalingScheduledAction: IgnoreUnmodifiedGroupSizeProperties: true Metadata: 
- aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/ASG - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/ASG + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389: Type: AWS::IAM::Role Properties: AssumeRolePolicyDocument: @@ -537,10 +537,10 @@ Resources: - :iam::aws:policy/service-role/AWSLambdaBasicExecutionRole Tags: - Key: Name - Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/DrainECSHook/Function/ServiceRole/Resource - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRoleDefaultPolicy91C029B7: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/DrainECSHook/Function/ServiceRole/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRoleDefaultPolicy91C029B7: Type: AWS::IAM::Policy Properties: PolicyDocument: @@ -564,7 +564,7 @@ Resources: - ":" - Ref: AWS::AccountId - :autoScalingGroup:*:autoScalingGroupName/ - - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityASGDE9EB8FF + - Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityASGDE9EB8FF - Action: - ecs:DescribeContainerInstances - ecs:DescribeTasks @@ -588,12 +588,12 @@ Resources: - NeuronProblemDetectorClusterED21CFD2 - Arn Version: "2012-10-17" - PolicyName: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRoleDefaultPolicy91C029B7 + PolicyName: 
NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRoleDefaultPolicy91C029B7 Roles: - - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389 + - Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389 Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/DrainECSHook/Function/ServiceRole/DefaultPolicy/Resource - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunction1625CD7D: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/DrainECSHook/Function/ServiceRole/DefaultPolicy/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunction1625CD7D: Type: AWS::Lambda::Function Properties: Code: @@ -695,52 +695,52 @@ Resources: Handler: index.lambda_handler Role: Fn::GetAtt: - - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389 - Arn Runtime: python3.9 Tags: - Key: Name - Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity Timeout: 310 DependsOn: - - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRoleDefaultPolicy91C029B7 - - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRoleDefaultPolicy91C029B7 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389 Metadata: - aws:cdk:path: 
NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/DrainECSHook/Function/Resource - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionAllowInvokeNeuronProblemDetectorStackNeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicA5DCEF0A8A7A5064: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/DrainECSHook/Function/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionAllowInvokeNeuronProblemDetectorStackNeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicA5DCEF0A8A7A5064: Type: AWS::Lambda::Permission Properties: Action: lambda:InvokeFunction FunctionName: Fn::GetAtt: - - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunction1625CD7D + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunction1625CD7D - Arn Principal: sns.amazonaws.com SourceArn: - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 + Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/DrainECSHook/Function/AllowInvoke:NeuronProblemDetectorStackNeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicA5DCEF0A - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunctionTopicBAF651D7: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/DrainECSHook/Function/AllowInvoke:NeuronProblemDetectorStackNeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicA5DCEF0A + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionTopicBAF651D7: Type: AWS::SNS::Subscription Properties: Endpoint: Fn::GetAtt: - 
- NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityDrainECSHookFunction1625CD7D + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunction1625CD7D - Arn Protocol: lambda TopicArn: - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 + Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/DrainECSHook/Function/Topic/Resource - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/DrainECSHook/Function/Topic/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430: Type: AWS::SNS::Topic Properties: Tags: - Key: Name - Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LifecycleHookDrainHook/Topic/Resource - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LifecycleHookDrainHook/Topic/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48: Type: AWS::IAM::Role Properties: AssumeRolePolicyDocument: @@ -752,10 +752,10 @@ Resources: Version: "2012-10-17" Tags: - Key: Name - Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity Metadata: - aws:cdk:path: 
NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LifecycleHookDrainHook/Role/Resource - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRoleDefaultPolicy30C24756: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LifecycleHookDrainHook/Role/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRoleDefaultPolicy30C24756: Type: AWS::IAM::Policy Properties: PolicyDocument: @@ -763,32 +763,32 @@ Resources: - Action: sns:Publish Effect: Allow Resource: - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 + Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 Version: "2012-10-17" - PolicyName: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRoleDefaultPolicy30C24756 + PolicyName: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRoleDefaultPolicy30C24756 Roles: - - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48 + - Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48 Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LifecycleHookDrainHook/Role/DefaultPolicy/Resource - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookC7D53AF2: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LifecycleHookDrainHook/Role/DefaultPolicy/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookC7D53AF2: Type: AWS::AutoScaling::LifecycleHook Properties: AutoScalingGroupName: - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityASGDE9EB8FF + Ref: 
NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityASGDE9EB8FF DefaultResult: CONTINUE HeartbeatTimeout: 300 LifecycleTransition: autoscaling:EC2_INSTANCE_TERMINATING NotificationTargetARN: - Ref: NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 + Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 RoleARN: Fn::GetAtt: - - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48 - Arn DependsOn: - - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRoleDefaultPolicy30C24756 - - NeuronProblemDetectorClusterNeruonAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRoleDefaultPolicy30C24756 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48 Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeruonAutoScalingGroupCapacity/LifecycleHookDrainHook/Resource + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LifecycleHookDrainHook/Resource NeuronProblemDetectorTaskExecutionRole563D2650: Type: AWS::IAM::Role Properties: @@ -880,12 +880,12 @@ Resources: Properties: ContainerDefinitions: - Command: - - echo 
'{"plugin":"kmsg","logPath":"/dev/kmsg","lookback":"5m","bufferSize":10,"source":"kernel-monitor","conditions":[{"type":"NeuronHealth","reason":"NeuronHasNoError","message":"Neuronhasnoerror"}],"rules":[{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_SRAM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_NC_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_HBM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_DMA_ERROR","pattern":".*NEURON_HW_ERR=DMA_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_HANG_ON_COLLECTIVES","pattern":".*NEURON_HW_ERR=HANG_ON_COLLECTIVES.*"}]}' > /config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json + - echo '{"plugin":"kmsg","logPath":"/dev/kmsg","lookback":"5m","bufferSize":10,"source":"kernel-monitor","conditions":[{"type":"NeuronHealth","reason":"NeuronHasNoError","message":"Neuronhasnoerror"}],"rules":[{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_SRAM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_NC_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_HBM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_DMA_ERROR","pattern":".*NEURON_HW_ERR=DMA_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","pattern":".*NEURON_HW_ERR=HANG_ON_COLLECTIVES.*"}]}' > 
/config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json EntryPoint: - /bin/sh - -c Essential: true - Image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19 + Image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.20 LinuxParameters: Capabilities: {} Devices: @@ -919,7 +919,7 @@ Resources: - Name: ENABLE_RECOVERY Value: "true" Essential: true - Image: public.ecr.aws/neuron/neuron-node-recovery:1.2.0 + Image: public.ecr.aws/neuron/neuron-node-recovery:1.3.0 LogConfiguration: LogDriver: awslogs Options: diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__pycache__/__init__.cpython-311.pyc b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 6be5a9d166e3c7a55c95865450f4b1c782e569a8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 196 zcmZ3^%ge<81Zu%b=^*+sh=2h`DC095kTIPhg&~+hlhJP_LlF~@{~09tD_lRcIJKx) zKQXa5BQ>))Lq9LIv?xC>zMv>SDJL~IJ|(pzHMu0eNIx~XSU0aAMK?Jm8>gE1_{_Y_ rlK6PNg34bUHo5sJr8%i~MXW%3KrSif2NEBc85tQrFu;f+W}p}V1z0m7 diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__pycache__/neuron_problem_detector_stack.cpython-311.pyc b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__pycache__/neuron_problem_detector_stack.cpython-311.pyc deleted file mode 100644 index b151bd27e64c558a71073269052d786e62153489..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7409 zcmb_hO>7&-6<+=?e?|RMltleGmT8NmtjMvg)@fymj*Z%sAUScXAla^VN7PE(U3Pcr zNUQ|(P@pQ1q&>Ju3izNyp`@@5x%A#+kF-I+iva`#D01+PLS&%msc&|bH1$e+ zJ3nt`-n{qby^q5$!r=e~&)KEX0vD&Kf0Byzcp8Oo^HBJV;wj!yq(sNELv${?ME9~s z^e+2Edf6`qmV*w`?<|JI@N(EeIjMIj-t}9GcO%EXi=rOE$9$F}yl06@dY_SM(u+My z8p~~9AN!HYAa2SksAfT6MNE^fqFhjM8!546O;$O!C`g5*1N*bGq-si;)40Y0#>?BN 
zd^-V!-+}OmT6XZ%vXgfp7w<%F-o?9Dy}Q(^gZKP4wCvfXHXX}eA{dV6QCX3t1w~#d zB5@9Bh|^@n6w{+}atYxOXRXQzJoYL`D=Si-Yi<%2ZHG?~KHqNoDC$p+&p;)b1L}=u zAKs6w3bo$s*>-F@SDga2?fQ%3Q-_1nEF%E#dUO75Fa_^CaM@M6ma}R?6tI7Z4_nWZLYs=xE0~rhrK;&?vMBJG*8=h zMYLo*y#FQM03SFA?!&5&nk$1w`Ye9XNZ2aF9z(XCh}FY~?Xzkzs|UunS>?T`Rcd|n z5&PJJDckxR)xT8KKu|rWSg(w>RBJKN~iP> zTKxs*!LZ$Tb^^B55*8=yab({Ka>2)Zc7of%y&c$99EehwMjT) zweO8urPen;W}lZo!@mJ1}NCn{#|pyJ^*-@2pi`AGb;e z=k4}CxM0`7cJN?gA0pnPX4KmWZHM;GS>s#ZHmxSD9zMmV`Ls<3=dHf=3s%Y2Pm+}J ztf#EfL1rJ~_oHSEw`=JY+Ml$|{NJ>H(LQfO`%|F(H~DGdhuGi}|JMI&gM-U2Ss-lF zblTDpSp3SiXWP${JaKg&a@nJ1KHG`#6lC+6JdT~1&fi0bqsZk_mMyWIplv-PIry*W zcat=B0#pc#EVm{|h!I4#062my%{7KXO^{yHqC8Rs1@SLh!U8}YR^kw&Z2<`BRaKEu zk*W%^#FUUC3Pw@IV;l?f$dbmwN(vw*jsq^HIa-m62%kN1vv*l_Be#jTvIdhb0$-&U zacW9~W;M->}9yziD z75f#W%4G!^*@_nShwiR*&aho6kU`5UC;(l|Me2Jfx_{9b{(cK(ct@KI; z0!>LJKE+CWb2jY0I5{#xAZ@#sLwDva-d9 z!U_(9wMwietO!MLP#i&)eHp_M{KU9MvEtA0=zeBHR12U&|C+373=CYuk)|`5^oDySd;Hs4BbTv5}4oG~<0GGTR!Z{X^aTW9W1rmv(g zUaj~tD}t0!*DB5w$KC+gg}60&c>LF+rDC}tNTV~OMBLGdQTT$e0J1_ua}!3Zys-k` zFQEC72=$dR_<*t`RM6KU0NRd!6CCZWjU>`VP%e-Lt(CEzjzaAwg4%keq znLY_j3Sw262+?R~)x4Zj6k`Gr&K$USvvZAAB^eo=J0HN%auKPmQxl!C61Z6F;?2>o zSkhRKSz4UE!@Qf%&Mz+JvUg{%-_9|)#l`tWmQE$Yg0hsIK$U` zjfVpRH6*d{T|5PGB!xChvI6dWZIvym2>Q%r(&`puX_1Vx4)L{DNPu0C)^L!7F1cVN z@Cu!QrVIowNL3M-hAEJ|&PWi`NEXrHWI{%P&M?F@#EMKwL94jU!#)@V@vDHNkQd6E5Ld0>SqXp)sVkEyjVnD{AP~ia z-G&scVHk8^MQBo_A*^aLCl_&uEtNpDMvb--QDw4JUV&7bMx2s{6rM3tOyiZZC9Ro@ z>8bQ&B~jYa*1%XK*OoLj!>u81BeiDa<8)~Yx8?Fin1OgR|6XqK7uXH=FbGGjY}V9l z*wCycNz$wQ1a_tTl#p2<7_@a?fEyQ*UvUg{%ga)63sRD-F{_|2)3i#K@Q#>~77W*t zH|JQ$cG8lGN0M&PxQW?tn~~diuoHxJupajE1}p9{J&G}NAp*2ym?)XS9<>bd4$}}Wv|Bu*%?MI63k!X^__;ek~}78!CW2LAN| zPHmY^8!@@&WDZalXtvF)!$p<}Ww1AA03z&}y?uNB9*#8Nho!4v-h>r*2&(YS-dkb- zG~;mAh9+FO>=gFSA%X;P)O;^Au_$aLH#>(t_ZDy6&Ed9}VM36o5FiJPRdLwRr-fQ^ zBx@BL+BEiRK?)q2{YV7|n}ZVt)^fvXQZa3L91bs9=XH+Vf{yuoj=7i1{S3F0MMwYv zjg?Ap{vZe}K=qDU!!+FbndiuLH@kp?v!znA#l5C!B^=0cQEI}%&796u?*m6ZUcgbat}nPjG2cRH2a+!*0go*h^#6BeE<3lrPHxb 
zJiqsS?5okS&%5_RRr)ggf8{mbK);E!e_Z$@UmbYs?~ZEZ`s2v;TIBk!>zkgw2OE25 zs^@O}BU|lRc-*s4>siSxaBlhgHX%0F^M1^t0zLh6J(AE1UtAFk=4 zs2-X=?2XrYFH|RTk9%*_dT)F~Rv392efvrDZM`cC>$G>kI_({Gmpc)u`~5?KBMQhd zAyvBnc@NB0OH3N$#=eeS)??S}9@k(`ou-DzK1YZBiH9?_{tNZ6uf5}l0&+~q^C;Ec z1zXVvr>g^(^r6ufPM@1_0y!q+dBEblTphUbD$e>~XDm{8brT7J91}t$OjG_)jqd+C zn$e>(AX|3?WQ+Cx5gm5+Jea96NJ-BJ;=)y}n(g!E1>DlVQ z^}65F4FiE36SC`vZS^N=@d;yCQtzBN>>GGERO_46qdj`>pkbg3U`@S0t@jMoI|KbN z9LO;tyKT?oRDAGZT<;mt`_IDAT3@QxH}$t{ZTe=*M06di~s_Ctz=)}6$sIt`y1+ncM#l65!L{(gJip^cAW&?6e$dQXC76Vc@ zNEekzJsq5RGB~4$dhC0-CQS?_|Xy7 zkpP*1RO$Hhb5w8qfn?YwqjwGI0~hqpUVUsF9Ilr*TraQ!*$tAvZy}se@SPd*I>Azq z*8>dsz6{^Opd4bDUzORS*`ttaTI>RpXGAL;ZUj&c6^jg$bSR&Y&_u3yeyV&5R3SfL z7=+wCs|*l`S=L|O{h`wn_*|${o>zJC~1{$Q<`PqvvqltEBd$kmeq5tCAW zP6)X>G%$-oZp9Rq5OS-jY!O0!QNh8MTNNAtByIkbrWz_Rw)9uZT#Rh=-}H}7jlHe> z8LC7B>K8!jE{DTWclsU9I`#TU7v&hKQp2tPb?+~o4)Dj5>0bAd(Q|xCzp;`307XvW A&;S4c diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json index ee4eeab..c6175da 100644 --- a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json +++ b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json @@ -3,7 +3,7 @@ "containerDefinitions": [ { "name": "npd", - "image": "registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19", + "image": "registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.20", "cpu": 0, "portMappings": [ { @@ -20,7 +20,7 @@ "-c" ], "command": [ - "echo 
'{\"plugin\":\"kmsg\",\"logPath\":\"/dev/kmsg\",\"lookback\":\"5m\",\"bufferSize\":10,\"source\":\"kernel-monitor\",\"conditions\":[{\"type\":\"NeuronHealth\",\"reason\":\"NeuronHasNoError\",\"message\":\"Neuronhasnoerror\"}],\"rules\":[{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_SRAM_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_NC_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_HBM_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_DMA_ERROR\",\"pattern\":\".*NEURON_HW_ERR=DMA_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_HANG_ON_COLLECTIVES\",\"pattern\":\".*NEURON_HW_ERR=HANG_ON_COLLECTIVES.*\"}]}' > /config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json" + "echo 
'{\"plugin\":\"kmsg\",\"logPath\":\"/dev/kmsg\",\"lookback\":\"5m\",\"bufferSize\":10,\"source\":\"kernel-monitor\",\"conditions\":[{\"type\":\"NeuronHealth\",\"reason\":\"NeuronHasNoError\",\"message\":\"Neuronhasnoerror\"}],\"rules\":[{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_SRAM_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_NC_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_HBM_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_DMA_ERROR\",\"pattern\":\".*NEURON_HW_ERR=DMA_ERROR.*\"}]}' > /config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json" ], "environment": [], "mountPoints": [], @@ -52,7 +52,7 @@ }, { "name": "recovery", - "image": "public.ecr.aws/neuron/neuron-node-recovery:1.1.0", + "image": "public.ecr.aws/neuron/neuron-node-recovery:1.3.0", "cpu": 0, "portMappings": [], "essential": true, @@ -84,9 +84,6 @@ "systemControls": [] } ], - "executionRoleArn": "arn:aws:iam::367244320406:role/ecsTaskExecutionRole", - "taskRoleArn": "arn:aws:iam::367244320406:role/ecsTaskExecutionRole", - "networkMode": "awsvpc", "requiresCompatibilities": [ "EC2" ], @@ -96,4 +93,4 @@ "cpuArchitecture": "X86_64", "operatingSystemFamily": "LINUX" } -} \ No newline at end of file +} diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py index 5d46de8..1b3bd77 100644 --- 
a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py +++ b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py @@ -9,19 +9,23 @@ aws_autoscaling as autoscaling, ) from constructs import Construct +import json + class NeuronProblemDetectorStack(Stack): def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: super().__init__(scope, construct_id, **kwargs) - + with open('ecs_task_definition.json', 'r') as f: + ecs_task_definition = json.load(f) + vpc = ec2.Vpc(self, "NeuronProblemDetectorVPC", max_azs=2) ecs_cluster = ecs.Cluster(self, "NeuronProblemDetectorCluster", vpc=vpc) ecs_cluster.add_capacity( - id="NeruonAutoScalingGroupCapacity", + id="NeuronAutoScalingGroupCapacity", machine_image=ecs.EcsOptimizedImage.amazon_linux2( ecs.AmiHardwareType.NEURON ), @@ -91,8 +95,8 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: "NeuronNpdAndRecoveryTaskDef", family="neuron-npd-and-recovery", network_mode=ecs.NetworkMode.AWS_VPC, - cpu="1024", - memory_mib="3072", + cpu=ecs_task_definition["cpu"], + memory_mib=ecs_task_definition["memory"], compatibility=ecs.Compatibility.EC2, execution_role=task_execution_role, task_role=task_role @@ -100,8 +104,8 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: # Create the device mapping device_mapping = ecs.Device( - host_path="/dev/kmsg", - container_path="/dev/kmsg", + host_path=ecs_task_definition["containerDefinitions"][0]["linuxParameters"]["devices"][0]["hostPath"], + container_path=ecs_task_definition["containerDefinitions"][0]["linuxParameters"]["devices"][0]["containerPath"], permissions=[ecs.DevicePermission.READ, ecs.DevicePermission.WRITE], ) @@ -113,21 +117,19 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: linux_parameters.add_devices(device_mapping) npd_container = task_definition.add_container( - "npd", + 
ecs_task_definition["containerDefinitions"][0]["name"], image=ecs.ContainerImage.from_registry( - "registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19" + ecs_task_definition["containerDefinitions"][0]["image"] ), - entry_point=["/bin/sh", "-c"], - command=[ - 'echo \'{"plugin":"kmsg","logPath":"/dev/kmsg","lookback":"5m","bufferSize":10,"source":"kernel-monitor","conditions":[{"type":"NeuronHealth","reason":"NeuronHasNoError","message":"Neuronhasnoerror"}],"rules":[{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_SRAM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_NC_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_HBM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_DMA_ERROR","pattern":".*NEURON_HW_ERR=DMA_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_HANG_ON_COLLECTIVES","pattern":".*NEURON_HW_ERR=HANG_ON_COLLECTIVES.*"}]}\' > /config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json' - ], + entry_point=ecs_task_definition["containerDefinitions"][0]["entrypoint"], + command=ecs_task_definition["containerDefinitions"][0]["command"], privileged=True, logging=ecs.AwsLogDriver( - stream_prefix="ecs", + stream_prefix=ecs_task_definition["containerDefinitions"][0]["logConfiguration"]["options"]["awslogs-stream-prefix"], log_group=logs.LogGroup( self, "NpdLogGroup", - log_group_name="/ecs/npd", + log_group_name=ecs_task_definition["containerDefinitions"][0]["logConfiguration"]["options"]["awslogs-group"], retention=logs.RetentionDays.ONE_WEEK, ), ), @@ -136,29 +138,31 @@ def __init__(self, scope: 
Construct, construct_id: str, **kwargs) -> None: npd_container.add_port_mappings( ecs.PortMapping( - name="npd-80-tcp", - container_port=80, - host_port=80, + name=ecs_task_definition["containerDefinitions"][0]["portMappings"][0]["name"], + container_port=ecs_task_definition["containerDefinitions"][0]["portMappings"][0]["containerPort"], + host_port=ecs_task_definition["containerDefinitions"][0]["portMappings"][0]["hostPort"], protocol=ecs.Protocol.TCP, app_protocol=ecs.AppProtocol.http, ) ) recovery_container = task_definition.add_container( - "recovery", + ecs_task_definition["containerDefinitions"][1]["name"], image=ecs.ContainerImage.from_registry( - "public.ecr.aws/neuron/neuron-node-recovery:1.2.0" + ecs_task_definition["containerDefinitions"][1]["image"] ), - entry_point=["/bin/sh", "-c"], - command=["python scripts/check-health.py"], - environment={"ENABLE_RECOVERY": "true"}, - readonly_root_filesystem=True, + entry_point=ecs_task_definition["containerDefinitions"][1]["entryPoint"], + command=ecs_task_definition["containerDefinitions"][1]["command"], + environment={ + ecs_task_definition["containerDefinitions"][1]["environment"][0]["name"]: ecs_task_definition["containerDefinitions"][1]["environment"][0]["value"] + }, + readonly_root_filesystem=ecs_task_definition["containerDefinitions"][1]["readonlyRootFilesystem"], logging=ecs.AwsLogDriver( - stream_prefix="ecs", + stream_prefix=ecs_task_definition["containerDefinitions"][1]["logConfiguration"]["options"]["awslogs-stream-prefix"], log_group=logs.LogGroup( self, "RecoveryLogGroup", - log_group_name="/ecs/recovery", + log_group_name=ecs_task_definition["containerDefinitions"][1]["logConfiguration"]["options"]["awslogs-group"], retention=logs.RetentionDays.ONE_WEEK, ), ), From 56e127886e368b4309d0a84fe45ce4f845dde988 Mon Sep 17 00:00:00 2001 From: Ashish Kumar Date: Wed, 4 Dec 2024 11:10:49 -0800 Subject: [PATCH 5/6] removed duplicate container defintion from neuron.yaml --- 
neuron-problem-detector/ecs-npd-cdk/.gitignore | 1 + neuron-problem-detector/ecs-npd-cdk/README.md | 17 ++++++++++++++++- neuron-problem-detector/ecs-npd-cdk/neuron.yaml | 3 --- .../ecs_task_definition.json | 15 ++------------- .../neuron_problem_detector_stack.py | 4 ++-- .../ecs-npd-cdk/requirements.txt | 2 +- 6 files changed, 22 insertions(+), 20 deletions(-) diff --git a/neuron-problem-detector/ecs-npd-cdk/.gitignore b/neuron-problem-detector/ecs-npd-cdk/.gitignore index c18dd8d..7039d49 100644 --- a/neuron-problem-detector/ecs-npd-cdk/.gitignore +++ b/neuron-problem-detector/ecs-npd-cdk/.gitignore @@ -1 +1,2 @@ __pycache__/ +cdk.out/ diff --git a/neuron-problem-detector/ecs-npd-cdk/README.md b/neuron-problem-detector/ecs-npd-cdk/README.md index d13019e..3d9ca66 100644 --- a/neuron-problem-detector/ecs-npd-cdk/README.md +++ b/neuron-problem-detector/ecs-npd-cdk/README.md @@ -52,12 +52,12 @@ $ pip install -r requirements.txt ``` ## Synthesize CloudFormation template +It is assumed that you have authenticated successfully to connect to your AWS environment. At this point you can now synthesize the CloudFormation template for this code. ``` $ cdk synth ``` -It is assumed that you have authenticated successfully to connect to your AWS environment. Perform bootstrap function with the following command. 
``` @@ -69,6 +69,21 @@ Deploy the stack in your AWS environment cdk deploy [--profile ] ``` +## Cleanup Instructions + +Destroy the stack in your AWS environment + +``` +cdk destroy [--profile ] +``` + +Delete the following log groups in cloudwatch + +``` +/ecs/recovery +/ecs/npd +``` + ## Optional To add additional dependencies, for example other CDK libraries, just add them to your `setup.py` file and rerun the `pip install -r requirements.txt` diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron.yaml b/neuron-problem-detector/ecs-npd-cdk/neuron.yaml index 29b92a6..75fd7c0 100644 --- a/neuron-problem-detector/ecs-npd-cdk/neuron.yaml +++ b/neuron-problem-detector/ecs-npd-cdk/neuron.yaml @@ -875,7 +875,6 @@ Resources: - Ref: NeuronProblemDetectorTaskRole673752FB Metadata: aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorTaskRole/DefaultPolicy/Resource - NeuronNpdAndRecoveryTaskDef7591F251: Type: AWS::ECS::TaskDefinition Properties: ContainerDefinitions: @@ -986,8 +985,6 @@ Resources: - Ref: NeuronProblemDetectorVPCPrivateSubnet1Subnet708A0901 - Ref: NeuronProblemDetectorVPCPrivateSubnet2Subnet3B7C3437 SchedulingStrategy: DAEMON - TaskDefinition: - Ref: NeuronNpdAndRecoveryTaskDef7591F251 DependsOn: - NeuronProblemDetectorTaskRoleDefaultPolicyCFCDEF04 - NeuronProblemDetectorTaskRole673752FB diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json index c6175da..a6a52cc 100644 --- a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json +++ b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json @@ -9,9 +9,7 @@ { "name": "npd-80-tcp", "containerPort": 80, - "hostPort": 80, - "protocol": "tcp", - "appProtocol": "http" + "hostPort": 80 } ], "essential": true, @@ -43,7 +41,6 @@ "options": { "awslogs-group": "/ecs/npd", "awslogs-create-group": "true", - 
"awslogs-region": "us-west-2", "awslogs-stream-prefix": "ecs" }, "secretOptions": [] @@ -77,20 +74,12 @@ "options": { "awslogs-create-group": "true", "awslogs-group": "/ecs/recovery", - "awslogs-region": "us-west-2", "awslogs-stream-prefix": "ecs" } }, "systemControls": [] } ], - "requiresCompatibilities": [ - "EC2" - ], "cpu": "1024", - "memory": "3072", - "runtimePlatform": { - "cpuArchitecture": "X86_64", - "operatingSystemFamily": "LINUX" - } + "memory": "3072" } diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py index 1b3bd77..09a1995 100644 --- a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py +++ b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py @@ -17,7 +17,7 @@ class NeuronProblemDetectorStack(Stack): def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: super().__init__(scope, construct_id, **kwargs) - with open('ecs_task_definition.json', 'r') as f: + with open('neuron_problem_detector/ecs_task_definition.json', 'r') as f: ecs_task_definition = json.load(f) vpc = ec2.Vpc(self, "NeuronProblemDetectorVPC", max_azs=2) @@ -121,7 +121,7 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: image=ecs.ContainerImage.from_registry( ecs_task_definition["containerDefinitions"][0]["image"] ), - entry_point=ecs_task_definition["containerDefinitions"][0]["entrypoint"], + entry_point=ecs_task_definition["containerDefinitions"][0]["entryPoint"], command=ecs_task_definition["containerDefinitions"][0]["command"], privileged=True, logging=ecs.AwsLogDriver( diff --git a/neuron-problem-detector/ecs-npd-cdk/requirements.txt b/neuron-problem-detector/ecs-npd-cdk/requirements.txt index d5307a6..54af265 100644 --- a/neuron-problem-detector/ecs-npd-cdk/requirements.txt +++ 
b/neuron-problem-detector/ecs-npd-cdk/requirements.txt @@ -1,2 +1,2 @@ -aws-cdk-lib==2.152.0 +aws-cdk-lib>=2.152.0 constructs>=10.0.0,<11.0.0 From 8281eeb1b0b6165a5539e37912afcd123b983c9b Mon Sep 17 00:00:00 2001 From: Ashish Kumar Date: Wed, 4 Dec 2024 17:08:33 -0800 Subject: [PATCH 6/6] removed redundant code --- .../ecs-npd-cdk/neuron.yaml | 70 ------------------- 1 file changed, 70 deletions(-) diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron.yaml b/neuron-problem-detector/ecs-npd-cdk/neuron.yaml index 75fd7c0..fe40b03 100644 --- a/neuron-problem-detector/ecs-npd-cdk/neuron.yaml +++ b/neuron-problem-detector/ecs-npd-cdk/neuron.yaml @@ -875,76 +875,6 @@ Resources: - Ref: NeuronProblemDetectorTaskRole673752FB Metadata: aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorTaskRole/DefaultPolicy/Resource - Type: AWS::ECS::TaskDefinition - Properties: - ContainerDefinitions: - - Command: - - echo '{"plugin":"kmsg","logPath":"/dev/kmsg","lookback":"5m","bufferSize":10,"source":"kernel-monitor","conditions":[{"type":"NeuronHealth","reason":"NeuronHasNoError","message":"Neuronhasnoerror"}],"rules":[{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_SRAM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_NC_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_HBM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_DMA_ERROR","pattern":".*NEURON_HW_ERR=DMA_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","pattern":".*NEURON_HW_ERR=HANG_ON_COLLECTIVES.*"}]}' > /config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json - EntryPoint: - - /bin/sh - - 
-c - Essential: true - Image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.20 - LinuxParameters: - Capabilities: {} - Devices: - - ContainerPath: /dev/kmsg - HostPath: /dev/kmsg - Permissions: - - read - - write - LogConfiguration: - LogDriver: awslogs - Options: - awslogs-group: - Ref: NpdLogGroup39A02E3D - awslogs-stream-prefix: ecs - awslogs-region: - Ref: AWS::Region - Name: npd - PortMappings: - - AppProtocol: http - ContainerPort: 80 - HostPort: 80 - Name: npd-80-tcp - Protocol: tcp - Privileged: true - - Command: - - python scripts/check-health.py - EntryPoint: - - /bin/sh - - -c - Environment: - - Name: ENABLE_RECOVERY - Value: "true" - Essential: true - Image: public.ecr.aws/neuron/neuron-node-recovery:1.3.0 - LogConfiguration: - LogDriver: awslogs - Options: - awslogs-group: - Ref: RecoveryLogGroupF6D50671 - awslogs-stream-prefix: ecs - awslogs-region: - Ref: AWS::Region - Name: recovery - ReadonlyRootFilesystem: true - Cpu: "1024" - ExecutionRoleArn: - Fn::GetAtt: - - NeuronProblemDetectorTaskExecutionRole563D2650 - - Arn - Family: neuron-npd-and-recovery - Memory: "3072" - NetworkMode: awsvpc - RequiresCompatibilities: - - EC2 - TaskRoleArn: - Fn::GetAtt: - - NeuronProblemDetectorTaskRole673752FB - - Arn - Metadata: - aws:cdk:path: NeuronProblemDetectorStack/NeuronNpdAndRecoveryTaskDef/Resource NpdLogGroup39A02E3D: Type: AWS::Logs::LogGroup Properties: