diff --git a/goss.yaml b/goss.yaml index ab9545768..c5c2165d6 100644 --- a/goss.yaml +++ b/goss.yaml @@ -44,10 +44,6 @@ service: enabled: true running: true - lifecycled: - enabled: true - running: true - sshd: enabled: true running: true @@ -93,9 +89,6 @@ process: buildkite-agent: running: true - lifecycled: - running: true - sshd: running: true diff --git a/packer/linux/buildkite-ami.json b/packer/linux/buildkite-ami.json index 8a262784b..da485b03e 100644 --- a/packer/linux/buildkite-ami.json +++ b/packer/linux/buildkite-ami.json @@ -45,10 +45,6 @@ "type": "shell", "script": "scripts/install-cloudwatch-agent.sh" }, - { - "type": "shell", - "script": "scripts/install-lifecycled.sh" - }, { "type": "shell", "script": "scripts/install-docker.sh" diff --git a/packer/linux/conf/bin/bk-install-elastic-stack.sh b/packer/linux/conf/bin/bk-install-elastic-stack.sh index 67357aa46..a95a1c391 100755 --- a/packer/linux/conf/bin/bk-install-elastic-stack.sh +++ b/packer/linux/conf/bin/bk-install-elastic-stack.sh @@ -176,7 +176,6 @@ experiment="${BUILDKITE_AGENT_EXPERIMENTS}" priority=%n spawn=${BUILDKITE_AGENTS_PER_INSTANCE} no-color=true -disconnect-after-idle-timeout=${BUILDKITE_SCALE_IN_IDLE_PERIOD} disconnect-after-job=${BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB} EOF @@ -203,15 +202,6 @@ if [[ -n "${BUILDKITE_ELASTIC_BOOTSTRAP_SCRIPT}" ]] ; then rm /tmp/elastic_bootstrap fi -cat << EOF > /etc/lifecycled -AWS_REGION=${AWS_REGION} -LIFECYCLED_HANDLER=/usr/local/bin/stop-agent-gracefully -LIFECYCLED_CLOUDWATCH_GROUP=/buildkite/lifecycled -EOF - -systemctl enable lifecycled.service -systemctl start lifecycled - # wait for docker to start next_wait_time=0 until docker ps || [ $next_wait_time -eq 5 ]; do @@ -224,7 +214,6 @@ if ! 
docker ps ; then fi systemctl enable "buildkite-agent" -systemctl start "buildkite-agent" # let the stack know that this host has been initialized successfully /opt/aws/bin/cfn-signal \ diff --git a/packer/linux/conf/buildkite-agent/systemd/buildkite-agent.service b/packer/linux/conf/buildkite-agent/systemd/buildkite-agent.service index 3fbd9a276..510e538a6 100644 --- a/packer/linux/conf/buildkite-agent/systemd/buildkite-agent.service +++ b/packer/linux/conf/buildkite-agent/systemd/buildkite-agent.service @@ -12,12 +12,11 @@ Environment="HOME=/var/lib/buildkite-agent" Environment="PATH=/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin" Environment="USER=buildkite-agent" ExecStart=/usr/bin/buildkite-agent start -ExecStopPost=/usr/local/bin/terminate-instance RestartSec=5 -Restart=on-failure +Restart=always RestartForceExitStatus=SIGPIPE TimeoutStartSec=10 -TimeoutStopSec=0 +TimeoutStopSec=3600 KillMode=process [Install] diff --git a/packer/linux/scripts/install-lifecycled.sh b/packer/linux/scripts/install-lifecycled.sh deleted file mode 100755 index 890334e6c..000000000 --- a/packer/linux/scripts/install-lifecycled.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -set -eu -o pipefail - -LIFECYCLED_VERSION=v3.2.0 - -MACHINE=$(uname -m) - -case "${MACHINE}" in - x86_64) ARCH=amd64;; - aarch64) ARCH=arm64;; - *) ARCH=unknown;; -esac - -echo "Installing lifecycled ${LIFECYCLED_VERSION}..." 
- -sudo touch /etc/lifecycled -sudo curl -Lf -o /usr/bin/lifecycled \ - https://github.com/buildkite/lifecycled/releases/download/${LIFECYCLED_VERSION}/lifecycled-linux-${ARCH} -sudo chmod +x /usr/bin/lifecycled -sudo curl -Lf -o /etc/systemd/system/lifecycled.service \ - https://raw.githubusercontent.com/buildkite/lifecycled/${LIFECYCLED_VERSION}/init/systemd/lifecycled.unit - diff --git a/packer/windows/buildkite-ami.json b/packer/windows/buildkite-ami.json index 4c74dbba4..ca480d1eb 100644 --- a/packer/windows/buildkite-ami.json +++ b/packer/windows/buildkite-ami.json @@ -44,10 +44,6 @@ "type": "powershell", "script": "scripts/install-cloudwatch-agent.ps1" }, - { - "type": "powershell", - "script": "scripts/install-lifecycled.ps1" - }, { "type": "powershell", "script": "scripts/install-docker.ps1" diff --git a/packer/windows/conf/bin/bk-install-elastic-stack.ps1 b/packer/windows/conf/bin/bk-install-elastic-stack.ps1 index b536f1eca..140a0523f 100755 --- a/packer/windows/conf/bin/bk-install-elastic-stack.ps1 +++ b/packer/windows/conf/bin/bk-install-elastic-stack.ps1 @@ -135,15 +135,10 @@ priority=%n spawn=${Env:BUILDKITE_AGENTS_PER_INSTANCE} no-color=true shell=powershell -disconnect-after-idle-timeout=${Env:BUILDKITE_SCALE_IN_IDLE_PERIOD} disconnect-after-job=${Env:BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB} "@ $OFS=" " -nssm set lifecycled AppEnvironmentExtra :AWS_REGION=$Env:AWS_REGION -nssm set lifecycled AppEnvironmentExtra +LIFECYCLED_HANDLER="C:\buildkite-agent\bin\stop-agent-gracefully.ps1" -Restart-Service lifecycled - # wait for docker to start $next_wait_time=0 do { @@ -212,13 +207,9 @@ nssm set buildkite-agent AppEnvironmentExtra :HOME=C:\buildkite-agent If ($lastexitcode -ne 0) { Exit $lastexitcode } nssm set buildkite-agent AppExit Default Restart If ($lastexitcode -ne 0) { Exit $lastexitcode } -nssm set buildkite-agent AppRestartDelay 10000 -If ($lastexitcode -ne 0) { Exit $lastexitcode } -nssm set buildkite-agent AppEvents Exit/Post "powershell 
C:\buildkite-agent\bin\terminate-instance.ps1" +nssm set buildkite-agent AppRestartDelay 5000 If ($lastexitcode -ne 0) { Exit $lastexitcode } -Restart-Service buildkite-agent - # renable debug tracing Set-PSDebug -Trace 2 diff --git a/packer/windows/scripts/install-lifecycled.ps1 b/packer/windows/scripts/install-lifecycled.ps1 deleted file mode 100755 index f87adbbb6..000000000 --- a/packer/windows/scripts/install-lifecycled.ps1 +++ /dev/null @@ -1,19 +0,0 @@ -# Stop script execution when a non-terminating error occurs -$ErrorActionPreference = "Stop" - -$lifecycled_version = "v3.2.0" - -Write-Output "Installing lifecycled ${lifecycled_version}..." - -New-Item -ItemType directory -Path C:\lifecycled\bin - -[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 -Invoke-WebRequest -OutFile C:\lifecycled\bin\lifecycled.exe https://github.com/buildkite/lifecycled/releases/download/${lifecycled_version}/lifecycled-windows-amd64.exe - -Write-Output "Configure lifecycled to run on startup..." -nssm install lifecycled C:\lifecycled\bin\lifecycled.exe -If ($lastexitcode -ne 0) { Exit $lastexitcode } -nssm set lifecycled AppStdout C:\lifecycled\lifecycled.log -If ($lastexitcode -ne 0) { Exit $lastexitcode } -nssm set lifecycled AppStderr C:\lifecycled\lifecycled.log -If ($lastexitcode -ne 0) { Exit $lastexitcode } diff --git a/templates/aws-stack.yml b/templates/aws-stack.yml index 452cd74cf..5a45701a3 100644 --- a/templates/aws-stack.yml +++ b/templates/aws-stack.yml @@ -437,6 +437,12 @@ Parameters: - "false" Default: "false" + MaxInstanceLifetime: + Type: Number + Description: The maximum amount of time, in seconds, that an instance can be in service. The default is null. If specified, the value must be either 0 or a number equal to or greater than 86,400 seconds (1 day). 
+ Default: 0 + MinValue: 0 + Rules: HasToken: Assertions: @@ -586,6 +592,9 @@ Conditions: - !Equals [ !Select [ 0, !Split [ ".", !Ref InstanceType ] ], "r6g" ] - !Equals [ !Select [ 0, !Split [ ".", !Ref InstanceType ] ], "r6gd" ] + UseMaxInstanceLifetime: + !Not [ !Equals [ !Ref MaxInstanceLifetime, 0 ] ] + Mappings: ECRManagedPolicy: none : { Policy: '' } @@ -842,7 +851,7 @@ Resources: - ec2messages:FailMessage - ec2messages:GetEndpoint - ec2messages:GetMessages - - ec2messages:SendRepl + - ec2messages:SendReply Resource: "*" Roles: - !Ref IAMRole @@ -1061,6 +1070,7 @@ Resources: MinSize: !Ref MinSize MaxSize: !Ref MaxSize Cooldown: 60 + CapacityRebalance: true MetricsCollection: - Granularity: 1Minute Metrics: @@ -1072,7 +1082,7 @@ Resources: TerminationPolicies: - OldestLaunchConfiguration - ClosestToNextInstanceHour - NewInstancesProtectedFromScaleIn: true + MaxInstanceLifetime: !If [UseMaxInstanceLifetime, !Ref MaxInstanceLifetime, !Ref "AWS::NoValue"] CreationPolicy: ResourceSignal: Timeout: !If [ UseDefaultInstanceCreationTimeout, !If [ UseWindowsAgents, PT10M, PT5M ], !Ref InstanceCreationTimeout ] @@ -1081,62 +1091,6 @@ Resources: AutoScalingReplacingUpdate: WillReplace: true - AsgProcessSuspenderRole: - Type: AWS::IAM::Role - Properties: - PermissionsBoundary: !If [ SetInstanceRolePermissionsBoundaryARN, !Ref InstanceRolePermissionsBoundaryARN, !Ref "AWS::NoValue" ] - AssumeRolePolicyDocument: - Version: 2012-10-17 - Statement: - - Effect: Allow - Principal: - Service: - - lambda.amazonaws.com - Action: - - sts:AssumeRole - ManagedPolicyArns: - - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole - Policies: - - PolicyName: AsgProcessModification - PolicyDocument: - Version: 2012-10-17 - Statement: - - Effect: Allow - Action: - - 'autoscaling:SuspendProcesses' - Resource: !Sub arn:${AWS::Partition}:autoscaling:${AWS::Region}:${AWS::AccountId}:autoScalingGroup:*:autoScalingGroupName/${AWS::StackName}-AgentAutoScaleGroup-* - - 
AzRebalancingSuspenderFunction: - Type: AWS::Lambda::Function - Properties: - Description: 'Disables AZ Rebalancing on the agent ASG' - Code: - ZipFile: | - import cfnresponse - import boto3 - def handler(event, context): - try: - if event['RequestType'] == 'Delete': - cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, "CustomResourcePhysicalID") - else: - client = boto3.client('autoscaling') - props = event['ResourceProperties'] - response = client.suspend_processes(AutoScalingGroupName=props['AutoScalingGroupName'], ScalingProcesses=['AZRebalance']) - cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, "CustomResourcePhysicalID") - except BaseException as err: - print('ERROR: ', err) - cfnresponse.send(event, context, cfnresponse.FAILED, {}, "CustomResourcePhysicalID") - Handler: index.handler - Role: !GetAtt AsgProcessSuspenderRole.Arn - Runtime: 'python3.7' - - AzRebalancingSuspender: - Type: AWS::CloudFormation::CustomResource - Version: 1.0 - Properties: - ServiceToken: !GetAtt AzRebalancingSuspenderFunction.Arn - AutoScalingGroupName: !Ref AgentAutoScaleGroup - SecurityGroup: Type: AWS::EC2::SecurityGroup Condition: CreateSecurityGroup @@ -1163,7 +1117,7 @@ Resources: Properties: Location: ApplicationId: arn:aws:serverlessrepo:us-east-1:172840064832:applications/buildkite-agent-scaler - SemanticVersion: '1.1.3' + SemanticVersion: '1.2.0' Parameters: BuildkiteAgentTokenParameter: !If [ UseCustomerManagedParameterPath, !Ref BuildkiteAgentTokenParameterStorePath, !Ref BuildkiteAgentTokenParameter ] BuildkiteAgentTokenParameterStoreKMSKey: !If [ UseCustomerManagedKeyForParameterStore, !Ref BuildkiteAgentTokenParameterStoreKMSKey, "" ] @@ -1174,3 +1128,272 @@ Resources: AgentAutoScaleGroup: !Ref AgentAutoScaleGroup ScaleOutFactor: !Ref ScaleOutFactor ScaleOutForWaitingJobs: !Ref ScaleOutForWaitingJobs + DisableScaleIn: "false" + + EventBridgeRuleRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + 
Principal: + Service: events.amazonaws.com + Action: sts:AssumeRole + Policies: + - PolicyName: StartSsmAutomation + PolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Action: ssm:StartAutomationExecution + Resource: + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:automation-definition/${BootHookAutomation}:$DEFAULT + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:automation-definition/${ShutdownHookAutomation}:$DEFAULT + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:automation-definition/${SpotInteruptionAutomation}:$DEFAULT + - Effect: Allow + Action: iam:PassRole + Resource: !GetAtt AutomationRole.Arn + + AutomationRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Principal: + Service: ssm.amazonaws.com + Action: sts:AssumeRole + Policies: + - PolicyName: RunInstanceShellScripts + PolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Action: ssm:DescribeInstanceInformation + Resource: "*" + - Effect: Allow + Action: ssm:SendCommand + Resource: + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}::document/AWS-RunShellScript + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}::document/AWS-RunPowerShellScript + - Effect: Allow + Action: ssm:SendCommand + Resource: !Sub arn:${AWS::Partition}:ec2:${AWS::Region}:${AWS::AccountId}:instance/* + - Effect: Allow + Action: + - ssm:ListCommands + - ssm:ListCommandInvocations + Resource: "*" + - PolicyName: CompleteLifecycleActions + PolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Action: + - autoscaling:CompleteLifecycleAction + - autoscaling:TerminateInstanceInAutoScalingGroup + Resource: !Sub arn:${AWS::Partition}:autoscaling:${AWS::Region}:${AWS::AccountId}:autoScalingGroup:*:autoScalingGroupName/${AWS::StackName}-AgentAutoScaleGroup-* + + BootHook: + Type: AWS::AutoScaling::LifecycleHook + Properties: + AutoScalingGroupName: !Ref 
AgentAutoScaleGroup + LifecycleHookName: BootHook + LifecycleTransition: autoscaling:EC2_INSTANCE_LAUNCHING + # We give Linux instances five minutes (ten minutes otherwise) to respond to + # this hook, else we abandon them + HeartbeatTimeout: !If [ UseLinuxAgents, 300, 600 ] + + BootHookRule: + Type: AWS::Events::Rule + Properties: + Description: !Sub Run the boot time AWS SSM Automation for ${BootHook} + EventPattern: + source: + - aws.autoscaling + detail-type: + - "EC2 Instance-launch Lifecycle Action" + detail: + AutoScalingGroupName: [ !Ref AgentAutoScaleGroup ] + LifecycleHookName: [ !Ref BootHook ] + Targets: + - Arn: !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:automation-definition/${BootHookAutomation}:$DEFAULT + RoleArn: !GetAtt EventBridgeRuleRole.Arn + Id: TargetSsmAutomation + InputTransformer: + InputPathsMap: + instanceid: "$.detail.EC2InstanceId" + InputTemplate: "{\"InstanceId\":[<instanceid>]}" + + BootHookAutomation: + Type: AWS::SSM::Document + Properties: + DocumentType: Automation + Content: + schemaVersion: "0.3" + assumeRole: !GetAtt AutomationRole.Arn + description: Start the buildkite-agent and complete the launch lifecycle action + parameters: + InstanceId: + type: String + AutoScalingGroupName: + type: String + default: !Ref AgentAutoScaleGroup + LifecycleHook: + type: String + default: !Ref BootHook + mainSteps: + - !If + - UseLinuxAgents + - name: RunCommand + action: aws:runCommand + inputs: + DocumentName: AWS-RunShellScript + InstanceIds: + - "{{ InstanceId }}" + Parameters: + executionTimeout: "300" + commands: + - systemctl start buildkite-agent + - name: RunCommand + action: aws:runCommand + inputs: + DocumentName: AWS-RunPowerShellScript + InstanceIds: + - "{{ InstanceId }}" + Parameters: + executionTimeout: "600" + commands: + - nssm start buildkite-agent + - name: CompleteLifecycleAction + action: aws:executeAwsApi + inputs: + Service: autoscaling + Api: CompleteLifecycleAction + AutoScalingGroupName: "{{ AutoScalingGroupName }}" + InstanceId: "{{ 
InstanceId }}" + LifecycleActionResult: "CONTINUE" + LifecycleHookName: "{{ LifecycleHook }}" + + ShutdownHook: + Type: AWS::AutoScaling::LifecycleHook + Properties: + AutoScalingGroupName: !Ref AgentAutoScaleGroup + LifecycleHookName: ShutdownHook + LifecycleTransition: autoscaling:EC2_INSTANCE_TERMINATING + # We give instances one hour to respond to this hook else we continue + HeartbeatTimeout: 3600 + DefaultResult: CONTINUE + + ShutdownHookRule: + Type: AWS::Events::Rule + Properties: + Description: !Sub Run the shutdown time AWS SSM Automation for ${ShutdownHook} + EventPattern: + source: + - aws.autoscaling + detail-type: + - "EC2 Instance-terminate Lifecycle Action" + detail: + AutoScalingGroupName: [ !Ref AgentAutoScaleGroup ] + LifecycleHookName: [ !Ref ShutdownHook ] + Targets: + - Arn: !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:automation-definition/${ShutdownHookAutomation}:$DEFAULT + RoleArn: !GetAtt EventBridgeRuleRole.Arn + Id: TargetSsmAutomation + InputTransformer: + InputPathsMap: + instanceid: "$.detail.EC2InstanceId" + InputTemplate: "{\"InstanceId\":[<instanceid>]}" + + ShutdownHookAutomation: + Type: AWS::SSM::Document + Properties: + DocumentType: Automation + Content: + schemaVersion: "0.3" + assumeRole: !GetAtt AutomationRole.Arn + description: Stop the buildkite-agent, wait for it to exit, complete the terminate lifecycle action + parameters: + InstanceId: + type: String + AutoScalingGroupName: + type: String + default: !Ref AgentAutoScaleGroup + LifecycleHook: + type: String + default: !Ref ShutdownHook + mainSteps: + - !If + - UseLinuxAgents + - name: RunCommand + action: aws:runCommand + inputs: + DocumentName: AWS-RunShellScript + InstanceIds: + - "{{ InstanceId }}" + Parameters: + executionTimeout: "3600" + commands: + - systemctl stop buildkite-agent + - name: RunCommand + action: aws:runCommand + inputs: + DocumentName: AWS-RunPowerShellScript + InstanceIds: + - "{{ InstanceId }}" + Parameters: + executionTimeout: "3600" + 
commands: + - nssm stop buildkite-agent + - name: CompleteLifecycleAction + action: aws:executeAwsApi + inputs: + Service: autoscaling + Api: CompleteLifecycleAction + AutoScalingGroupName: "{{ AutoScalingGroupName }}" + InstanceId: "{{ InstanceId }}" + LifecycleActionResult: "CONTINUE" + LifecycleHookName: "{{ LifecycleHook }}" + + # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-interruptions.html#spot-instance-termination-notices + SpotInterruptionRule: + Type: AWS::Events::Rule + Properties: + Description: !Sub Run the spot interruption AWS SSM Automation for ${AgentAutoScaleGroup} + EventPattern: + source: + - aws.ec2 + detail-type: + - EC2 Spot Instance Interruption Warning + Targets: + - Arn: !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:automation-definition/${SpotInteruptionAutomation}:$DEFAULT + RoleArn: !GetAtt EventBridgeRuleRole.Arn + Id: TargetSsmAutomation + InputTransformer: + InputPathsMap: + instanceid: "$.detail.instance-id" + InputTemplate: "{\"InstanceId\":[<instanceid>]}" + + SpotInteruptionAutomation: + Type: AWS::SSM::Document + Properties: + DocumentType: Automation + Content: + schemaVersion: "0.3" + assumeRole: !GetAtt AutomationRole.Arn + description: Terminate the instance in the Auto Scaling group, making it go through the shutdown hook. 
+ parameters: + InstanceId: + type: String + AutoScalingGroupName: + type: String + default: !Ref AgentAutoScaleGroup + mainSteps: + - name: TerminateInstanceInAutoScalingGroup + action: aws:executeAwsApi + inputs: + Service: autoscaling + Api: TerminateInstanceInAutoScalingGroup + InstanceId: "{{ InstanceId }}" + ShouldDecrementDesiredCapacity: false diff --git a/templates/service-role.yml b/templates/service-role.yml index f63469f9b..b4292640c 100644 --- a/templates/service-role.yml +++ b/templates/service-role.yml @@ -214,6 +214,22 @@ Resources: ], "Resource": "arn:aws:ssm:*:*:parameter/*" }, + { + "Effect": "Allow", + "Action": [ + "ssm:ListDocuments", + "ssm:CreateDocument", + "ssm:GetDocument", + "ssm:UpdateDocument", + "ssm:UpdateDocumentMetadata", + "ssm:DescribeDocument", + "ssm:DeleteDocument", + "ssm:ListTagsForResource", + "ssm:AddTagsToResource", + "ssm:RemoveTagsFromResource" + ], + "Resource": "*" + }, { "Effect": "Allow", "Action": [