Skip to content

Commit 0e6bbc2

Browse files
committed
Granular max-in-flight updates
* Scale action now compares number of starting/running non-routable instances to max-in-flight, instead of waiting for all instances to become routable * Scale action does not continue scaling up when any instances are 'unhealthy' (e.g. 'crashed', 'down', etc) as it's difficult to determine if unhealthy instances belong to the 'max in flight' group * Number of desired nondeploying instances now recalculated each iteration instead of decrementing by 'max-in-flight' without checking if it's in a correct state. This mitigates bugs where deployment trains (continually creating new deployments before the previous had completed) could result in number app instances exceeding the max-in-flight limit.
1 parent 77e7b15 commit 0e6bbc2

File tree

2 files changed

+291
-49
lines changed
  • lib/cloud_controller/deployment_updater/actions
  • spec/unit/lib/cloud_controller/deployment_updater/actions

2 files changed

+291
-49
lines changed

lib/cloud_controller/deployment_updater/actions/scale.rb

Lines changed: 65 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ module VCAP::CloudController
66
module DeploymentUpdater
77
module Actions
88
class Scale
9+
HEALTHY_STATES = [VCAP::CloudController::Diego::LRP_RUNNING, VCAP::CloudController::Diego::LRP_STARTING].freeze
910
attr_reader :deployment, :logger, :app
1011

1112
def initialize(deployment, logger)
@@ -17,14 +18,15 @@ def initialize(deployment, logger)
1718
def call
1819
deployment.db.transaction do
1920
return unless deployment.lock!.state == DeploymentModel::DEPLOYING_STATE
20-
return unless all_instances_routable?
21+
22+
return unless can_scale? || can_downscale?
2123

2224
app.lock!
25+
2326
oldest_web_process_with_instances.lock!
2427
deploying_web_process.lock!
2528

2629
deployment.update(
27-
last_healthy_at: Time.now,
2830
state: DeploymentModel::DEPLOYING_STATE,
2931
status_value: DeploymentModel::ACTIVE_STATUS_VALUE,
3032
status_reason: DeploymentModel::DEPLOYING_STATUS_REASON
@@ -36,36 +38,86 @@ def call
3638
end
3739

3840
ScaleDownCanceledProcesses.new(deployment).call
39-
ScaleDownOldProcess.new(deployment, oldest_web_process_with_instances, desired_old_instances).call
4041

41-
deploying_web_process.update(instances: desired_new_instances)
42+
scale_down_old_processes if can_downscale?
43+
44+
if can_scale?
45+
deploying_web_process.update(instances: desired_new_instances)
46+
deployment.update(last_healthy_at: Time.now)
47+
end
4248
end
4349
end
4450

4551
private
4652

47-
def desired_old_instances
48-
[(oldest_web_process_with_instances.instances - deployment.max_in_flight), 0].max
53+
def scale_down_old_processes
54+
instances_to_reduce = non_deploying_web_processes.map(&:instances).sum - desired_non_deploying_instances
55+
56+
return if instances_to_reduce <= 0
57+
58+
non_deploying_web_processes.each do |process|
59+
if instances_to_reduce < process.instances
60+
ScaleDownOldProcess.new(deployment, process, process.instances - instances_to_reduce).call
61+
break
62+
end
63+
64+
instances_to_reduce -= process.instances
65+
ScaleDownOldProcess.new(deployment, process, 0).call
66+
end
67+
end
68+
69+
def can_scale?
70+
starting_instances.count < deployment.max_in_flight && unhealthy_instances.count == 0
71+
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
72+
logger.info("skipping-deployment-update-for-#{deployment.guid}")
73+
false
74+
end
75+
76+
def can_downscale?
77+
non_deploying_web_processes.map(&:instances).sum > desired_non_deploying_instances
78+
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
79+
logger.info("skipping-deployment-update-for-#{deployment.guid}")
80+
false
81+
end
82+
83+
def desired_non_deploying_instances
84+
[deployment.original_web_process_instance_count - routable_instances.count, 0].max
4985
end
5086

5187
def desired_new_instances
52-
[deploying_web_process.instances + deployment.max_in_flight, deployment.original_web_process_instance_count].min
88+
[deploying_web_process.instances + deployment.max_in_flight - starting_instances.count, deployment.original_web_process_instance_count].min
5389
end
5490

5591
def oldest_web_process_with_instances
5692
@oldest_web_process_with_instances ||= app.web_processes.select { |process| process.instances > 0 }.min_by { |p| [p.created_at, p.id] }
5793
end
5894

95+
def non_deploying_web_processes
96+
app.web_processes.reject { |process| process.guid == deploying_web_process.guid }.sort_by { |p| [p.created_at, p.id] }
97+
end
98+
5999
def deploying_web_process
60100
@deploying_web_process ||= deployment.deploying_web_process
61101
end
62102

63-
def all_instances_routable?
64-
instances = instance_reporters.all_instances_for_app(deployment.deploying_web_process)
65-
instances.all? { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
66-
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
67-
logger.info("skipping-deployment-update-for-#{deployment.guid}")
68-
false
103+
def starting_instances
104+
healthy_instances.reject { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
105+
end
106+
107+
def routable_instances
108+
reported_instances.select { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
109+
end
110+
111+
def healthy_instances
112+
reported_instances.select { |_, val| HEALTHY_STATES.include?(val[:state]) }
113+
end
114+
115+
def unhealthy_instances
116+
reported_instances.reject { |_, val| HEALTHY_STATES.include?(val[:state]) }
117+
end
118+
119+
def reported_instances
120+
@reported_instances = instance_reporters.all_instances_for_app(deployment.deploying_web_process)
69121
end
70122

71123
def instance_reporters

0 commit comments

Comments
 (0)