Skip to content

Commit 749e6fc

Browse files
authored
Granular max-in-flight updates (#4124)
* Scale action now compares number of starting/running non-routable instances to max-in-flight, instead of waiting for all instances to become routable * Scale action does not continue scaling up when any instances are 'unhealthy' (e.g. 'crashed', 'down', etc) as it's difficult to determine if unhealthy instances belong to the 'max in flight' group * Number of desired nondeploying instances now recalculated each iteration instead of decrementing by 'max-in-flight' without checking if it's in a correct state. This mitigates bugs where deployment trains (continually creating new deployments before the previous had completed) could result in number app instances exceeding the max-in-flight limit.
1 parent 7536b72 commit 749e6fc

File tree

2 files changed

+317
-49
lines changed
  • lib/cloud_controller/deployment_updater/actions
  • spec/unit/lib/cloud_controller/deployment_updater/actions

2 files changed

+317
-49
lines changed

lib/cloud_controller/deployment_updater/actions/scale.rb

Lines changed: 67 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ module VCAP::CloudController
66
module DeploymentUpdater
77
module Actions
88
class Scale
9+
HEALTHY_STATES = [VCAP::CloudController::Diego::LRP_RUNNING, VCAP::CloudController::Diego::LRP_STARTING].freeze
910
attr_reader :deployment, :logger, :app
1011

1112
def initialize(deployment, logger)
@@ -17,14 +18,15 @@ def initialize(deployment, logger)
1718
def call
1819
deployment.db.transaction do
1920
return unless deployment.lock!.state == DeploymentModel::DEPLOYING_STATE
20-
return unless all_instances_routable?
21+
22+
return unless can_scale? || can_downscale?
2123

2224
app.lock!
25+
2326
oldest_web_process_with_instances.lock!
2427
deploying_web_process.lock!
2528

2629
deployment.update(
27-
last_healthy_at: Time.now,
2830
state: DeploymentModel::DEPLOYING_STATE,
2931
status_value: DeploymentModel::ACTIVE_STATUS_VALUE,
3032
status_reason: DeploymentModel::DEPLOYING_STATUS_REASON
@@ -36,36 +38,88 @@ def call
3638
end
3739

3840
ScaleDownCanceledProcesses.new(deployment).call
39-
ScaleDownOldProcess.new(deployment, oldest_web_process_with_instances, desired_old_instances).call
4041

41-
deploying_web_process.update(instances: desired_new_instances)
42+
scale_down_old_processes if can_downscale?
43+
44+
if can_scale?
45+
deploying_web_process.update(instances: desired_new_instances)
46+
deployment.update(last_healthy_at: Time.now)
47+
end
4248
end
4349
end
4450

4551
private
4652

47-
def desired_old_instances
48-
[(oldest_web_process_with_instances.instances - deployment.max_in_flight), 0].max
53+
def scale_down_old_processes
54+
instances_to_reduce = non_deploying_web_processes.map(&:instances).sum - desired_non_deploying_instances
55+
56+
return if instances_to_reduce <= 0
57+
58+
non_deploying_web_processes.each do |process|
59+
if instances_to_reduce < process.instances
60+
ScaleDownOldProcess.new(deployment, process, process.instances - instances_to_reduce).call
61+
break
62+
end
63+
64+
instances_to_reduce -= process.instances
65+
ScaleDownOldProcess.new(deployment, process, 0).call
66+
end
67+
end
68+
69+
def can_scale?
70+
starting_instances.count < deployment.max_in_flight &&
71+
unhealthy_instances.count == 0 &&
72+
routable_instances.count >= deploying_web_process.instances - deployment.max_in_flight
73+
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
74+
logger.info("skipping-deployment-update-for-#{deployment.guid}")
75+
false
76+
end
77+
78+
def can_downscale?
79+
non_deploying_web_processes.map(&:instances).sum > desired_non_deploying_instances
80+
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
81+
logger.info("skipping-deployment-update-for-#{deployment.guid}")
82+
false
83+
end
84+
85+
def desired_non_deploying_instances
86+
[deployment.original_web_process_instance_count - routable_instances.count, 0].max
4987
end
5088

5189
def desired_new_instances
52-
[deploying_web_process.instances + deployment.max_in_flight, deployment.original_web_process_instance_count].min
90+
[routable_instances.count + deployment.max_in_flight, deployment.original_web_process_instance_count].min
5391
end
5492

5593
def oldest_web_process_with_instances
5694
@oldest_web_process_with_instances ||= app.web_processes.select { |process| process.instances > 0 }.min_by { |p| [p.created_at, p.id] }
5795
end
5896

97+
def non_deploying_web_processes
98+
app.web_processes.reject { |process| process.guid == deploying_web_process.guid }.sort_by { |p| [p.created_at, p.id] }
99+
end
100+
59101
def deploying_web_process
60102
@deploying_web_process ||= deployment.deploying_web_process
61103
end
62104

63-
def all_instances_routable?
64-
instances = instance_reporters.all_instances_for_app(deployment.deploying_web_process)
65-
instances.all? { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
66-
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
67-
logger.info("skipping-deployment-update-for-#{deployment.guid}")
68-
false
105+
def starting_instances
106+
healthy_instances.reject { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
107+
end
108+
109+
def routable_instances
110+
reported_instances.select { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
111+
end
112+
113+
def healthy_instances
114+
reported_instances.select { |_, val| HEALTHY_STATES.include?(val[:state]) }
115+
end
116+
117+
def unhealthy_instances
118+
reported_instances.reject { |_, val| HEALTHY_STATES.include?(val[:state]) }
119+
end
120+
121+
def reported_instances
122+
@reported_instances = instance_reporters.all_instances_for_app(deploying_web_process)
69123
end
70124

71125
def instance_reporters

0 commit comments

Comments
 (0)