Skip to content

Commit cfd3d29

Browse files
authored
Deployment scale refactor (#4212)
* Refactor finalize out of scale action * also start separating total instance count from current scale target * Breakout Scale into UpScale/DownScale actions * also move logic into instance_reporters
1 parent 57ee2c3 commit cfd3d29

File tree

13 files changed

+695
-145
lines changed

13 files changed

+695
-145
lines changed

lib/cloud_controller/backends/instances_reporters.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def stats_for_app(app)
2727
raise CloudController::Errors::ApiError.new_from_details('StatsUnavailable', 'Stats server temporarily unavailable.')
2828
end
2929

30-
delegate :number_of_starting_and_running_instances_for_processes, to: :diego_reporter
30+
delegate :number_of_starting_and_running_instances_for_processes, :instance_count_summary, to: :diego_reporter
3131

3232
private
3333

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
require 'cloud_controller/deployment_updater/actions/scale_down_old_process'
2+
module VCAP::CloudController
3+
module DeploymentUpdater
4+
module Actions
5+
class DownScaler
6+
attr_reader :deployment, :logger, :app
7+
8+
def initialize(deployment, logger, target_total_instance_count, routable_instance_count)
9+
@deployment = deployment
10+
@app = deployment.app
11+
@logger = logger
12+
@target_total_instance_count = target_total_instance_count
13+
@routable_instance_count = routable_instance_count
14+
end
15+
16+
def scale_down
17+
instances_to_reduce = non_deploying_web_processes.map(&:instances).sum - desired_non_deploying_instances
18+
19+
return if instances_to_reduce <= 0
20+
21+
non_deploying_web_processes.each do |process|
22+
if instances_to_reduce < process.instances
23+
ScaleDownOldProcess.new(deployment, process, process.instances - instances_to_reduce).call
24+
break
25+
end
26+
27+
instances_to_reduce -= process.instances
28+
ScaleDownOldProcess.new(deployment, process, 0).call
29+
end
30+
end
31+
32+
def can_downscale?
33+
non_deploying_web_processes.map(&:instances).sum > desired_non_deploying_instances
34+
end
35+
36+
def desired_non_deploying_instances
37+
[@target_total_instance_count - @routable_instance_count, 0].max
38+
end
39+
40+
private
41+
42+
def non_deploying_web_processes
43+
app.web_processes.reject { |process| process.guid == deployment.deploying_web_process.guid }.sort_by { |p| [p.created_at, p.id] }
44+
end
45+
end
46+
end
47+
end
48+
end

lib/cloud_controller/deployment_updater/actions/finalize.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def call
1515

1616
update_non_web_processes
1717
restart_non_web_processes
18+
1819
deployment.update(
1920
state: DeploymentModel::DEPLOYED_STATE,
2021
status_value: DeploymentModel::FINALIZED_STATUS_VALUE,

lib/cloud_controller/deployment_updater/actions/scale.rb

Lines changed: 18 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,127 +1,70 @@
11
require 'cloud_controller/deployment_updater/actions/scale_down_canceled_processes'
2-
require 'cloud_controller/deployment_updater/actions/scale_down_old_process'
32
require 'cloud_controller/deployment_updater/actions/finalize'
3+
require 'cloud_controller/deployment_updater/actions/down_scaler'
4+
require 'cloud_controller/deployment_updater/actions/up_scaler'
5+
require 'cloud_controller/diego/constants'
46

57
module VCAP::CloudController
68
module DeploymentUpdater
79
module Actions
810
class Scale
9-
HEALTHY_STATES = [VCAP::CloudController::Diego::LRP_RUNNING, VCAP::CloudController::Diego::LRP_STARTING].freeze
10-
attr_reader :deployment, :logger, :app
11+
attr_reader :deployment, :logger, :app, :target_total_instance_count, :interim_desired_instance_count
1112

12-
def initialize(deployment, logger)
13+
def initialize(deployment, logger, target_total_instance_count, interim_desired_instance_count=nil)
1314
@deployment = deployment
1415
@logger = logger
1516
@app = deployment.app
17+
@target_total_instance_count = target_total_instance_count
18+
@interim_desired_instance_count = interim_desired_instance_count || target_total_instance_count
1619
end
1720

1821
def call
22+
down_scaler = DownScaler.new(deployment, logger, target_total_instance_count, instance_count_summary.routable_instances_count)
23+
up_scaler = UpScaler.new(deployment, logger, interim_desired_instance_count, instance_count_summary)
24+
1925
deployment.db.transaction do
2026
return unless deployment.lock!.state == DeploymentModel::DEPLOYING_STATE
21-
22-
return unless can_scale? || can_downscale?
27+
return unless up_scaler.can_scale? || down_scaler.can_downscale?
2328

2429
app.lock!
2530

2631
oldest_web_process_with_instances.lock!
2732
deploying_web_process.lock!
2833

2934
deployment.update(
30-
state: DeploymentModel::DEPLOYING_STATE,
3135
status_value: DeploymentModel::ACTIVE_STATUS_VALUE,
3236
status_reason: DeploymentModel::DEPLOYING_STATUS_REASON
3337
)
3438

35-
if deploying_web_process.instances >= deployment.original_web_process_instance_count
36-
Finalize.new(deployment).call
37-
return
38-
end
39-
4039
ScaleDownCanceledProcesses.new(deployment).call
4140

42-
scale_down_old_processes if can_downscale?
43-
44-
if can_scale?
45-
deploying_web_process.update(instances: desired_new_instances)
46-
deployment.update(last_healthy_at: Time.now)
47-
end
48-
end
49-
end
50-
51-
private
41+
down_scaler.scale_down if down_scaler.can_downscale?
5242

53-
def scale_down_old_processes
54-
instances_to_reduce = non_deploying_web_processes.map(&:instances).sum - desired_non_deploying_instances
43+
return true if up_scaler.finished_scaling?
5544

56-
return if instances_to_reduce <= 0
57-
58-
non_deploying_web_processes.each do |process|
59-
if instances_to_reduce < process.instances
60-
ScaleDownOldProcess.new(deployment, process, process.instances - instances_to_reduce).call
61-
break
62-
end
63-
64-
instances_to_reduce -= process.instances
65-
ScaleDownOldProcess.new(deployment, process, 0).call
45+
up_scaler.scale_up if up_scaler.can_scale?
6646
end
67-
end
68-
69-
def can_scale?
70-
starting_instances.count < deployment.max_in_flight &&
71-
unhealthy_instances.count == 0 &&
72-
routable_instances.count >= deploying_web_process.instances - deployment.max_in_flight
73-
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
74-
logger.info("skipping-deployment-update-for-#{deployment.guid}")
7547
false
76-
end
77-
78-
def can_downscale?
79-
non_deploying_web_processes.map(&:instances).sum > desired_non_deploying_instances
8048
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
8149
logger.info("skipping-deployment-update-for-#{deployment.guid}")
8250
false
8351
end
8452

85-
def desired_non_deploying_instances
86-
[deployment.original_web_process_instance_count - routable_instances.count, 0].max
87-
end
88-
89-
def desired_new_instances
90-
[routable_instances.count + deployment.max_in_flight, deployment.original_web_process_instance_count].min
91-
end
53+
private
9254

9355
def oldest_web_process_with_instances
56+
# TODO: lock all web processes? We might alter all of them, depending on max-in-flight size
9457
@oldest_web_process_with_instances ||= app.web_processes.select { |process| process.instances > 0 }.min_by { |p| [p.created_at, p.id] }
9558
end
9659

97-
def non_deploying_web_processes
98-
app.web_processes.reject { |process| process.guid == deploying_web_process.guid }.sort_by { |p| [p.created_at, p.id] }
60+
def instance_count_summary
61+
@instance_count_summary ||= instance_reporters.instance_count_summary(deploying_web_process)
9962
end
10063

10164
def deploying_web_process
10265
@deploying_web_process ||= deployment.deploying_web_process
10366
end
10467

105-
def starting_instances
106-
healthy_instances.reject { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
107-
end
108-
109-
def routable_instances
110-
reported_instances.select { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
111-
end
112-
113-
def healthy_instances
114-
reported_instances.select { |_, val| HEALTHY_STATES.include?(val[:state]) }
115-
end
116-
117-
def unhealthy_instances
118-
reported_instances.reject { |_, val| HEALTHY_STATES.include?(val[:state]) }
119-
end
120-
121-
def reported_instances
122-
@reported_instances = instance_reporters.all_instances_for_app(deploying_web_process)
123-
end
124-
12568
def instance_reporters
12669
CloudController::DependencyLocator.instance.instances_reporters
12770
end
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
module VCAP::CloudController
2+
module DeploymentUpdater
3+
module Actions
4+
class UpScaler
5+
attr_reader :deployment, :logger, :app, :interim_desired_instance_count
6+
7+
def initialize(deployment, logger, interim_desired_instance_count, instance_count_summary)
8+
@deployment = deployment
9+
@app = deployment.app
10+
@logger = logger
11+
@interim_desired_instance_count = interim_desired_instance_count
12+
@starting_instances_count = instance_count_summary.starting_instances_count
13+
@unhealthy_instances_count = instance_count_summary.unhealthy_instances_count
14+
@routable_instances_count = instance_count_summary.routable_instances_count
15+
end
16+
17+
def scale_up
18+
return unless can_scale?
19+
20+
deploying_web_process.update(instances: desired_new_instances)
21+
deployment.update(last_healthy_at: Time.now)
22+
end
23+
24+
def can_scale?
25+
@starting_instances_count < deployment.max_in_flight &&
26+
@unhealthy_instances_count == 0 &&
27+
# if routable instances is < deploying_web_process.instances - deployment.max_in_flight
28+
# then that indicates that Diego isnt in sync with CAPI yet
29+
@routable_instances_count >= deploying_web_process.instances - deployment.max_in_flight
30+
end
31+
32+
def finished_scaling?
33+
deploying_web_process.instances >= interim_desired_instance_count
34+
end
35+
36+
private
37+
38+
def desired_new_instances
39+
[@routable_instances_count + deployment.max_in_flight, interim_desired_instance_count].min
40+
end
41+
42+
def deploying_web_process
43+
@deploying_web_process ||= deployment.deploying_web_process
44+
end
45+
end
46+
end
47+
end
48+
end

lib/cloud_controller/deployment_updater/updater.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
require 'cloud_controller/deployment_updater/actions/scale'
22
require 'cloud_controller/deployment_updater/actions/cancel'
33
require 'cloud_controller/deployment_updater/actions/canary'
4+
require 'cloud_controller/deployment_updater/actions/finalize'
45
module VCAP::CloudController
56
module DeploymentUpdater
67
class Updater
@@ -13,7 +14,8 @@ def initialize(deployment, logger)
1314

1415
def scale
1516
with_error_logging('error-scaling-deployment') do
16-
Actions::Scale.new(deployment, logger).call
17+
finished = Actions::Scale.new(deployment, logger, deployment.original_web_process_instance_count).call
18+
Actions::Finalize.new(deployment).call if finished
1719
logger.info("ran-deployment-update-for-#{deployment.guid}")
1820
end
1921
end

lib/cloud_controller/diego/reporters/instances_reporter.rb

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ module VCAP::CloudController
66
module Diego
77
class InstancesReporter
88
include ReporterMixins
9-
9+
InstanceCountSummary = Struct.new(:starting_instances_count, :routable_instances_count, :healthy_instances_count, :unhealthy_instances_count)
10+
HEALTHY_STATES = [VCAP::CloudController::Diego::LRP_RUNNING, VCAP::CloudController::Diego::LRP_STARTING].freeze
1011
UNKNOWN_INSTANCE_COUNT = -1
1112

1213
def initialize(bbs_instances_client)
@@ -105,6 +106,17 @@ def crashed_instances_for_app(process)
105106
raise CloudController::Errors::InstancesUnavailable.new(e)
106107
end
107108

109+
def instance_count_summary(process)
110+
instances = all_instances_for_app(process)
111+
112+
healthy_instances = instances.select { |_, val| HEALTHY_STATES.include?(val[:state]) }
113+
unhealthy_instances = instances.reject { |_, val| HEALTHY_STATES.include?(val[:state]) }
114+
starting_instances = healthy_instances.reject { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
115+
routable_instances = instances.select { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
116+
117+
InstanceCountSummary.new(starting_instances.count, routable_instances.count, healthy_instances.count, unhealthy_instances.count)
118+
end
119+
108120
private
109121

110122
attr_reader :bbs_instances_client

lib/cloud_controller/metrics/prometheus_updater.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
require 'prometheus/client'
2+
require 'prometheus/client/data_stores/direct_file_store'
23

34
module VCAP::CloudController::Metrics
45
class PrometheusUpdater

0 commit comments

Comments
 (0)