Skip to content

Commit d6e38e4

Browse files
committed
Granular max-in-flight updates (wip)
1 parent 9e1a90d commit d6e38e4

File tree

2 files changed

+221
-37
lines changed
  • lib/cloud_controller/deployment_updater/actions
  • spec/unit/lib/cloud_controller/deployment_updater/actions

2 files changed

+221
-37
lines changed

lib/cloud_controller/deployment_updater/actions/scale.rb

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,10 @@ def initialize(deployment, logger)
1717
def call
1818
deployment.db.transaction do
1919
return unless deployment.lock!.state == DeploymentModel::DEPLOYING_STATE
20-
return unless all_instances_routable?
20+
return unless has_space_to_scale?
2121

2222
app.lock!
23+
2324
oldest_web_process_with_instances.lock!
2425
deploying_web_process.lock!
2526

@@ -36,36 +37,64 @@ def call
3637
end
3738

3839
ScaleDownCanceledProcesses.new(deployment).call
39-
ScaleDownOldProcess.new(deployment, oldest_web_process_with_instances, desired_old_instances).call
4040

41+
instances_to_reduce = non_deploying_web_processes.map(&:instances).sum - desired_non_deploying_instances
42+
43+
if instances_to_reduce > 0
44+
45+
non_deploying_web_processes.each do |process|
46+
if instances_to_reduce > process.instances
47+
instances_to_reduce -= process.instances
48+
ScaleDownOldProcess.new(deployment, process, 0).call
49+
else
50+
ScaleDownOldProcess.new(deployment, process, process.instances - instances_to_reduce).call
51+
break
52+
end
53+
end
54+
end
4155
deploying_web_process.update(instances: desired_new_instances)
4256
end
4357
end
4458

4559
private
4660

47-
def desired_old_instances
48-
[(oldest_web_process_with_instances.instances - deployment.max_in_flight), 0].max
61+
def has_space_to_scale?
62+
nonroutable_instance_count < deployment.max_in_flight
63+
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
64+
logger.info("skipping-deployment-update-for-#{deployment.guid}")
65+
false
66+
end
67+
68+
def desired_non_deploying_instances
69+
[deployment.original_web_process_instance_count - routable_instance_count, 0].max
4970
end
5071

5172
def desired_new_instances
52-
[deploying_web_process.instances + deployment.max_in_flight, deployment.original_web_process_instance_count].min
73+
[deploying_web_process.instances + deployment.max_in_flight - nonroutable_instance_count, deployment.original_web_process_instance_count].min
5374
end
5475

5576
def oldest_web_process_with_instances
5677
@oldest_web_process_with_instances ||= app.web_processes.select { |process| process.instances > 0 }.min_by { |p| [p.created_at, p.id] }
5778
end
5879

80+
def non_deploying_web_processes
81+
app.web_processes.reject { |process| process.guid == deploying_web_process.guid }.sort_by { |p| [p.created_at, p.id] }
82+
end
83+
5984
def deploying_web_process
6085
@deploying_web_process ||= deployment.deploying_web_process
6186
end
6287

63-
def all_instances_routable?
64-
instances = instance_reporters.all_instances_for_app(deployment.deploying_web_process)
65-
instances.all? { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
66-
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
67-
logger.info("skipping-deployment-update-for-#{deployment.guid}")
68-
false
88+
def routable_instance_count
89+
reported_instances.select { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }.count
90+
end
91+
92+
def nonroutable_instance_count
93+
reported_instances.reject { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }.count
94+
end
95+
96+
def reported_instances
97+
@reported_instances = instance_reporters.all_instances_for_app(deployment.deploying_web_process)
6998
end
7099

71100
def instance_reporters

spec/unit/lib/cloud_controller/deployment_updater/actions/scale_spec.rb

Lines changed: 181 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ module VCAP::CloudController
3131
let!(:deploying_route_mapping) { RouteMappingModel.make(app: web_process.app, process_type: deploying_web_process.type) }
3232
let(:space) { web_process.space }
3333
let(:original_web_process_instance_count) { 6 }
34-
let(:current_web_instances) { 2 }
34+
let(:current_web_instances) { 6 }
3535
let(:current_deploying_instances) { 0 }
3636

3737
let(:state) { DeploymentModel::DEPLOYING_STATE }
@@ -42,17 +42,19 @@ module VCAP::CloudController
4242
deploying_web_process: deploying_web_process,
4343
state: state,
4444
original_web_process_instance_count: original_web_process_instance_count,
45-
max_in_flight: 1
45+
max_in_flight: max_in_flight
4646
)
4747
end
4848

49+
let(:max_in_flight) { 1 }
50+
4951
let(:diego_instances_reporter) { instance_double(Diego::InstancesReporter) }
5052
let(:all_instances_results) do
51-
{
52-
0 => { state: 'RUNNING', uptime: 50, since: 2, routable: true },
53-
1 => { state: 'RUNNING', uptime: 50, since: 2, routable: true },
54-
2 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }
55-
}
53+
instances = {}
54+
current_deploying_instances.times do |i|
55+
instances[i] = { state: 'RUNNING', uptime: 50, since: 2, routable: true }
56+
end
57+
instances
5658
end
5759
let(:instances_reporters) { double(:instance_reporters) }
5860
let(:logger) { instance_double(Steno::Logger, info: nil, error: nil) }
@@ -68,13 +70,17 @@ module VCAP::CloudController
6870
expect(deployment).to have_received(:lock!)
6971
end
7072

71-
it 'scales the old web process down by one after the first iteration' do
72-
expect(original_web_process_instance_count).to be > current_deploying_instances
73-
expect do
74-
subject.call
75-
end.to change {
76-
web_process.reload.instances
77-
}.by(-1)
73+
context 'after a new instance has been brought up' do
74+
let(:current_deploying_instances) { 1 }
75+
76+
it 'scales the old web process down by one' do
77+
expect(original_web_process_instance_count).to be > current_deploying_instances
78+
expect do
79+
subject.call
80+
end.to change {
81+
web_process.reload.instances
82+
}.by(-1)
83+
end
7884
end
7985

8086
it 'scales up the new web process by one' do
@@ -104,12 +110,12 @@ module VCAP::CloudController
104110
}.by(2)
105111
end
106112

107-
it 'scales the old web process down by two after the first iteration' do
113+
it 'doesnt scale down the old web process (there are no new routable instances yet)' do
108114
expect do
109115
subject.call
110-
end.to change {
116+
end.not_to(change do
111117
web_process.reload.instances
112-
}.by(-2)
118+
end)
113119
end
114120
end
115121

@@ -125,6 +131,8 @@ module VCAP::CloudController
125131
)
126132
end
127133

134+
let(:current_web_instances) { 1 }
135+
128136
it 'scales up the new web process by the maximum number' do
129137
expect do
130138
subject.call
@@ -133,12 +141,12 @@ module VCAP::CloudController
133141
}.by(1)
134142
end
135143

136-
it 'scales the old web process down to 0' do
144+
it 'doesnt scale down the old web process (there are no new routable instances yet)' do
137145
expect do
138146
subject.call
139-
end.to change {
147+
end.not_to(change do
140148
web_process.reload.instances
141-
}.to(0)
149+
end)
142150
end
143151
end
144152

@@ -161,12 +169,12 @@ module VCAP::CloudController
161169
}.by(original_web_process_instance_count)
162170
end
163171

164-
it 'scales the old web process down to 0' do
172+
it 'doesnt scale down the old web process (there is no new routable instance yet)' do
165173
expect do
166174
subject.call
167-
end.to change {
175+
end.not_to(change do
168176
web_process.reload.instances
169-
}.to(0)
177+
end)
170178
end
171179
end
172180

@@ -216,6 +224,17 @@ module VCAP::CloudController
216224
context 'when the (oldest) web process will be at zero instances and is type web' do
217225
let(:current_web_instances) { 1 }
218226
let(:current_deploying_instances) { 3 }
227+
let(:original_web_process_instance_count) { 6 }
228+
229+
let!(:interim_deploying_web_process) do
230+
ProcessModel.make(
231+
app: web_process.app,
232+
created_at: an_hour_ago,
233+
type: ProcessTypes::WEB,
234+
instances: 3,
235+
guid: 'guid-interim'
236+
)
237+
end
219238

220239
it 'does not destroy the web process, but scales it to 0' do
221240
subject.call
@@ -266,6 +285,14 @@ module VCAP::CloudController
266285
type: ProcessTypes::WEB
267286
)
268287
end
288+
let!(:other_web_process_with_instances) do
289+
ProcessModel.make(
290+
instances: 10,
291+
app: app,
292+
created_at: 1.hour.ago,
293+
type: ProcessTypes::WEB
294+
)
295+
end
269296

270297
it 'destroys the oldest web process and ignores the original web process' do
271298
expect do
@@ -275,8 +302,134 @@ module VCAP::CloudController
275302
end
276303
end
277304

278-
context 'when one of the deploying_web_process instances is starting' do
305+
context 'when there are more instances in interim processes than there should be' do
306+
let(:current_deploying_instances) { 10 }
307+
let(:original_web_process_instance_count) { 20 }
308+
let(:max_in_flight) { 4 }
309+
310+
let!(:web_process) do
311+
ProcessModel.make(
312+
guid: 'web_process',
313+
instances: 10,
314+
app: app,
315+
created_at: a_day_ago - 11,
316+
type: ProcessTypes::WEB
317+
)
318+
end
319+
let!(:oldest_web_process_with_instances) do
320+
ProcessModel.make(
321+
guid: 'oldest_web_process_with_instances',
322+
instances: 10,
323+
app: app,
324+
created_at: a_day_ago - 10,
325+
type: ProcessTypes::WEB
326+
)
327+
end
328+
let!(:other_web_process_with_instances) do
329+
ProcessModel.make(
330+
instances: 10,
331+
app: app,
332+
created_at: a_day_ago - 9,
333+
type: ProcessTypes::WEB
334+
)
335+
end
336+
337+
it 'scales down interim proceses so all instances equal original instance count + max in flight' do
338+
non_deploying_instance_count = app.web_processes.reject { |p| p.guid == deploying_web_process.guid }.map(&:instances).sum
339+
expect(non_deploying_instance_count).to eq 30
340+
expect(deploying_web_process.instances).to eq 10
341+
342+
subject.call
343+
344+
non_deploying_instance_count = app.reload.web_processes.reject { |p| p.guid == deploying_web_process.guid }.map(&:instances).sum
345+
expect(non_deploying_instance_count).to eq 10
346+
expect(deploying_web_process.reload.instances).to eq 14
347+
end
348+
349+
it 'destroys interim processes that have been scaled down' do
350+
subject.call
351+
expect(ProcessModel.find(guid: web_process.guid)).to be_present
352+
expect(ProcessModel.find(guid: oldest_web_process_with_instances.guid)).to be_nil
353+
expect(ProcessModel.find(guid: other_web_process_with_instances.guid)).to be_present
354+
end
355+
end
356+
357+
context 'when there are fewer instances in interim processes than there should be' do
358+
let(:current_deploying_instances) { 10 }
359+
let(:original_web_process_instance_count) { 20 }
360+
let(:max_in_flight) { 4 }
361+
362+
let!(:web_process) do
363+
ProcessModel.make(
364+
guid: 'web_process',
365+
instances: 1,
366+
app: app,
367+
created_at: a_day_ago - 11,
368+
type: ProcessTypes::WEB
369+
)
370+
end
371+
let!(:oldest_web_process_with_instances) do
372+
ProcessModel.make(
373+
guid: 'oldest_web_process_with_instances',
374+
instances: 1,
375+
app: app,
376+
created_at: a_day_ago - 10,
377+
type: ProcessTypes::WEB
378+
)
379+
end
380+
let!(:other_web_process_with_instances) do
381+
ProcessModel.make(
382+
instances: 1,
383+
app: app,
384+
created_at: a_day_ago - 10,
385+
type: ProcessTypes::WEB
386+
)
387+
end
388+
389+
it 'doesnt try to scale up or down the iterim processes' do
390+
non_deploying_instance_count = app.web_processes.reject { |p| p.guid == deploying_web_process.guid }.map(&:instances).sum
391+
expect(non_deploying_instance_count).to eq 3
392+
expect(deploying_web_process.reload.instances).to eq 10
393+
394+
subject.call
395+
396+
non_deploying_instance_count = app.web_processes.reject { |p| p.guid == deploying_web_process.guid }.map(&:instances).sum
397+
expect(non_deploying_instance_count).to eq 3
398+
expect(deploying_web_process.reload.instances).to eq 14
399+
end
400+
end
401+
402+
context 'when some, but not all, instances have finished' do
403+
let(:current_web_instances) { 10 }
404+
let(:original_web_process_instance_count) { 10 }
405+
406+
let(:current_deploying_instances) { 5 }
407+
let(:max_in_flight) { 5 }
408+
409+
let(:all_instances_results) do
410+
{
411+
0 => { state: 'RUNNING', uptime: 50, since: 2, routable: true },
412+
1 => { state: 'RUNNING', uptime: 50, since: 2, routable: true },
413+
2 => { state: 'STARTING', uptime: 50, since: 2, routable: true },
414+
3 => { state: 'RUNNING', uptime: 50, since: 2, routable: false },
415+
4 => { state: 'FAILING', uptime: 50, since: 2, routable: false }
416+
}
417+
end
418+
419+
it 'scales more instances to match max_in_flight' do
420+
expect(web_process.instances).to eq 10
421+
expect(deploying_web_process.instances).to eq 5
422+
423+
subject.call
424+
425+
expect(web_process.reload.instances).to eq 8
426+
expect(deploying_web_process.reload.instances).to eq 7
427+
end
428+
end
429+
430+
context 'when greater than or equal to max_in_flight of the deploying_web_process instances is starting' do
279431
let(:current_deploying_instances) { 3 }
432+
let(:max_in_flight) { 2 }
280433
let(:all_instances_results) do
281434
{
282435
0 => { state: 'RUNNING', uptime: 50, since: 2, routable: true },
@@ -300,8 +453,9 @@ module VCAP::CloudController
300453
end
301454
end
302455

303-
context 'when one of the deploying_web_process instances is not routable' do
456+
context 'when greater than or equal to max_in_flight of the deploying_web_process instances is not routable' do
304457
let(:current_deploying_instances) { 3 }
458+
let(:max_in_flight) { 2 }
305459
let(:all_instances_results) do
306460
{
307461
0 => { state: 'RUNNING', uptime: 50, since: 2, routable: true },
@@ -325,8 +479,9 @@ module VCAP::CloudController
325479
end
326480
end
327481

328-
context 'when one of the deploying_web_process instances is failing' do
482+
context 'when greater than or equal to max_in_flight of the deploying_web_process instances is failing' do
329483
let(:current_deploying_instances) { 3 }
484+
let(:max_in_flight) { 2 }
330485
let(:all_instances_results) do
331486
{
332487
0 => { state: 'RUNNING', uptime: 50, since: 2, routable: true },

0 commit comments

Comments
 (0)