Skip to content

Commit 58b2c0e

Browse files
authored
Cleanup job for operations stuck in create/initial state (#3742)
* check in short frequency for operations in state 'create'/'initial' where 'updated_at' is older than current time plus service broker connection timeout plus 10% safety margin * set all matches to 'failed' so that users are able to delete the associated entity manually (service instance / service binding / key binding / route service binding) * call orphan mitigation for each affected service
1 parent 337bf71 commit 58b2c0e

File tree

9 files changed

+317
-6
lines changed

9 files changed

+317
-6
lines changed

README.md

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -114,13 +114,15 @@ When the Docker containers have been set up as described above, you can start th
114114
```
115115
Then start a local worker:
116116
```
117-
export CLOUD_CONTROLLER_NG_CONFIG=./tmp/cloud_controller.yml
118-
bundle exec rake jobs:local
117+
CLOUD_CONTROLLER_NG_CONFIG=./tmp/cloud_controller.yml bundle exec rake jobs:local
119118
```
120-
And finally start a delayed_job worker:
119+
Start a delayed_job worker:
121120
```
122-
export CLOUD_CONTROLLER_NG_CONFIG=./tmp/cloud_controller.yml
123-
bundle exec rake jobs:generic
121+
CLOUD_CONTROLLER_NG_CONFIG=./tmp/cloud_controller.yml bundle exec rake jobs:generic
122+
```
123+
And finally start the scheduler:
124+
```
125+
CLOUD_CONTROLLER_NG_CONFIG=./tmp/cloud_controller.yml bundle exec rake clock:start
124126
```
125127

126128
Known limitations:
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
module VCAP::CloudController
2+
module Jobs
3+
module Runtime
4+
class ServiceOperationsInitialCleanup < VCAP::CloudController::Jobs::CCJob
5+
def perform
6+
cleanup_operations(ServiceInstanceOperation, ServiceInstance, :service_instance_id, :cleanup_failed_provision)
7+
cleanup_operations(ServiceBindingOperation, ServiceBinding, :service_binding_id, :cleanup_failed_bind)
8+
cleanup_operations(ServiceKeyOperation, ServiceKey, :service_key_id, :cleanup_failed_key)
9+
cleanup_operations(RouteBindingOperation, RouteBinding, :route_binding_id, nil)
10+
end
11+
12+
def cleanup_operations(operation_model, instance_model, foreign_key, orphan_mitigator_method)
13+
operations_create_initial = operation_model.
14+
where(type: 'create', state: 'initial').
15+
where(updated_at_past_broker_timeout).
16+
select(:id, foreign_key)
17+
18+
return if operations_create_initial.empty?
19+
20+
operations_create_initial.each do |result|
21+
instance = instance_model.first(id: result[foreign_key])
22+
logger.info("#{instance_model.to_s.split('::').last} #{instance[:guid]} is stuck in state 'create'/'initial'. " \
23+
"Setting state to 'failed' and triggering orphan mitigation.")
24+
operation_model.first(id: result[:id]).update(state: 'failed',
25+
description: 'Operation was stuck in "initial" state. Set to "failed" by cleanup job.')
26+
27+
orphan_mitigator.send(orphan_mitigator_method, instance) unless orphan_mitigator_method.nil?
28+
end
29+
end
30+
31+
def updated_at_past_broker_timeout
32+
Sequel.lit(
33+
"updated_at < ? - INTERVAL '?' SECOND",
34+
Sequel::CURRENT_TIMESTAMP,
35+
broker_client_timeout_plus_margin
36+
)
37+
end
38+
39+
def broker_client_timeout_plus_margin
40+
(config.get(:broker_client_timeout_seconds).to_i * 1.1).round
41+
end
42+
43+
def job_name_in_configuration
44+
:service_operations_initial_cleanup
45+
end
46+
47+
def max_attempts
48+
1
49+
end
50+
51+
def config
52+
@config ||= Config.config
53+
end
54+
55+
def orphan_mitigator
56+
@orphan_mitigator ||= VCAP::Services::ServiceBrokers::V2::OrphanMitigator.new
57+
end
58+
59+
def logger
60+
@logger ||= Steno.logger('cc.background.service-operations-initial-cleanup')
61+
end
62+
end
63+
end
64+
end
65+
end

config/cloud_controller.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ failed_jobs:
4747
cutoff_age_in_days: 31
4848
frequency_in_seconds: 144000 #4h
4949

50+
service_operations_initial_cleanup:
51+
frequency_in_seconds: 300
52+
5053
completed_tasks:
5154
cutoff_age_in_days: 31
5255

lib/cloud_controller/clock/scheduler.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ class Scheduler
2222
FREQUENTS = [
2323
{ name: 'pending_droplets', class: Jobs::Runtime::PendingDropletCleanup },
2424
{ name: 'pending_builds', class: Jobs::Runtime::PendingBuildCleanup },
25-
{ name: 'failed_jobs', class: Jobs::Runtime::FailedJobsCleanup }
25+
{ name: 'failed_jobs', class: Jobs::Runtime::FailedJobsCleanup },
26+
{ name: 'service_operations_initial_cleanup', class: Jobs::Runtime::ServiceOperationsInitialCleanup }
2627
].freeze
2728

2829
def initialize(config)

lib/cloud_controller/config_schemas/base/clock_schema.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ class ClockSchema < VCAP::Config
2626
optional(:max_number_of_failed_delayed_jobs) => Integer,
2727
frequency_in_seconds: Integer
2828
},
29+
service_operations_initial_cleanup: {
30+
frequency_in_seconds: Integer
31+
},
2932
completed_tasks: {
3033
cutoff_age_in_days: Integer
3134
},

lib/cloud_controller/jobs.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
require 'jobs/runtime/expired_orphaned_blob_cleanup'
2727
require 'jobs/runtime/expired_resource_cleanup'
2828
require 'jobs/runtime/failed_jobs_cleanup'
29+
require 'jobs/runtime/service_operations_initial_cleanup'
2930
require 'jobs/runtime/legacy_jobs'
3031
require 'jobs/runtime/model_deletion'
3132
require 'jobs/runtime/orphaned_blobs_cleanup'

lib/tasks/jobs.rake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ namespace :jobs do
3434
'app_usage_events',
3535
'audit_events',
3636
'failed_jobs',
37+
'service_operations_initial_cleanup',
3738
'service_usage_events',
3839
'completed_tasks',
3940
'expired_blob_cleanup',
Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
require 'spec_helper'
2+
3+
module VCAP::CloudController
4+
module Jobs::Runtime
5+
RSpec.describe ServiceOperationsInitialCleanup, job_context: :worker do
6+
subject(:job) { ServiceOperationsInitialCleanup.new }
7+
8+
let(:fake_orphan_mitigator) { double(VCAP::Services::ServiceBrokers::V2::OrphanMitigator) }
9+
let(:fake_logger) { instance_double(Steno::Logger, info: nil) }
10+
11+
it { is_expected.to be_a_valid_job }
12+
13+
it 'knows its job name' do
14+
expect(job.job_name_in_configuration).to equal(:service_operations_initial_cleanup)
15+
end
16+
17+
it 'adds 10% to the broker_client_timeout' do
18+
expect(job.broker_client_timeout_plus_margin).to equal(66)
19+
end
20+
21+
def expect_no_orphan_mitigator_calls(orphan_mitigator)
22+
%i[cleanup_failed_provision cleanup_failed_bind cleanup_failed_key].each do |call|
23+
expect(orphan_mitigator).not_to have_received(call)
24+
end
25+
end
26+
27+
describe 'perform' do
28+
let!(:service_instance) { ManagedServiceInstance.make }
29+
let!(:service_binding) { ServiceBinding.make }
30+
let!(:service_key) { ServiceKey.make }
31+
let!(:route_binding) { RouteBinding.make }
32+
33+
before do
34+
allow(Steno).to receive(:logger).and_return(fake_logger)
35+
36+
allow(VCAP::Services::ServiceBrokers::V2::OrphanMitigator).to receive(:new).and_return(fake_orphan_mitigator)
37+
allow(fake_orphan_mitigator).to receive(:cleanup_failed_provision).with(service_instance)
38+
allow(fake_orphan_mitigator).to receive(:cleanup_failed_bind).with(service_binding)
39+
allow(fake_orphan_mitigator).to receive(:cleanup_failed_key).with(service_key)
40+
end
41+
42+
context 'when there are no service instance operations in state create/initial' do
43+
let!(:service_instance_operation) { ServiceInstanceOperation.make(service_instance_id: service_instance.id, type: 'create', state: 'succeeded') }
44+
45+
it 'does nothing' do
46+
job.perform
47+
48+
expect(service_instance_operation.reload.type).to eq('create')
49+
expect(service_instance_operation.reload.state).to eq('succeeded')
50+
expect_no_orphan_mitigator_calls(fake_orphan_mitigator)
51+
end
52+
end
53+
54+
context 'when there is one service instance operation in state create/initial' do
55+
let!(:service_instance_operation) { ServiceInstanceOperation.make(service_instance_id: service_instance.id, type: 'create', state: 'initial') }
56+
57+
context 'and the service broker connection timeout has not yet passed' do
58+
it 'does nothing' do
59+
job.perform
60+
61+
expect(service_instance_operation.reload.type).to eq('create')
62+
expect(service_instance_operation.reload.state).to eq('initial')
63+
expect_no_orphan_mitigator_calls(fake_orphan_mitigator)
64+
end
65+
end
66+
67+
context 'and the service broker connection timeout has passed' do
68+
let(:expired_time) { Time.now.utc - 2.seconds }
69+
70+
before do
71+
TestConfig.override(broker_client_timeout_seconds: 1)
72+
service_instance_operation.this.update(updated_at: expired_time)
73+
end
74+
75+
it 'sets the operation state to create/failed and triggers the orphan mitigation' do
76+
job.perform
77+
78+
expect(service_instance_operation.reload.type).to eq('create')
79+
expect(service_instance_operation.reload.state).to eq('failed')
80+
expect(service_instance_operation.reload.description).to eq('Operation was stuck in "initial" state. Set to "failed" by cleanup job.')
81+
expect(fake_logger).to have_received(:info).with("ServiceInstance #{service_instance[:guid]} is stuck in state 'create'/'initial'. " \
82+
"Setting state to 'failed' and triggering orphan mitigation.")
83+
expect(fake_orphan_mitigator).to have_received(:cleanup_failed_provision).with(service_instance)
84+
end
85+
end
86+
end
87+
88+
context 'when there are no service instance binding operations in state create/initial' do
89+
let!(:service_binding_operation) { ServiceBindingOperation.make(service_binding_id: service_binding.id, type: 'create', state: 'succeeded') }
90+
91+
it 'does nothing' do
92+
job.perform
93+
94+
expect(service_binding_operation.reload.type).to eq('create')
95+
expect(service_binding_operation.reload.state).to eq('succeeded')
96+
expect_no_orphan_mitigator_calls(fake_orphan_mitigator)
97+
end
98+
end
99+
100+
context 'when there is one service instance binding operation in state create/initial' do
101+
let!(:service_binding_operation) { ServiceBindingOperation.make(service_binding_id: service_binding.id, type: 'create', state: 'initial') }
102+
103+
context 'and the service broker connection timeout has not yet passed' do
104+
it 'does nothing' do
105+
job.perform
106+
107+
expect(service_binding_operation.reload.type).to eq('create')
108+
expect(service_binding_operation.reload.state).to eq('initial')
109+
expect_no_orphan_mitigator_calls(fake_orphan_mitigator)
110+
end
111+
end
112+
113+
context 'and the service broker connection timeout has passed' do
114+
let(:expired_time) { Time.now.utc - 2.seconds }
115+
116+
before do
117+
TestConfig.override(broker_client_timeout_seconds: 1)
118+
service_binding_operation.this.update(updated_at: expired_time)
119+
end
120+
121+
it 'sets the operation state to create/failed and triggers the orphan mitigation' do
122+
job.perform
123+
124+
expect(service_binding_operation.reload.type).to eq('create')
125+
expect(service_binding_operation.reload.state).to eq('failed')
126+
expect(service_binding_operation.reload.description).to eq('Operation was stuck in "initial" state. Set to "failed" by cleanup job.')
127+
expect(fake_logger).to have_received(:info).with("ServiceBinding #{service_binding[:guid]} is stuck in state 'create'/'initial'. " \
128+
"Setting state to 'failed' and triggering orphan mitigation.")
129+
expect(fake_orphan_mitigator).to have_received(:cleanup_failed_bind).with(service_binding)
130+
end
131+
end
132+
end
133+
134+
context 'when there are no service key operations in state create/initial' do
135+
let!(:service_key_operation) { ServiceKeyOperation.make(service_key_id: service_key.id, type: 'create', state: 'succeeded') }
136+
137+
it 'does nothing' do
138+
job.perform
139+
140+
expect(service_key_operation.reload.type).to eq('create')
141+
expect(service_key_operation.reload.state).to eq('succeeded')
142+
expect_no_orphan_mitigator_calls(fake_orphan_mitigator)
143+
end
144+
end
145+
146+
context 'when there is one service key operation in state create/initial' do
147+
let!(:service_key_operation) { ServiceKeyOperation.make(service_key_id: service_key.id, type: 'create', state: 'initial') }
148+
149+
context 'and the service broker connection timeout has not yet passed' do
150+
it 'does nothing' do
151+
job.perform
152+
153+
expect(service_key_operation.reload.type).to eq('create')
154+
expect(service_key_operation.reload.state).to eq('initial')
155+
expect_no_orphan_mitigator_calls(fake_orphan_mitigator)
156+
end
157+
end
158+
159+
context 'and the service broker connection timeout has passed' do
160+
let(:expired_time) { Time.now.utc - 2.seconds }
161+
162+
before do
163+
TestConfig.override(broker_client_timeout_seconds: 1)
164+
service_key_operation.this.update(updated_at: expired_time)
165+
end
166+
167+
it 'sets the operation state to create/failed and triggers the orphan mitigation' do
168+
job.perform
169+
170+
expect(service_key_operation.reload.type).to eq('create')
171+
expect(service_key_operation.reload.state).to eq('failed')
172+
expect(service_key_operation.reload.description).to eq('Operation was stuck in "initial" state. Set to "failed" by cleanup job.')
173+
expect(fake_logger).to have_received(:info).with("ServiceKey #{service_key[:guid]} is stuck in state 'create'/'initial'. " \
174+
"Setting state to 'failed' and triggering orphan mitigation.")
175+
expect(fake_orphan_mitigator).to have_received(:cleanup_failed_key).with(service_key)
176+
end
177+
end
178+
end
179+
180+
context 'when there are no route binding operations in state create/initial' do
181+
let!(:route_binding_operation) { RouteBindingOperation.make(route_binding_id: route_binding.id, type: 'create', state: 'succeeded') }
182+
183+
it 'does nothing' do
184+
job.perform
185+
186+
expect(route_binding_operation.reload.type).to eq('create')
187+
expect(route_binding_operation.reload.state).to eq('succeeded')
188+
expect_no_orphan_mitigator_calls(fake_orphan_mitigator)
189+
end
190+
end
191+
192+
context 'when there is one route binding operation in state create/initial' do
193+
let!(:route_binding_operation) { RouteBindingOperation.make(route_binding_id: route_binding.id, type: 'create', state: 'initial') }
194+
195+
context 'and the service broker connection timeout has not yet passed' do
196+
it 'does nothing' do
197+
job.perform
198+
199+
expect(route_binding_operation.reload.type).to eq('create')
200+
expect(route_binding_operation.reload.state).to eq('initial')
201+
expect_no_orphan_mitigator_calls(fake_orphan_mitigator)
202+
end
203+
end
204+
205+
context 'and the service broker connection timeout has passed' do
206+
let(:expired_time) { Time.now.utc - 2.seconds }
207+
208+
before do
209+
TestConfig.override(broker_client_timeout_seconds: 1)
210+
route_binding_operation.this.update(updated_at: expired_time)
211+
end
212+
213+
it 'sets the operation state to create/failed and triggers the orphan mitigation' do
214+
job.perform
215+
216+
expect(route_binding_operation.reload.type).to eq('create')
217+
expect(route_binding_operation.reload.state).to eq('failed')
218+
expect(route_binding_operation.reload.description).to eq('Operation was stuck in "initial" state. Set to "failed" by cleanup job.')
219+
expect(fake_logger).to have_received(:info).with("RouteBinding #{route_binding[:guid]} is stuck in state 'create'/'initial'. " \
220+
"Setting state to 'failed' and triggering orphan mitigation.")
221+
expect_no_orphan_mitigator_calls(fake_orphan_mitigator)
222+
end
223+
end
224+
end
225+
end
226+
end
227+
end
228+
end

spec/unit/lib/cloud_controller/clock/scheduler_spec.rb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ module VCAP::CloudController
1919
app_usage_events: { cutoff_age_in_days: 1 },
2020
audit_events: { cutoff_age_in_days: 3 },
2121
failed_jobs: { frequency_in_seconds: 400, cutoff_age_in_days: 4, max_number_of_failed_delayed_jobs: 10 },
22+
service_operations_initial_cleanup: { frequency_in_seconds: 600 },
2223
service_usage_events: { cutoff_age_in_days: 5 },
2324
completed_tasks: { cutoff_age_in_days: 6 },
2425
pending_droplets: { frequency_in_seconds: 300, expiration_in_seconds: 600 },
@@ -153,6 +154,12 @@ module VCAP::CloudController
153154
expect(block.call).to be_instance_of(Jobs::Runtime::FailedJobsCleanup)
154155
end
155156

157+
expect(clock).to receive(:schedule_frequent_worker_job) do |args, &block|
158+
expect(args).to eql(name: 'service_operations_initial_cleanup', interval: 600)
159+
expect(Jobs::Runtime::ServiceOperationsInitialCleanup).to receive(:new).and_call_original
160+
expect(block.call).to be_instance_of(Jobs::Runtime::ServiceOperationsInitialCleanup)
161+
end
162+
156163
schedule.start
157164
end
158165

0 commit comments

Comments
 (0)