Skip to content

Commit 00d801b

Browse files
committed
Survive minor redis connection drops
If an exception was raised when trying to acquire the lock, the process would end up in a half-dead state where it would stay "up", but would not attempt to acquire the lock again when the connection came back up.
1 parent 25f2af6 commit 00d801b

File tree

2 files changed

+70
-3
lines changed

2 files changed

+70
-3
lines changed

lib/dynflow/executors/sidekiq/redis_locking.rb

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,7 @@ def release_orchestrator_lock
4141
def wait_for_orchestrator_lock
4242
mode = nil
4343
loop do
44-
active = ::Sidekiq.redis do |conn|
45-
conn.set(REDIS_LOCK_KEY, @world.id, :ex => REDIS_LOCK_TTL, :nx => true)
46-
end
44+
active = try_acquire_orchestrator_lock
4745
break if active
4846
if mode.nil?
4947
mode = :passive
@@ -54,6 +52,15 @@ def wait_for_orchestrator_lock
5452
@logger.info('Acquired orchestrator lock, entering active mode.')
5553
end
5654

55+
def try_acquire_orchestrator_lock
56+
::Sidekiq.redis do |conn|
57+
conn.set(REDIS_LOCK_KEY, @world.id, :ex => REDIS_LOCK_TTL, :nx => true)
58+
end
59+
rescue ::Redis::BaseError => e
60+
@logger.error("Could not acquire orchestrator lock: #{e}")
61+
nil
62+
end
63+
5764
def reacquire_orchestrator_lock
5865
case ::Sidekiq.redis { |conn| conn.eval REACQUIRE_SCRIPT, [REDIS_LOCK_KEY], [@world.id] }
5966
when ACQUIRE_MISSING

test/bats/sidekiq-orchestrator.bats

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,3 +116,63 @@ teardown() {
116116
timeout 30 bundle exec ruby examples/remote_executor.rb client 1
117117
wait_for 1 1 grep -P 'dynflow: ExecutionPlan.*running >>.*stopped' "$(bg_output_file o1)"
118118
}
119+
120+
@test "active orchestrator can survive a brief redis connection drop" {
121+
cd "$(get_project_root)"
122+
123+
run_background 'o1' bundle exec sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
124+
wait_for 30 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o1)"
125+
126+
run_background 'w1' bundle exec sidekiq -r ./examples/remote_executor.rb -q default
127+
wait_for 5 1 grep 'dynflow: Finished performing validity checks' "$(bg_output_file o1)"
128+
129+
stop_redis
130+
wait_for 30 1 grep 'Error connecting to Redis' "$(bg_output_file o1)"
131+
start_redis
132+
133+
timeout 10 bundle exec ruby examples/remote_executor.rb client 1
134+
wait_for 1 1 grep -P 'dynflow: ExecutionPlan.*running >>.*stopped' "$(bg_output_file o1)"
135+
}
136+
137+
@test "active orchestrator can survive a longer redis connection drop" {
138+
cd "$(get_project_root)"
139+
140+
run_background 'o1' bundle exec sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
141+
wait_for 30 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o1)"
142+
143+
run_background 'w1' bundle exec sidekiq -r ./examples/remote_executor.rb -q default
144+
wait_for 5 1 grep 'dynflow: Finished performing validity checks' "$(bg_output_file o1)"
145+
146+
stop_redis 1
147+
wait_for 30 1 grep 'Error connecting to Redis' "$(bg_output_file o1)"
148+
start_redis
149+
150+
wait_for 30 1 grep 'The orchestrator lock was lost, reacquired' "$(bg_output_file o1)"
151+
152+
timeout 10 bundle exec ruby examples/remote_executor.rb client 1
153+
wait_for 1 1 grep -P 'dynflow: ExecutionPlan.*running >>.*stopped' "$(bg_output_file o1)"
154+
}
155+
156+
@test "orchestrators can fail over if active one goes away during downtime" {
157+
cd "$(get_project_root)"
158+
159+
run_background 'o1' bundle exec sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
160+
wait_for 30 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o1)"
161+
162+
run_background 'o2' bundle exec sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
163+
wait_for 30 1 grep 'dynflow: Orchestrator lock already taken, entering passive mode.' "$(bg_output_file o2)"
164+
165+
run_background 'w1' bundle exec sidekiq -r ./examples/remote_executor.rb -q default
166+
wait_for 5 1 grep 'dynflow: Finished performing validity checks' "$(bg_output_file o1)"
167+
168+
stop_redis 1
169+
wait_for 30 1 grep 'Error connecting to Redis' "$(bg_output_file o1)"
170+
kill -15 "$(cat "$TEST_PIDDIR/o1.pid")"
171+
start_redis
172+
173+
wait_for 120 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o2)"
174+
wait_for 120 1 grep 'dynflow: Finished performing validity checks' "$(bg_output_file o2)"
175+
176+
timeout 10 bundle exec ruby examples/remote_executor.rb client 1
177+
wait_for 1 1 grep -P 'dynflow: ExecutionPlan.*running >>.*stopped' "$(bg_output_file o2)"
178+
}

0 commit comments

Comments
 (0)