Skip to content

Commit c5e9e01

Browse files
committed
Survive minor redis connection drops
If an exception was raised when trying to acquire the lock, the process would end up in a half-dead state where it would stay "up", but would not attempt to acquire the lock again when the connection came back up.
1 parent d304336 commit c5e9e01

File tree

2 files changed

+70
-3
lines changed

2 files changed

+70
-3
lines changed

lib/dynflow/executors/sidekiq/redis_locking.rb

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,7 @@ def release_orchestrator_lock
4141
def wait_for_orchestrator_lock
4242
mode = nil
4343
loop do
44-
active = ::Sidekiq.redis do |conn|
45-
conn.set(REDIS_LOCK_KEY, @world.id, :ex => REDIS_LOCK_TTL, :nx => true)
46-
end
44+
active = try_acquire_orchestrator_lock
4745
break if active
4846
if mode.nil?
4947
mode = :passive
@@ -54,6 +52,15 @@ def wait_for_orchestrator_lock
5452
@logger.info('Acquired orchestrator lock, entering active mode.')
5553
end
5654

55+
def try_acquire_orchestrator_lock
56+
::Sidekiq.redis do |conn|
57+
conn.set(REDIS_LOCK_KEY, @world.id, :ex => REDIS_LOCK_TTL, :nx => true)
58+
end
59+
rescue ::Redis::BaseError => e
60+
@logger.error("Could not acquire orchestrator lock: #{e}")
61+
nil
62+
end
63+
5764
def reacquire_orchestrator_lock
5865
case ::Sidekiq.redis { |conn| conn.eval REACQUIRE_SCRIPT, [REDIS_LOCK_KEY], [@world.id] }
5966
when ACQUIRE_MISSING

test/bats/sidekiq-orchestrator.bats

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,3 +85,63 @@ teardown() {
8585
kill -15 "$(cat "$TEST_PIDDIR/o1.pid")"
8686
wait_for 120 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o2)"
8787
}
88+
89+
@test "active orchestrator can survive a brief redis connection drop" {
90+
cd "$(get_project_root)"
91+
92+
run_background 'o1' bundle exec sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
93+
wait_for 30 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o1)"
94+
95+
run_background 'w1' bundle exec sidekiq -r ./examples/remote_executor.rb -q default
96+
wait_for 5 1 grep 'dynflow: Finished performing validity checks' "$(bg_output_file o1)"
97+
98+
stop_redis
99+
wait_for 30 1 grep 'Error connecting to Redis' "$(bg_output_file o1)"
100+
start_redis
101+
102+
timeout 10 bundle exec ruby examples/remote_executor.rb client 1
103+
wait_for 1 1 grep -P 'dynflow: ExecutionPlan.*running >>.*stopped' "$(bg_output_file o1)"
104+
}
105+
106+
@test "active orchestrator can survive a longer redis connection drop" {
107+
cd "$(get_project_root)"
108+
109+
run_background 'o1' bundle exec sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
110+
wait_for 30 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o1)"
111+
112+
run_background 'w1' bundle exec sidekiq -r ./examples/remote_executor.rb -q default
113+
wait_for 5 1 grep 'dynflow: Finished performing validity checks' "$(bg_output_file o1)"
114+
115+
stop_redis 1
116+
wait_for 30 1 grep 'Error connecting to Redis' "$(bg_output_file o1)"
117+
start_redis
118+
119+
wait_for 30 1 grep 'The orchestrator lock was lost, reacquired' "$(bg_output_file o1)"
120+
121+
timeout 10 bundle exec ruby examples/remote_executor.rb client 1
122+
wait_for 1 1 grep -P 'dynflow: ExecutionPlan.*running >>.*stopped' "$(bg_output_file o1)"
123+
}
124+
125+
@test "orchestrators can fail over if active one goes away during downtime" {
126+
cd "$(get_project_root)"
127+
128+
run_background 'o1' bundle exec sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
129+
wait_for 30 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o1)"
130+
131+
run_background 'o2' bundle exec sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
132+
wait_for 30 1 grep 'dynflow: Orchestrator lock already taken, entering passive mode.' "$(bg_output_file o2)"
133+
134+
run_background 'w1' bundle exec sidekiq -r ./examples/remote_executor.rb -q default
135+
wait_for 5 1 grep 'dynflow: Finished performing validity checks' "$(bg_output_file o1)"
136+
137+
stop_redis 1
138+
wait_for 30 1 grep 'Error connecting to Redis' "$(bg_output_file o1)"
139+
kill -15 "$(cat "$TEST_PIDDIR/o1.pid")"
140+
start_redis
141+
142+
wait_for 120 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o2)"
143+
wait_for 120 1 grep 'dynflow: Finished performing validity checks' "$(bg_output_file o2)"
144+
145+
timeout 10 bundle exec ruby examples/remote_executor.rb client 1
146+
wait_for 1 1 grep -P 'dynflow: ExecutionPlan.*running >>.*stopped' "$(bg_output_file o2)"
147+
}

0 commit comments

Comments
 (0)