Skip to content

Commit 6a46d33

Browse files
authored
Merge pull request #188 from Shopify/cbruckmayer/worker-crash
Exit early if all workers die
2 parents ff854dd + dae8d9b commit 6a46d33

File tree

5 files changed

+83
-7
lines changed

5 files changed

+83
-7
lines changed

ruby/lib/ci/queue/configuration.rb

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ class Configuration
88
attr_accessor :max_test_failed, :redis_ttl
99
attr_reader :circuit_breakers
1010
attr_writer :seed, :build_id
11-
attr_writer :queue_init_timeout
11+
attr_writer :queue_init_timeout, :report_timeout, :inactive_workers_timeout
1212

1313
class << self
1414
def from_env(env)
@@ -35,7 +35,7 @@ def initialize(
3535
namespace: nil, seed: nil, flaky_tests: [], statsd_endpoint: nil, max_consecutive_failures: nil,
3636
grind_count: nil, max_duration: nil, failure_file: nil, max_test_duration: nil,
3737
max_test_duration_percentile: 0.5, track_test_duration: false, max_test_failed: nil,
38-
queue_init_timeout: nil, redis_ttl: 8 * 60 * 60
38+
queue_init_timeout: nil, redis_ttl: 8 * 60 * 60, report_timeout: nil, inactive_workers_timeout: nil
3939
)
4040
@build_id = build_id
4141
@circuit_breakers = [CircuitBreaker::Disabled]
@@ -57,12 +57,22 @@ def initialize(
5757
self.max_consecutive_failures = max_consecutive_failures
5858
self.max_duration = max_duration
5959
@redis_ttl = redis_ttl
60+
@report_timeout = report_timeout
61+
@inactive_workers_timeout = inactive_workers_timeout
6062
end
6163

6264
def queue_init_timeout
6365
@queue_init_timeout || timeout
6466
end
6567

68+
def report_timeout
69+
@report_timeout || timeout
70+
end
71+
72+
def inactive_workers_timeout
73+
@inactive_workers_timeout || timeout
74+
end
75+
6676
def max_consecutive_failures=(max)
6777
if max
6878
@circuit_breakers << CircuitBreaker::MaxConsecutiveFailures.new(max_consecutive_failures: max)

ruby/lib/ci/queue/redis/supervisor.rb

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,17 +21,33 @@ def wait_for_workers
2121

2222
yield if block_given?
2323

24-
time_left = config.timeout
25-
until exhausted? || time_left <= 0 || max_test_failed?
26-
sleep 1
24+
time_left = config.report_timeout
25+
time_left_with_no_workers = config.inactive_workers_timeout
26+
until exhausted? || time_left <= 0 || max_test_failed? || time_left_with_no_workers <= 0
2727
time_left -= 1
28+
sleep 1
29+
30+
if active_workers?
31+
time_left_with_no_workers = config.inactive_workers_timeout
32+
else
33+
time_left_with_no_workers -= 1
34+
end
2835

2936
yield if block_given?
3037
end
38+
39+
puts "Aborting, it seems all workers died." if time_left_with_no_workers <= 0
3140
exhausted?
3241
rescue CI::Queue::Redis::LostMaster
3342
false
3443
end
44+
45+
private
46+
47+
def active_workers?
48+
# if there are running jobs we assume there are still agents active
49+
redis.zrangebyscore(key('running'), Time.now.to_f - config.timeout, "+inf", limit: [0,1]).count > 0
50+
end
3551
end
3652
end
3753
end

ruby/lib/minitest/queue/runner.rb

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,24 @@ def parser
370370
queue_config.timeout = timeout
371371
end
372372

373+
help = <<~EOS
374+
Specify a timeout after which the report command will fail if not all tests have been processed.
375+
Defaults to the value set for --timeout.
376+
EOS
377+
opts.separator ""
378+
opts.on('--report-timeout TIMEOUT', Float, help) do |timeout|
379+
queue_config.report_timeout = timeout
380+
end
381+
382+
help = <<~EOS
383+
Specify a timeout after the report will fail if all workers are inactive (e.g. died).
384+
Defaults to the value set for --timeout.
385+
EOS
386+
opts.separator ""
387+
opts.on('--inactive-workers-timeout TIMEOUT', Float, help) do |timeout|
388+
queue_config.inactive_workers_timeout = timeout
389+
end
390+
373391
help = <<~EOS
374392
Specify a timeout to elect the leader and populate the queue.
375393
Defaults to the value set for --timeout.

ruby/test/ci/queue/configuration_test.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,5 +112,21 @@ def test_queue_init_timeout_set
112112

113113
assert_equal 45, config.queue_init_timeout
114114
end
115+
116+
def test_report_timeout_unset_timeout_set
117+
config = Configuration.from_env({})
118+
config.timeout = 120
119+
120+
assert_equal config.timeout, config.report_timeout
121+
end
122+
123+
def test_report_timeout_set
124+
config = Configuration.from_env({})
125+
config.report_timeout = 45
126+
config.timeout = 120
127+
128+
assert_equal 45, config.report_timeout
129+
end
130+
115131
end
116132
end

ruby/test/ci/queue/redis_supervisor_test.rb

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,18 @@ def test_wait_for_workers
3939
assert_equal true, workers_done
4040
end
4141

42+
def test_wait_for_workers_timeout
43+
@supervisor = supervisor(timeout: 10, queue_init_timeout: 0.1)
44+
io = nil
45+
thread = Thread.start do
46+
io = capture_io { @supervisor.wait_for_workers }
47+
end
48+
thread.wakeup
49+
worker(1)
50+
thread.join
51+
assert_includes io, "Aborting, it seems all workers died.\n"
52+
end
53+
4254
def test_num_workers
4355
assert_equal 0, @supervisor.workers_count
4456
worker(1)
@@ -58,10 +70,14 @@ def worker(id)
5870
).populate(SharedQueueAssertions::TEST_LIST)
5971
end
6072

61-
def supervisor
73+
def supervisor(timeout: 30, queue_init_timeout: nil)
6274
CI::Queue::Redis::Supervisor.new(
6375
@redis_url,
64-
CI::Queue::Configuration.new(build_id: '42'),
76+
CI::Queue::Configuration.new(
77+
build_id: '42',
78+
timeout: timeout,
79+
queue_init_timeout: queue_init_timeout
80+
),
6581
)
6682
end
6783
end

0 commit comments

Comments
 (0)