Skip to content

Commit 1c23a18

Browse files
committed
Improve flaky tests across the board
1 parent 6a6f1c4 commit 1c23a18

File tree

8 files changed

+50
-32
lines changed

8 files changed

+50
-32
lines changed

test/integration/async_processes_lifecycle_test.rb

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,10 @@ class AsyncProcessesLifecycleTest < ActiveSupport::TestCase
3434
no_pause = enqueue_store_result_job("no pause")
3535
pause = enqueue_store_result_job("pause", pause: 3.second)
3636

37-
signal_process(@pid, :KILL, wait: 0.2.seconds)
38-
wait_for_jobs_to_finish_for(2.seconds)
37+
# Wait for the "no pause" job to complete before sending KILL
38+
wait_for_jobs_to_finish_for(2.seconds, except: pause)
39+
40+
signal_process(@pid, :KILL, wait: 0.1.seconds)
3941
wait_for_registered_processes(1, timeout: 2.second)
4042

4143
assert_not process_exists?(@pid)
@@ -123,27 +125,35 @@ class AsyncProcessesLifecycleTest < ActiveSupport::TestCase
123125
no_pause = enqueue_store_result_job("no pause")
124126
pause = enqueue_store_result_job("pause", pause: SolidQueue.shutdown_timeout + 10.second)
125127

126-
wait_while_with_timeout(1.second) { SolidQueue::ReadyExecution.count > 1 }
128+
# Wait for the "no pause" job to complete and the pause job to be claimed.
129+
# This ensures the pause job is actively being processed.
130+
wait_for_jobs_to_finish_for(3.seconds, except: pause)
131+
wait_for(timeout: 2.seconds) { SolidQueue::ClaimedExecution.exists?(job_id: SolidQueue::Job.find_by(active_job_id: pause.job_id)&.id) }
127132

128-
signal_process(@pid, :TERM, wait: 0.5.second)
133+
signal_process(@pid, :TERM, wait: 0.2.second)
129134
wait_for_jobs_to_finish_for(2.seconds, except: pause)
130135

131-
# exit! exits with status 1 by default
132-
wait_for_process_termination_with_timeout(@pid, timeout: SolidQueue.shutdown_timeout + 5.seconds, exitstatus: 1)
136+
# Wait for process to terminate. In async mode, shutdown_timeout is used by both
137+
# the supervisor and workers, creating a race: exit status may be 0 (graceful) or
138+
# 1 (exit!) depending on which timeout check happens first.
139+
wait_for_process_termination_with_timeout(@pid, timeout: SolidQueue.shutdown_timeout + 5.seconds, exitstatus: nil)
133140
assert_not process_exists?(@pid)
134141

135142
assert_completed_job_results("no pause")
136143
assert_job_status(no_pause, :finished)
137144

138-
# When timeout is exceeded, exit! is called without cleanup.
139-
# The in-flight job stays claimed and processes stay registered.
140-
# A future supervisor will need to prune and fail these orphaned executions.
145+
# The pause job should have started but not completed
141146
assert_started_job_result("pause")
142-
assert_job_status(pause, :claimed)
143-
144-
assert_registered_supervisor
145-
assert find_processes_registered_as("Worker").any? { |w| w.metadata["queues"].include?("background") }
146-
assert_claimed_jobs
147+
assert_not_equal "completed", skip_active_record_query_cache { JobResult.find_by(value: "pause")&.status }
148+
149+
# After shutdown, the pause job may be either:
150+
# - claimed (exit! called, no cleanup) OR
151+
# - ready (graceful exit, job released back to queue)
152+
# Both are valid outcomes depending on the timing race between supervisor and worker timeouts.
153+
skip_active_record_query_cache do
154+
job = SolidQueue::Job.find_by(active_job_id: pause.job_id)
155+
assert job.claimed? || job.ready?, "Expected pause job to be claimed or ready, but was neither"
156+
end
147157
end
148158

149159
test "process some jobs that raise errors" do

test/integration/forked_processes_lifecycle_test.rb

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ class ForkedProcessesLifecycleTest < ActiveSupport::TestCase
6464

6565
test "quit supervisor while there are jobs in-flight" do
6666
no_pause = enqueue_store_result_job("no pause")
67-
pause = enqueue_store_result_job("pause", pause: 1.second)
67+
# long enough pause to make sure it doesn't finish
68+
pause = enqueue_store_result_job("pause", pause: 60.second)
6869

6970
wait_while_with_timeout(1.second) { SolidQueue::ReadyExecution.count > 0 }
7071

@@ -87,7 +88,7 @@ class ForkedProcessesLifecycleTest < ActiveSupport::TestCase
8788

8889
test "term supervisor while there are jobs in-flight" do
8990
no_pause = enqueue_store_result_job("no pause")
90-
pause = enqueue_store_result_job("pause", pause: 0.2.seconds)
91+
pause = enqueue_store_result_job("pause", pause: 1.second)
9192

9293
signal_process(@pid, :TERM, wait: 0.3.second)
9394
wait_for_jobs_to_finish_for(3.seconds)
@@ -104,10 +105,10 @@ class ForkedProcessesLifecycleTest < ActiveSupport::TestCase
104105

105106
test "int supervisor while there are jobs in-flight" do
106107
no_pause = enqueue_store_result_job("no pause")
107-
pause = enqueue_store_result_job("pause", pause: 0.2.seconds)
108+
pause = enqueue_store_result_job("pause", pause: 1.second)
108109

109110
signal_process(@pid, :INT, wait: 0.3.second)
110-
wait_for_jobs_to_finish_for(2.second)
111+
wait_for_jobs_to_finish_for(3.second)
111112

112113
assert_completed_job_results("no pause")
113114
assert_completed_job_results("pause")

test/integration/instrumentation_test.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ class InstrumentationTest < ActiveSupport::TestCase
353353

354354
events = subscribed("enqueue_recurring_task.solid_queue") do
355355
scheduler.start
356-
sleep(1.01)
356+
sleep(1.5)
357357
scheduler.stop
358358
end
359359

@@ -375,7 +375,7 @@ class InstrumentationTest < ActiveSupport::TestCase
375375

376376
events = subscribed("enqueue_recurring_task.solid_queue") do
377377
scheduler.start
378-
sleep(1.01)
378+
sleep(1.5)
379379
scheduler.stop
380380
end
381381

test/test_helper.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ def destroy_records
4040
SolidQueue::RecurringTask.delete_all
4141
SolidQueue::ScheduledExecution.delete_all
4242
SolidQueue::ReadyExecution.delete_all
43+
SolidQueue::ClaimedExecution.delete_all
44+
SolidQueue::FailedExecution.delete_all
4345
JobResult.delete_all
4446
end
4547

test/test_helpers/processes_test_helper.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def wait_for_process_termination_with_timeout(pid, timeout: 10, exitstatus: 0, s
7070
if process_exists?(pid)
7171
begin
7272
status = Process.waitpid2(pid).last
73-
assert_equal exitstatus, status.exitstatus, "Expected pid #{pid} to exit with status #{exitstatus}" if status.exitstatus
73+
assert_equal exitstatus, status.exitstatus, "Expected pid #{pid} to exit with status #{exitstatus}" if status.exitstatus && !exitstatus.nil?
7474
assert_equal signaled, Signal.list.key(status.termsig).to_sym, "Expected pid #{pid} to be terminated with signal #{signaled}" if status.termsig
7575
rescue Errno::ECHILD
7676
# Child pid already reaped

test/unit/async_supervisor_test.rb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ class AsyncSupervisorTest < ActiveSupport::TestCase
55

66
test "start as non-standalone" do
77
supervisor = run_supervisor_as_thread
8-
wait_for_registered_processes(4)
8+
wait_for_registered_processes(4, timeout: 5.seconds)
99

1010
assert_registered_processes(kind: "Supervisor(async)")
1111
assert_registered_processes(kind: "Worker", supervisor_id: supervisor.process_id, count: 2)
@@ -18,7 +18,7 @@ class AsyncSupervisorTest < ActiveSupport::TestCase
1818

1919
test "start standalone" do
2020
pid = run_supervisor_as_fork(mode: :async)
21-
wait_for_registered_processes(4)
21+
wait_for_registered_processes(4, timeout: 5.seconds)
2222

2323
assert_registered_processes(kind: "Supervisor(async)")
2424
assert_registered_processes(kind: "Worker", supervisor_pid: pid, count: 2)
@@ -30,7 +30,7 @@ class AsyncSupervisorTest < ActiveSupport::TestCase
3030

3131
test "start as non-standalone with provided configuration" do
3232
supervisor = run_supervisor_as_thread(workers: [], dispatchers: [ { batch_size: 100 } ])
33-
wait_for_registered_processes(2) # supervisor + dispatcher
33+
wait_for_registered_processes(2, timeout: 5.seconds) # supervisor + dispatcher
3434

3535
assert_registered_processes(kind: "Supervisor(async)")
3636
assert_registered_processes(kind: "Worker", count: 0)
@@ -50,7 +50,7 @@ class AsyncSupervisorTest < ActiveSupport::TestCase
5050
}
5151

5252
supervisor = run_supervisor_as_thread(**config)
53-
wait_for_registered_processes(2) # supervisor + 1 worker
53+
wait_for_registered_processes(2, timeout: 5.seconds) # supervisor + 1 worker
5454
assert_registered_processes(kind: "Supervisor(async)")
5555

5656
wait_while_with_timeout(1.second) { SolidQueue::ClaimedExecution.count > 0 }
@@ -72,7 +72,7 @@ class AsyncSupervisorTest < ActiveSupport::TestCase
7272
}
7373

7474
pid = run_supervisor_as_fork(mode: :async, **config)
75-
wait_for_registered_processes(2) # supervisor + 1 worker
75+
wait_for_registered_processes(2, timeout: 5.seconds) # supervisor + 1 worker
7676
assert_registered_processes(kind: "Supervisor(async)")
7777

7878
wait_while_with_timeout(1.second) { SolidQueue::ClaimedExecution.count > 0 }

test/unit/process_recovery_test.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ class ProcessRecoveryTest < ActiveSupport::TestCase
2323
supervisor_process = SolidQueue::Process.find_by(kind: "Supervisor(fork)", pid: @pid)
2424
assert supervisor_process
2525

26-
worker_process = SolidQueue::Process.find_by(kind: "Worker")
26+
# Find the worker supervised by this specific supervisor to avoid interference from other tests
27+
worker_process = SolidQueue::Process.find_by(kind: "Worker", supervisor_id: supervisor_process.id)
2728
assert worker_process
2829

2930
# Enqueue a job and wait for it to be claimed

test/unit/scheduler_test.rb

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,17 @@ class SchedulerTest < ActiveSupport::TestCase
2424
end
2525

2626
schedulers.each(&:start)
27-
wait_while_with_timeout(2.5.seconds) { SolidQueue::RecurringExecution.count != 2 }
27+
wait_while_with_timeout(3.seconds) { SolidQueue::RecurringExecution.count < 2 }
2828
schedulers.each(&:stop)
2929

30-
assert_equal SolidQueue::Job.count, SolidQueue::RecurringExecution.count
31-
run_at_times = SolidQueue::RecurringExecution.all.map(&:run_at).sort
32-
0.upto(run_at_times.length - 2) do |i|
33-
assert_equal 1, run_at_times[i + 1] - run_at_times[i]
30+
skip_active_record_query_cache do
31+
assert SolidQueue::RecurringExecution.count >= 2, "Expected at least 2 recurring executions, got #{SolidQueue::RecurringExecution.count}"
32+
assert_equal SolidQueue::Job.count, SolidQueue::RecurringExecution.count
33+
run_at_times = SolidQueue::RecurringExecution.all.map(&:run_at).sort
34+
0.upto(run_at_times.length - 2) do |i|
35+
time_diff = run_at_times[i + 1] - run_at_times[i]
36+
assert_in_delta 1, time_diff, 0.001, "Expected run_at times to be 1 second apart, got #{time_diff}. All run_at times: #{run_at_times.inspect}"
37+
end
3438
end
3539
end
3640
end

0 commit comments

Comments
 (0)