|
| 1 | +# frozen_string_literal: true |
| 2 | + |
| 3 | +require "test_helper" |
| 4 | + |
# Integration tests for the lifecycle of a supervisor forked in +:async+ mode.
#
# Each test boots a supervisor configured with two workers (one for the
# +background+ queue, one for the +default+ queue with 5 threads) and no
# dispatchers, enqueues StoreResultJob instances and then checks how jobs and
# registered processes end up after the supervisor is signalled or a job
# misbehaves.
class AsyncProcessesLifecycleTest < ActiveSupport::TestCase
  # NOTE(review): presumably disabled so that records written by the test and by
  # the forked supervisor are visible to each other — confirm.
  self.use_transactional_tests = false

  setup do
    config_as_hash = { workers: [ { queues: :background }, { queues: :default, threads: 5 } ], dispatchers: [] }
    @pid = run_supervisor_as_fork(load_configuration_from: config_as_hash, mode: :async)

    # Presumably the supervisor itself plus its two workers.
    wait_for_registered_processes(3, timeout: 3.seconds)
    assert_registered_workers_for(:background, :default, supervisor_pid: @pid)
  end

  teardown do
    # @pid is still nil if forking the supervisor failed in setup; skip the
    # cleanup in that case so the original setup error is the one reported.
    terminate_process(@pid) if @pid && process_exists?(@pid)
  end

  test "enqueue jobs in multiple queues" do
    6.times { |i| enqueue_store_result_job("job_#{i}") }
    6.times { |i| enqueue_store_result_job("job_#{i}", :default) }

    wait_for_jobs_to_finish_for(2.seconds)

    assert_equal 12, JobResult.count
    6.times { |i| assert_completed_job_results("job_#{i}", :background) }
    6.times { |i| assert_completed_job_results("job_#{i}", :default) }

    terminate_process(@pid)
    assert_clean_termination
  end

  test "kill supervisor while there are jobs in-flight" do
    no_pause = enqueue_store_result_job("no pause")
    enqueue_store_result_job("pause", pause: 0.2.seconds)

    signal_process(@pid, :KILL, wait: 0.15.seconds)
    wait_for_jobs_to_finish_for(2.seconds)
    wait_for_registered_processes(1, timeout: 3.seconds)

    assert_not process_exists?(@pid)

    assert_completed_job_results("no pause")
    assert_job_status(no_pause, :finished)

    # Nothing had the chance to finish orderly, so the process records and the
    # claimed execution of the paused job are all left behind.
    assert_registered_supervisor
    assert_registered_workers_for(:background, :default, supervisor_pid: @pid)
    assert_started_job_result("pause")
    assert_claimed_jobs
  end

  test "term supervisor multiple times" do
    5.times do
      signal_process(@pid, :TERM, wait: 0.1.seconds)
    end

    sleep(1.second)
    assert_clean_termination
  end

  test "quit supervisor while there are jobs in-flight" do
    enqueue_store_result_job("no pause")
    enqueue_store_result_job("pause", pause: 1.second)

    signal_process(@pid, :QUIT, wait: 0.4.seconds)
    wait_for_jobs_to_finish_for(2.seconds)

    wait_while_with_timeout(2.seconds) { process_exists?(@pid) }
    assert_not process_exists?(@pid)

    assert_no_unfinished_jobs
    assert_clean_termination
  end

  test "term supervisor while there are jobs in-flight" do
    no_pause = enqueue_store_result_job("no pause")
    pause = enqueue_store_result_job("pause", pause: 0.2.seconds)

    signal_process(@pid, :TERM, wait: 0.3.seconds)
    wait_for_jobs_to_finish_for(3.seconds)

    assert_completed_job_results("no pause")
    assert_completed_job_results("pause")

    assert_job_status(no_pause, :finished)
    assert_job_status(pause, :finished)

    wait_for_process_termination_with_timeout(@pid, timeout: 1.second)
    assert_clean_termination
  end

  test "int supervisor while there are jobs in-flight" do
    no_pause = enqueue_store_result_job("no pause")
    pause = enqueue_store_result_job("pause", pause: 0.2.seconds)

    signal_process(@pid, :INT, wait: 0.3.seconds)
    wait_for_jobs_to_finish_for(2.seconds)

    assert_completed_job_results("no pause")
    assert_completed_job_results("pause")

    assert_job_status(no_pause, :finished)
    assert_job_status(pause, :finished)

    wait_for_process_termination_with_timeout(@pid, timeout: 1.second)
    assert_clean_termination
  end

  test "term supervisor exceeding timeout while there are jobs in-flight" do
    no_pause = enqueue_store_result_job("no pause")
    # This job pauses for longer than the shutdown timeout, so it cannot finish
    # before the supervisor gives up waiting.
    pause = enqueue_store_result_job("pause", pause: SolidQueue.shutdown_timeout + 10.seconds)

    signal_process(@pid, :TERM, wait: 0.5.seconds)

    sleep(SolidQueue.shutdown_timeout + 0.5.seconds)

    assert_completed_job_results("no pause")
    assert_job_status(no_pause, :finished)

    # This job was left claimed as the worker was shutdown without
    # a chance to terminate orderly
    assert_started_job_result("pause")
    assert_job_status(pause, :claimed)

    # Now wait until the supervisor finishes for real, which will complete the cleanup
    wait_for_process_termination_with_timeout(@pid, timeout: 1.second)
    assert_clean_termination
  end

  test "process some jobs that raise errors" do
    2.times { enqueue_store_result_job("no error", :background) }
    2.times { enqueue_store_result_job("no error", :default) }
    error1 = enqueue_store_result_job("error", :background, exception: RuntimeError)
    enqueue_store_result_job("no error", :background, pause: 0.03)
    error2 = enqueue_store_result_job("error", :background, exception: RuntimeError, pause: 0.05)
    2.times { enqueue_store_result_job("no error", :default, pause: 0.01) }
    error3 = enqueue_store_result_job("error", :default, exception: RuntimeError)
    failing_jobs = [ error1, error2, error3 ]

    wait_for_jobs_to_finish_for(2.seconds, except: failing_jobs)

    assert_completed_job_results("no error", :background, 3)
    assert_completed_job_results("no error", :default, 4)

    # Give the three failed executions a moment to be recorded before checking.
    wait_while_with_timeout(1.second) { SolidQueue::FailedExecution.count < 3 }
    failing_jobs.each do |job|
      assert_job_status(job, :failed)
    end

    terminate_process(@pid)
    assert_clean_termination
  end

  test "process a job that exits" do
    2.times do
      enqueue_store_result_job("no exit", :background)
      enqueue_store_result_job("no exit", :default)
    end
    paused_no_exit = enqueue_store_result_job("paused no exit", :default, pause: 0.5)
    exit_job = enqueue_store_result_job("exit", :background, exit_value: 9, pause: 0.2)
    pause_job = enqueue_store_result_job("exit", :background, pause: 0.3)
    interrupted_jobs = [ exit_job, pause_job, paused_no_exit ]

    2.times { enqueue_store_result_job("no exit", :background) }

    wait_for_jobs_to_finish_for(3.seconds, except: interrupted_jobs)

    assert_completed_job_results("no exit", :default, 2)
    assert_completed_job_results("no exit", :background, 4)

    # Everything exited because of the exit job, leaving all jobs that ran
    # after it claimed
    interrupted_jobs.each do |job|
      assert_job_status(job, :claimed)
    end

    wait_for_process_termination_with_timeout(@pid, exitstatus: 9)
    assert_not process_exists?(@pid)

    # Starting a supervisor again will clean things up
    # Claimed jobs will be marked as failed and processes will be pruned
    # Simulate time passing to expire heartbeats
    SolidQueue::Process.update_all(last_heartbeat_at: 1.hour.ago)
    @pid = run_supervisor_as_fork(mode: :async)
    # NOTE(review): fixed 10 second wait for the new supervisor to finish its
    # cleanup. Polling (e.g. with wait_while_with_timeout) would make this test
    # faster, but must not terminate the supervisor before it has fully booted.
    sleep(10)

    interrupted_jobs.each do |job|
      assert_job_status(job, :failed)
    end

    terminate_process(@pid)
    assert_clean_termination
  end

  private
    # Asserts the supervisor is gone and left nothing behind: no registered
    # processes and no claimed jobs.
    def assert_clean_termination
      wait_for_registered_processes(0, timeout: 0.2.seconds)
      assert_no_registered_processes
      assert_no_claimed_jobs
      assert_not process_exists?(@pid)
    end

    # Asserts that the registered "Worker" processes cover exactly the given
    # queues and, when +supervisor_pid+ is given, that all of them are
    # supervised by the process with that pid.
    def assert_registered_workers_for(*queues, supervisor_pid: nil)
      workers = find_processes_registered_as("Worker")
      registered_queues = workers.map { |process| process.metadata["queues"] }.compact
      assert_equal queues.map(&:to_s).sort, registered_queues.sort

      return unless supervisor_pid

      assert_equal [ supervisor_pid ], workers.map { |process| process.supervisor.pid }.uniq
    end

    # Asserts that exactly one async supervisor is registered and that it is the
    # one forked by this test.
    def assert_registered_supervisor
      processes = find_processes_registered_as("Supervisor(async)")
      assert_equal 1, processes.count
      assert_equal @pid, processes.first.pid
    end

    # Asserts that no "Worker" process is registered.
    def assert_no_registered_workers
      assert_empty find_processes_registered_as("Worker").to_a
    end

    # Enqueues a StoreResultJob on +queue_name+ storing +value+; any extra
    # +options+ (such as +pause:+, +exception:+ or +exit_value:+) are forwarded
    # to the job as keyword arguments. Returns whatever +perform_later+ returns.
    def enqueue_store_result_job(value, queue_name = :background, **options)
      StoreResultJob.set(queue: queue_name).perform_later(value, **options)
    end

    # Asserts that +count+ results with +value+ were stored as "completed" for
    # +queue_name+.
    def assert_completed_job_results(value, queue_name = :background, count = 1)
      assert_job_result_count_with_status("completed", value, queue_name, count)
    end

    # Asserts that +count+ results with +value+ are still stored as "started"
    # for +queue_name+.
    def assert_started_job_result(value, queue_name = :background, count = 1)
      assert_job_result_count_with_status("started", value, queue_name, count)
    end

    # Shared implementation for the JobResult count assertions above. The query
    # cache is skipped so results written by the forked processes are seen.
    def assert_job_result_count_with_status(status, value, queue_name, count)
      skip_active_record_query_cache do
        assert_equal count, JobResult.where(queue_name: queue_name, status: status, value: value).count
      end
    end

    # Asserts that the SolidQueue::Job backing +active_job+ answers true to the
    # "#{status}?" predicate (e.g. +:finished+, +:claimed+, +:failed+).
    def assert_job_status(active_job, status)
      # Make sure we skip AR query cache. Otherwise the queries done here
      # might be cached and since we haven't done any non-SELECT queries
      # after they were cached on the connection used in the test, the cache
      # will still apply, even though the data returned by the cached queries
      # might have been deleted in the forked processes.
      skip_active_record_query_cache do
        job = SolidQueue::Job.find_by(active_job_id: active_job.job_id)
        # Fail with a readable message instead of a NoMethodError on nil.
        assert job, "Expected to find a SolidQueue::Job for Active Job #{active_job.job_id}"
        assert job.public_send("#{status}?"), "Expected job #{active_job.job_id} to be #{status}"
      end
    end
end
0 commit comments