Skip to content

Commit 497b4a8

Browse files
committed
[Distributed] kill(::LocalManager, ...) should actually call kill()
When removing a local worker process, we can try a little harder than simply calling `remote_do(exit, id)`: we can actually `kill()` the process by sending `SIGTERM`, then `SIGKILL`. Because we use `Distributed` to run our Base test workers, this provides a more reliable way of closing our workers at the end of test sets, as well as a better way of killing processes such that they dump core in the event that they do get stuck.
1 parent abc1537 commit 497b4a8

File tree

2 files changed

+43
-0
lines changed

2 files changed

+43
-0
lines changed

src/managers.jl

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -725,3 +725,26 @@ function kill(manager::SSHManager, pid::Int, config::WorkerConfig)
725725
cancel_ssh_tunnel(config)
726726
nothing
727727
end
728+
729+
"""
    kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15)

Shut down the local worker `pid`, escalating to OS signals if it does not exit.

The worker is first asked to run `exit()` through `remote_do`. A background task
then sleeps for `exit_timeout` seconds; if `config.process` has not exited by
then, it logs a warning and sends `SIGTERM`. After a further `term_timeout`
seconds, a worker that is still alive gets a second warning and `SIGKILL`.

The escalation runs asynchronously, so this method returns `nothing` right away
without waiting for either timeout.
"""
function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15)
    # Polite request first: ask the worker to `exit()` over the usual control channel.
    remote_do(exit, pid)

    # Re-reads `config.process` on every call, so each check sees the current state.
    still_running() = !process_exited(config.process)

    # Escalate in the background; `errormonitor` reports any failure of that task.
    errormonitor(@async begin
        sleep(exit_timeout)

        # Grace period is over: if the child is still there, signal it directly.
        if still_running()
            @warn("Failed to gracefully kill worker $(pid), sending SIGTERM")
            kill(config.process, Base.SIGTERM)

            # Give SIGTERM a chance to work before resorting to SIGKILL.
            sleep(term_timeout)
            if still_running()
                @warn("Worker $(pid) ignored SIGTERM, sending SIGKILL")
                kill(config.process, Base.SIGKILL)
            end
        end
    end)

    return nothing
end

test/distributed_exec.jl

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1856,6 +1856,26 @@ end end
18561856

18571857
include("splitrange.jl")
18581858

1859+
# Start from a clean slate for the worker-shutdown timeout tests (issue #45785)
rmprocs(workers())
begin
    # A cooperative worker must shut down without printing any warning
    w = only(addprocs(1))
    @test_nowarn wait(rmprocs([w]))

    # A worker that does not exit cleanly must produce the SIGTERM warning
    w = only(addprocs(1))
    @test_logs (:warn, r"sending SIGTERM") begin
        # Redefine `exit()` on the worker so the request sent by `rmprocs()` does nothing
        remote_do(() -> Core.eval(Base, :(exit() = nothing)), w)
        wait(rmprocs([w]))
    end
end
1878+
18591879
# Run topology tests last after removing all workers, since a given
18601880
# cluster at any time only supports a single topology.
18611881
rmprocs(workers())

0 commit comments

Comments
 (0)