Skip to content

Commit 34c7f91

Browse files
IanButterworth authored and JamesWrigley committed
sig profile hanging workers before SIGKILL
1 parent e4e054d commit 34c7f91

File tree

2 files changed

+12
-4
lines changed

2 files changed

+12
-4
lines changed

src/managers.jl

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ addprocs([
126126
127127
* `exeflags`: additional flags passed to the worker processes. It can either be a `Cmd`, a `String`
128128
holding one flag, or a collection of strings, with one element per flag.
129-
E.g. `\`--threads=auto project=.\``, `"--compile-trace=stderr"` or `["--threads=auto", "--compile=all"]`.
129+
E.g. `\`--threads=auto --project=.\``, `"--compile-trace=stderr"` or `["--threads=auto", "--compile=all"]`.
130130
131131
* `topology`: Specifies how the workers connect to each other. Sending a message between
132132
unconnected workers results in an error.
@@ -767,7 +767,8 @@ function kill(manager::SSHManager, pid::Int, config::WorkerConfig)
767767
nothing
768768
end
769769

770-
function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15)
770+
function kill(manager::LocalManager, pid::Int, config::WorkerConfig; profile_wait = 6, exit_timeout = 15, term_timeout = 15)
771+
# profile_wait = 6 allows 1s for the profile to be collected plus 5s for the report to be printed
771772
# First, try sending `exit()` to the remote over the usual control channels
772773
remote_do(exit, pid)
773774

@@ -776,7 +777,14 @@ function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeou
776777

777778
# Check to see if our child exited, and if not, send an actual kill signal
778779
if !process_exited(config.process)
779-
@warn("Failed to gracefully kill worker $(pid), sending SIGQUIT")
780+
@warn "Failed to gracefully kill worker $(pid)"
781+
profile_sig = Sys.iswindows() ? nothing : Sys.isbsd() ? ("SIGINFO", 29) : ("SIGUSR1" , 10)
782+
if profile_sig !== nothing
783+
@warn("Sending profile $(profile_sig[1]) to worker $(pid)")
784+
kill(config.process, profile_sig[2])
785+
sleep(profile_wait)
786+
end
787+
@warn("Sending SIGQUIT to worker $(pid)")
780788
kill(config.process, Base.SIGQUIT)
781789

782790
sleep(term_timeout)

test/distributed_exec.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1934,7 +1934,7 @@ include("splitrange.jl")
19341934

19351935
# Next, ensure we get a log message when a worker does not cleanly exit
19361936
w = only(addprocs(1))
1937-
@test_logs (:warn, r"sending SIGQUIT") begin
1937+
@test_logs (:warn, r"Sending SIGQUIT") match_mode=:any begin
19381938
remote_do(w) do
19391939
# Cause the 'exit()' message that `rmprocs()` sends to do nothing
19401940
Core.eval(Base, :(exit() = nothing))

0 commit comments

Comments
 (0)