@@ -870,6 +870,8 @@ const LPROC = LocalProcess()
870870const LPROCROLE = Ref {Symbol} (:master )
871871const HDR_VERSION_LEN= 16
872872const HDR_COOKIE_LEN= 16
# Most recent status recorded for each worker via `setstatus`, keyed by worker ID.
# Entries are written and read on process 1 and removed again when the worker is
# deregistered.
873+ const map_pid_statuses = Dict {Int, Any} ()
# Held around every access to `map_pid_statuses`.
874+ const map_pid_statuses_lock = ReentrantLock ()
873875const map_pid_wrkr = Dict {Int, Union{Worker, LocalProcess}} ()
874876const map_sock_wrkr = IdDict ()
875877const map_del_wrkr = Set {Int} ()
@@ -1010,15 +1012,16 @@ for any reason (i.e. not only because of [`rmprocs()`](@ref) but also the worker
10101012segfaulting etc). Chooses and returns a unique key for the callback if `key` is
10111013not specified.
10121014
1013- The callback will be called with the worker ID and the final
1014- `Distributed.WorkerState` of the worker, e.g. `f(w::Int, state)`. `state` is an
1015+ The callback will be called with the worker ID, the final
1016+ `Distributed.WorkerState` of the worker, and the last status of the worker as
1017+ set by [`setstatus`](@ref), e.g. `f(w::Int, state, status)`. `state` is an
10151018enum, a value of `WorkerState_terminated` means a graceful exit and a value of
10161019`WorkerState_exterminated` means the worker died unexpectedly.
10171020
10181021If the callback throws an exception it will be caught and printed.
10191022"""
10201023add_worker_exited_callback (f:: Base.Callable ; key= nothing ) = _add_callback (f, key, worker_exited_callbacks;
1021- arg_types= Tuple{Int, WorkerState})
1024+ arg_types= Tuple{Int, WorkerState, Any })
10221025
10231026"""
10241027 remove_worker_exited_callback(key)
@@ -1206,6 +1209,59 @@ Identical to [`workers()`](@ref) except that the current worker is filtered out.
12061209"""
12071210other_workers () = filter (!= (myid ()), workers ())
12081211
"""
    setstatus(x, pid::Int=myid())

Set the status for worker `pid` to `x`. `x` may be any serializable object but
it's recommended to keep it small enough to cheaply send over a network. The
status will be passed to the worker-exited callbacks (see
[`add_worker_exited_callback`](@ref)) when the worker exits.

Statuses are stored on process 1. When called from any other process the
request is forwarded to process 1 with a blocking remote call, so the status of
`pid` is updated there regardless of which process makes the call.

This can be handy if you want a way to know what a worker is doing at any given
time, or (in combination with a worker-exited callback) for knowing what a
worker was last doing before it died.

Throws an `ArgumentError` if `pid` is not in `procs()`.

# Examples
```julia-repl
julia> DistributedNext.setstatus("working on dataset 42")
"working on dataset 42"

julia> DistributedNext.getstatus()
"working on dataset 42"
```
"""
function setstatus(x, pid::Int=myid())
    if pid ∉ procs()
        throw(ArgumentError(" Worker $(pid) does not exist, cannot set its status"))
    end

    if myid() == 1
        # Process 1 owns the status table; update it under the lock.
        @lock map_pid_statuses_lock map_pid_statuses[pid] = x
    else
        # Forward the *requested* pid. Passing `myid()` here would make
        # `setstatus(x, other_pid)` overwrite the caller's own status instead
        # of the status of `other_pid`.
        remotecall_fetch(setstatus, 1, x, pid)
    end
end
1244+
# Look up the last status recorded for `pid` while holding the status lock,
# yielding `nothing` when no status has been stored. Uses `get` rather than
# `get!` so that a read never inserts an entry into the table.
_getstatus(pid) = @lock map_pid_statuses_lock get(map_pid_statuses, pid, nothing)
1246+
"""
    getstatus(pid::Int=myid())

Return the most recent status recorded for worker `pid`, or `nothing` if no
status was ever stored for it with [`setstatus`](@ref).

Process 1 answers the lookup directly; every other process fetches the answer
from process 1 with a blocking remote call.

Throws an `ArgumentError` if `pid` is not in `procs()`.
"""
function getstatus(pid::Int=myid())
    # Guard clause: reject unknown workers before doing any lookup.
    pid in procs() ||
        throw(ArgumentError(" Worker $(pid) does not exist, cannot get its status"))

    on_master = myid() == 1
    return on_master ? _getstatus(pid) : remotecall_fetch(getstatus, 1, pid)
end
1264+
12091265function cluster_mgmt_from_master_check ()
12101266 if myid () != 1
12111267 throw (ErrorException (" Only process 1 can add and remove workers" ))
@@ -1425,15 +1481,20 @@ function deregister_worker(pg, pid)
14251481 end
14261482 end
14271483
1428- # Call callbacks on the master
14291484 if myid () == 1
1485+ status = _getstatus (pid)
1486+
1487+ # Call callbacks on the master
14301488 for (name, callback) in worker_exited_callbacks
14311489 try
1432- callback (pid, w. state)
1490+ callback (pid, w. state, status )
14331491 catch ex
14341492 @error " Error when running worker-exited callback '$(name) '" exception= (ex, catch_backtrace ())
14351493 end
14361494 end
1495+
1496+ # Delete its status
1497+ @lock map_pid_statuses_lock delete! (map_pid_statuses, pid)
14371498 end
14381499
14391500 return
0 commit comments