@@ -457,22 +457,34 @@ end
457457```
458458"""
function addprocs(manager::ClusterManager; kwargs...)
    # Resolve the effective parameters up front: the callback handling below
    # needs `:callback_warning_interval` in addition to the launch parameters.
    params = merge(default_addprocs_params(manager), Dict{Symbol, Any}(kwargs))

    init_multi()

    cluster_mgmt_from_master_check()

    new_workers = @lock worker_lock addprocs_locked(manager, params)

    # Spawn one task per (callback, worker) pair. The key must include the
    # worker: keying by callback name alone would overwrite the tasks of all but
    # the last worker, so they would never be waited on and their exceptions
    # would be lost.
    callback_tasks = Dict{Any, Task}()
    for worker in new_workers
        for (name, callback) in worker_added_callbacks
            callback_tasks[(name, worker)] = Threads.@spawn callback(worker)
        end
    end

    # Names of the callbacks that still have at least one unfinished task
    # (de-duplicated, since a callback runs once per new worker).
    running_callbacks = () -> unique!(
        ["'$(name)'" for ((name, _), task) in callback_tasks if !istaskdone(task)]
    )

    # Periodically warn about slow callbacks until all of them have finished
    while timedwait(() -> isempty(running_callbacks()),
                    params[:callback_warning_interval]) === :timed_out
        callbacks_str = join(running_callbacks(), ", ")
        @warn "Waiting for these worker-added callbacks to finish: $(callbacks_str)"
    end

    # Wait on the tasks so that exceptions bubble up
    foreach(wait, values(callback_tasks))

    return new_workers
end
473486
474- function addprocs_locked (manager:: ClusterManager ; kwargs... )
475- params = merge (default_addprocs_params (manager), Dict {Symbol,Any} (kwargs))
487+ function addprocs_locked (manager:: ClusterManager , params)
476488 topology (Symbol (params[:topology ]))
477489
478490 if PGRP. topology != = :all_to_all
@@ -559,7 +571,8 @@ default_addprocs_params() = Dict{Symbol,Any}(
559571 :exeflags => ` ` ,
560572 :env => [],
561573 :enable_threaded_blas => false ,
562- :lazy => true )
574+ :lazy => true ,
575+ :callback_warning_interval => 10 )
563576
564577
565578function setup_launched_worker (manager, wconfig, launched_q)
872885function _add_callback (f, key, dict)
873886 if ! hasmethod (f, Tuple{Int})
874887 throw (ArgumentError (" Callback function is invalid, it must be able to accept a single Int argument" ))
888+ elseif haskey (dict, key)
889+ throw (ArgumentError (" A callback function with key '$(key) ' already exists" ))
875890 end
876891
877892 if isnothing (key)
@@ -889,14 +904,23 @@ _remove_callback(key, dict) = delete!(dict, key)
889904
890905Register a callback to be called on the master process whenever a worker is
891906added. The callback will be called with the added worker ID,
892- e.g. `f(w::Int)`. Returns a unique key for the callback.
907+ e.g. `f(w::Int)`. Chooses and returns a unique key for the callback if `key` is
908+ not specified.
909+
910+ The worker-added callbacks will be executed concurrently. If one throws an
911+ exception it will not be caught and will bubble up through [`addprocs()`](@ref).
912+
913+ Keep in mind that the callbacks will add to the time taken to launch workers; so
914+ try to either keep the callbacks fast to execute, or do the actual
915+ initialization asynchronously by spawning a task in the callback (beware of race
916+ conditions if you do this).
893917"""
function add_worker_added_callback(f::Base.Callable; key=nothing)
    # Validation and key selection are shared with the other callback kinds
    return _add_callback(f, key, worker_added_callbacks)
end
895919
"""
    remove_worker_added_callback(key)

Remove the callback registered under `key` with
[`add_worker_added_callback()`](@ref).
"""
function remove_worker_added_callback(key)
    return _remove_callback(key, worker_added_callbacks)
end
902926
@@ -905,18 +929,19 @@ remove_worker_added_callback(key) = _remove_callback(key, worker_added_callbacks
905929
906930Register a callback to be called on the master process immediately before a
907931worker is removed with [`rmprocs()`](@ref). The callback will be called with the
908- worker ID, e.g. `f(w::Int)`. Returns a unique key for the callback.
932+ worker ID, e.g. `f(w::Int)`. Chooses and returns a unique key for the callback
933+ if `key` is not specified.
909934
910- All callbacks will be executed asynchronously and if they don't all finish
911- before the `callback_timeout` passed to `rmprocs()` then the process will be
912- removed anyway.
935+ All worker-exiting callbacks will be executed concurrently and if they don't
936+ all finish before the `callback_timeout` passed to `rmprocs()` then the process
937+ will be removed anyway.
913938"""
function add_worker_exiting_callback(f::Base.Callable; key=nothing)
    # Validation and key selection are shared with the other callback kinds
    return _add_callback(f, key, worker_exiting_callbacks)
end
915940
"""
    remove_worker_exiting_callback(key)

Remove the callback registered under `key` with
[`add_worker_exiting_callback()`](@ref).
"""
function remove_worker_exiting_callback(key)
    return _remove_callback(key, worker_exiting_callbacks)
end
922947
@@ -926,14 +951,17 @@ remove_worker_exiting_callback(key) = _remove_callback(key, worker_exiting_callb
926951Register a callback to be called on the master process when a worker has exited
927952for any reason (i.e. not only because of [`rmprocs()`](@ref) but also the worker
928953segfaulting etc). The callback will be called with the worker ID,
929- e.g. `f(w::Int)`. Returns a unique key for the callback.
954+ e.g. `f(w::Int)`. Chooses and returns a unique key for the callback if `key` is
955+ not specified.
956+
957+ If the callback throws an exception it will be caught and printed.
930958"""
function add_worker_exited_callback(f::Base.Callable; key=nothing)
    # Validation and key selection are shared with the other callback kinds
    return _add_callback(f, key, worker_exited_callbacks)
end
932960
"""
    remove_worker_exited_callback(key)

Remove the callback registered under `key` with
[`add_worker_exited_callback()`](@ref).
"""
function remove_worker_exited_callback(key)
    return _remove_callback(key, worker_exited_callbacks)
end
939967
@@ -1176,15 +1204,17 @@ function _rmprocs(pids, waitfor, callback_timeout)
11761204 lock (worker_lock)
11771205 try
11781206 # Run the callbacks
1179- callback_tasks = Task[]
1207+ callback_tasks = Dict {Any, Task} ()
11801208 for pid in pids
1181- for callback in values ( worker_exiting_callbacks)
1182- push! ( callback_tasks, Threads. @spawn callback (pid) )
1209+ for (name, callback) in worker_exiting_callbacks
1210+ callback_tasks[name] = Threads. @spawn callback (pid)
11831211 end
11841212 end
11851213
1186- if timedwait (() -> all (istaskdone .(callback_tasks)), callback_timeout) === :timed_out
1187- @warn " Some callbacks timed out, continuing to remove workers anyway"
1214+ if timedwait (() -> all (istaskdone .(values (callback_tasks))), callback_timeout) === :timed_out
1215+ timedout_callbacks = [" '$(key) '" for (key, task) in callback_tasks if ! istaskdone (task)]
1216+ callbacks_str = join (timedout_callbacks, " , " )
1217+ @warn " Some worker-exiting callbacks have not yet finished, continuing to remove workers anyway. These are the callbacks still running: $(callbacks_str) "
11881218 end
11891219
11901220 rmprocset = Union{LocalProcess, Worker}[]
@@ -1335,8 +1365,12 @@ function deregister_worker(pg, pid)
13351365
13361366 # Call callbacks on the master
13371367 if myid () == 1
1338- for callback in values (worker_exited_callbacks)
1339- callback (pid)
1368+ for (name, callback) in worker_exited_callbacks
1369+ try
1370+ callback (pid)
1371+ catch ex
1372+ @error " Error when running worker-exited callback '$(name) '" exception= (ex, catch_backtrace ())
1373+ end
13401374 end
13411375 end
13421376
0 commit comments