@@ -15,6 +15,7 @@ use std::future;
15
15
use std:: io;
16
16
use std:: io:: Write ;
17
17
use std:: os:: unix:: process:: ExitStatusExt ;
18
+ use std:: path:: Path ;
18
19
use std:: path:: PathBuf ;
19
20
use std:: process:: Stdio ;
20
21
use std:: sync:: Arc ;
@@ -34,6 +35,7 @@ use hyperactor::ProcId;
34
35
use hyperactor:: attrs:: Attrs ;
35
36
use hyperactor:: channel;
36
37
use hyperactor:: channel:: ChannelAddr ;
38
+ use hyperactor:: channel:: ChannelError ;
37
39
use hyperactor:: channel:: ChannelTransport ;
38
40
use hyperactor:: channel:: Rx ;
39
41
use hyperactor:: channel:: Tx ;
@@ -48,10 +50,13 @@ use hyperactor::host::HostError;
48
50
use hyperactor:: host:: ProcHandle ;
49
51
use hyperactor:: host:: ProcManager ;
50
52
use hyperactor:: host:: TerminateSummary ;
53
+ use hyperactor:: mailbox:: IntoBoxedMailboxSender ;
54
+ use hyperactor:: mailbox:: MailboxClient ;
51
55
use hyperactor:: mailbox:: MailboxServer ;
52
56
use hyperactor:: proc:: Proc ;
53
57
use serde:: Deserialize ;
54
58
use serde:: Serialize ;
59
+ use tempfile:: TempDir ;
55
60
use tokio:: process:: Child ;
56
61
use tokio:: process:: Command ;
57
62
use tokio:: sync:: oneshot;
@@ -64,6 +69,8 @@ use crate::v1;
64
69
use crate :: v1:: host_mesh:: mesh_agent:: HostAgentMode ;
65
70
use crate :: v1:: host_mesh:: mesh_agent:: HostMeshAgent ;
66
71
72
+ mod mailbox;
73
+
67
74
declare_attrs ! {
68
75
/// If enabled (default), bootstrap child processes install
69
76
/// `PR_SET_PDEATHSIG(SIGKILL)` so the kernel reaps them if the
@@ -212,6 +219,10 @@ pub enum Bootstrap {
212
219
backend_addr : ChannelAddr ,
213
220
/// The callback address used to indicate successful spawning.
214
221
callback_addr : ChannelAddr ,
222
+ /// Directory for storing proc socket files. Procs place their sockets
223
+ /// in this directory, so that they can be looked up by other procs
224
+ /// for direct transfer.
225
+ socket_dir_path : PathBuf ,
215
226
/// Optional config snapshot (`hyperactor::config::Attrs`)
216
227
/// captured by the parent. If present, the child installs it
217
228
/// as the `Runtime` layer so the parent's effective config
@@ -324,6 +335,7 @@ impl Bootstrap {
324
335
proc_id,
325
336
backend_addr,
326
337
callback_addr,
338
+ socket_dir_path,
327
339
config,
328
340
} => {
329
341
if let Some ( attrs) = config {
@@ -343,15 +355,39 @@ impl Bootstrap {
343
355
eprintln ! ( "(bootstrap) PDEATHSIG disabled via config" ) ;
344
356
}
345
357
346
- let result =
347
- host:: spawn_proc ( proc_id, backend_addr, callback_addr, |proc| async move {
348
- ProcMeshAgent :: boot_v1 ( proc) . await
349
- } )
350
- . await ;
351
- match result {
352
- Ok ( _proc) => halt ( ) . await ,
353
- Err ( e) => e. into ( ) ,
354
- }
358
+ let ( local_addr, name) = ok ! ( proc_id
359
+ . as_direct( )
360
+ . ok_or_else( || anyhow:: anyhow!( "invalid proc id type: {}" , proc_id) ) ) ;
361
+ // TODO: provide a direct way to construct a unix ChannelAddr from a socket path, instead of formatting and re-parsing a string.
362
+ let serve_addr = format ! ( "unix:{}" , socket_dir_path. join( name) . display( ) ) ;
363
+ let serve_addr = serve_addr. parse ( ) . unwrap ( ) ;
364
+
365
+ // The following is a modified host::spawn_proc to support direct
366
+ // dialing between local procs: 1) we bind each proc to a deterministic
367
+ // address in socket_dir_path; 2) we use LocalProcDialer to dial these
368
+ // addresses for local procs.
369
+ let proc_sender = mailbox:: LocalProcDialer :: new (
370
+ local_addr. clone ( ) ,
371
+ socket_dir_path,
372
+ ok ! ( MailboxClient :: dial( backend_addr) ) ,
373
+ ) ;
374
+
375
+ let proc = Proc :: new ( proc_id. clone ( ) , proc_sender. into_boxed ( ) ) ;
376
+
377
+ let agent_handle = ok ! ( ProcMeshAgent :: boot_v1( proc. clone( ) )
378
+ . await
379
+ . map_err( |e| HostError :: AgentSpawnFailure ( proc_id, e) ) ) ;
380
+
381
+ // Finally serve the proc on its deterministic unix socket address in
382
+ // socket_dir_path, and report the bound address and agent on callback_addr.
383
+ let ( proc_addr, proc_rx) = ok ! ( channel:: serve( serve_addr) ) ;
384
+ proc. clone ( ) . serve ( proc_rx) ;
385
+ ok ! ( ok!( channel:: dial( callback_addr) )
386
+ . send( ( proc_addr, agent_handle. bind:: <ProcMeshAgent >( ) ) )
387
+ . await
388
+ . map_err( ChannelError :: from) ) ;
389
+
390
+ halt ( ) . await
355
391
}
356
392
Bootstrap :: Host {
357
393
addr,
@@ -369,7 +405,7 @@ impl Bootstrap {
369
405
Some ( command) => command,
370
406
None => ok ! ( BootstrapCommand :: current( ) ) ,
371
407
} ;
372
- let manager = BootstrapProcManager :: new ( command) ;
408
+ let manager = BootstrapProcManager :: new ( command) . unwrap ( ) ;
373
409
let ( host, _handle) = ok ! ( Host :: serve( manager, addr) . await ) ;
374
410
let addr = host. addr ( ) . clone ( ) ;
375
411
let host_mesh_agent = ok ! ( host
@@ -1402,6 +1438,11 @@ pub struct BootstrapProcManager {
1402
1438
/// exclusively in the [`Drop`] impl to send `SIGKILL` without
1403
1439
/// needing async context.
1404
1440
pid_table : Arc < std:: sync:: Mutex < HashMap < ProcId , u32 > > > ,
1441
+
1442
+ /// Directory for storing proc socket files. Procs place their sockets
1443
+ /// in this directory, so that they can be looked up by other procs
1444
+ /// for direct transfer.
1445
+ socket_dir : TempDir ,
1405
1446
}
1406
1447
1407
1448
impl Drop for BootstrapProcManager {
@@ -1451,12 +1492,13 @@ impl BootstrapProcManager {
1451
1492
/// This is the general entry point when you want to manage procs
1452
1493
/// backed by a specific binary path (e.g. a bootstrap
1453
1494
/// trampoline).
1454
- pub ( crate ) fn new ( command : BootstrapCommand ) -> Self {
1455
- Self {
1495
+ pub ( crate ) fn new ( command : BootstrapCommand ) -> Result < Self , io :: Error > {
1496
+ Ok ( Self {
1456
1497
command,
1457
1498
children : Arc :: new ( tokio:: sync:: Mutex :: new ( HashMap :: new ( ) ) ) ,
1458
1499
pid_table : Arc :: new ( std:: sync:: Mutex :: new ( HashMap :: new ( ) ) ) ,
1459
- }
1500
+ socket_dir : tempfile:: tempdir ( ) ?,
1501
+ } )
1460
1502
}
1461
1503
1462
1504
/// The bootstrap command used to launch processes.
@@ -1628,6 +1670,7 @@ impl ProcManager for BootstrapProcManager {
1628
1670
proc_id : proc_id. clone ( ) ,
1629
1671
backend_addr,
1630
1672
callback_addr,
1673
+ socket_dir_path : self . socket_dir . path ( ) . to_owned ( ) ,
1631
1674
config : Some ( cfg) ,
1632
1675
} ;
1633
1676
let mut cmd = Command :: new ( & self . command . program ) ;
@@ -2062,6 +2105,7 @@ mod tests {
2062
2105
proc_id : id ! ( foo[ 0 ] ) ,
2063
2106
backend_addr : ChannelAddr :: any ( ChannelTransport :: Tcp ) ,
2064
2107
callback_addr : ChannelAddr :: any ( ChannelTransport :: Unix ) ,
2108
+ socket_dir_path : PathBuf :: from ( "notexist" ) ,
2065
2109
config : None ,
2066
2110
} ,
2067
2111
] ;
@@ -2119,13 +2163,16 @@ mod tests {
2119
2163
attrs[ MESH_TAIL_LOG_LINES ] = 123 ;
2120
2164
attrs[ MESH_BOOTSTRAP_ENABLE_PDEATHSIG ] = false ;
2121
2165
2166
+ let socket_dir = tempfile:: tempdir ( ) . unwrap ( ) ;
2167
+
2122
2168
// Proc case
2123
2169
{
2124
2170
let original = Bootstrap :: Proc {
2125
2171
proc_id : id ! ( foo[ 42 ] ) ,
2126
2172
backend_addr : ChannelAddr :: any ( ChannelTransport :: Unix ) ,
2127
2173
callback_addr : ChannelAddr :: any ( ChannelTransport :: Unix ) ,
2128
2174
config : Some ( attrs. clone ( ) ) ,
2175
+ socket_dir_path : socket_dir. path ( ) . to_owned ( ) ,
2129
2176
} ;
2130
2177
let env_str = original. to_env_safe_string ( ) . expect ( "encode bootstrap" ) ;
2131
2178
let decoded = Bootstrap :: from_env_safe_string ( & env_str) . expect ( "decode bootstrap" ) ;
@@ -2165,14 +2212,13 @@ mod tests {
2165
2212
use std:: process:: Stdio ;
2166
2213
2167
2214
use tokio:: process:: Command ;
2168
- use tokio:: time:: Duration ;
2169
2215
2170
2216
// Manager; program path is irrelevant for this test.
2171
2217
let command = BootstrapCommand {
2172
2218
program : PathBuf :: from ( "/bin/true" ) ,
2173
2219
..Default :: default ( )
2174
2220
} ;
2175
- let manager = BootstrapProcManager :: new ( command) ;
2221
+ let manager = BootstrapProcManager :: new ( command) . unwrap ( ) ;
2176
2222
2177
2223
// Spawn a long-running child process (sleep 30) with
2178
2224
// kill_on_drop(true).
@@ -2552,7 +2598,7 @@ mod tests {
2552
2598
program : PathBuf :: from ( "/bin/true" ) ,
2553
2599
..Default :: default ( )
2554
2600
} ;
2555
- let manager = BootstrapProcManager :: new ( command) ;
2601
+ let manager = BootstrapProcManager :: new ( command) . unwrap ( ) ;
2556
2602
2557
2603
// Spawn a fast-exiting child.
2558
2604
let mut cmd = Command :: new ( "true" ) ;
@@ -2586,7 +2632,7 @@ mod tests {
2586
2632
program : PathBuf :: from ( "/bin/sleep" ) ,
2587
2633
..Default :: default ( )
2588
2634
} ;
2589
- let manager = BootstrapProcManager :: new ( command) ;
2635
+ let manager = BootstrapProcManager :: new ( command) . unwrap ( ) ;
2590
2636
2591
2637
// Spawn a process that will live long enough to kill.
2592
2638
let mut cmd = Command :: new ( "/bin/sleep" ) ;
@@ -2703,7 +2749,8 @@ mod tests {
2703
2749
let manager = BootstrapProcManager :: new ( BootstrapCommand {
2704
2750
program : PathBuf :: from ( "/bin/true" ) ,
2705
2751
..Default :: default ( )
2706
- } ) ;
2752
+ } )
2753
+ . unwrap ( ) ;
2707
2754
let unknown = ProcId :: Direct ( ChannelAddr :: any ( ChannelTransport :: Unix ) , "nope" . into ( ) ) ;
2708
2755
assert ! ( manager. status( & unknown) . await . is_none( ) ) ;
2709
2756
}
@@ -2713,7 +2760,8 @@ mod tests {
2713
2760
let manager = BootstrapProcManager :: new ( BootstrapCommand {
2714
2761
program : PathBuf :: from ( "/bin/sleep" ) ,
2715
2762
..Default :: default ( )
2716
- } ) ;
2763
+ } )
2764
+ . unwrap ( ) ;
2717
2765
2718
2766
// Long-ish child so it's alive while we "steal" it.
2719
2767
let mut cmd = Command :: new ( "/bin/sleep" ) ;
@@ -2752,7 +2800,8 @@ mod tests {
2752
2800
let manager = BootstrapProcManager :: new ( BootstrapCommand {
2753
2801
program : PathBuf :: from ( "/bin/sleep" ) ,
2754
2802
..Default :: default ( )
2755
- } ) ;
2803
+ } )
2804
+ . unwrap ( ) ;
2756
2805
2757
2806
let mut cmd = Command :: new ( "/bin/sleep" ) ;
2758
2807
cmd. arg ( "5" ) . stdout ( Stdio :: null ( ) ) . stderr ( Stdio :: null ( ) ) ;
@@ -3105,8 +3154,6 @@ mod tests {
3105
3154
instance : & hyperactor:: Instance < ( ) > ,
3106
3155
_tag : & str ,
3107
3156
) -> ( ProcId , ChannelAddr ) {
3108
- let proc_id = id ! ( bootstrap_child[ 0 ] ) ;
3109
-
3110
3157
// Serve a Unix channel as the "backend_addr" and hook it into
3111
3158
// this test proc.
3112
3159
let ( backend_addr, rx) = channel:: serve ( ChannelAddr :: any ( ChannelTransport :: Unix ) ) . unwrap ( ) ;
@@ -3116,6 +3163,9 @@ mod tests {
3116
3163
// router.
3117
3164
instance. proc ( ) . clone ( ) . serve ( rx) ;
3118
3165
3166
+ // Return an arbitrary (and unbound!) unix direct proc id here. This is
3167
+ // fine because we are not testing connectivity.
3168
+ let proc_id = ProcId :: Direct ( ChannelTransport :: Unix . any ( ) , "test" . to_string ( ) ) ;
3119
3169
( proc_id, backend_addr)
3120
3170
}
3121
3171
@@ -3127,7 +3177,7 @@ mod tests {
3127
3177
. unwrap ( ) ;
3128
3178
let ( instance, _handle) = root. instance ( "client" ) . unwrap ( ) ;
3129
3179
3130
- let mgr = BootstrapProcManager :: new ( BootstrapCommand :: test ( ) ) ;
3180
+ let mgr = BootstrapProcManager :: new ( BootstrapCommand :: test ( ) ) . unwrap ( ) ;
3131
3181
let ( proc_id, backend_addr) = make_proc_id_and_backend_addr ( & instance, "t_term" ) . await ;
3132
3182
let handle = mgr
3133
3183
. spawn ( proc_id. clone ( ) , backend_addr. clone ( ) )
@@ -3183,7 +3233,7 @@ mod tests {
3183
3233
. unwrap ( ) ;
3184
3234
let ( instance, _handle) = root. instance ( "client" ) . unwrap ( ) ;
3185
3235
3186
- let mgr = BootstrapProcManager :: new ( BootstrapCommand :: test ( ) ) ;
3236
+ let mgr = BootstrapProcManager :: new ( BootstrapCommand :: test ( ) ) . unwrap ( ) ;
3187
3237
3188
3238
// Proc identity + host backend channel the child will dial.
3189
3239
let ( proc_id, backend_addr) = make_proc_id_and_backend_addr ( & instance, "t_kill" ) . await ;
@@ -3382,7 +3432,8 @@ mod tests {
3382
3432
let manager = BootstrapProcManager :: new ( BootstrapCommand {
3383
3433
program : std:: path:: PathBuf :: from ( "/bin/true" ) , // unused in this test
3384
3434
..Default :: default ( )
3385
- } ) ;
3435
+ } )
3436
+ . unwrap ( ) ;
3386
3437
manager. spawn_exit_monitor ( proc_id. clone ( ) , handle. clone ( ) ) ;
3387
3438
3388
3439
// Await terminal status and assert on exit code and stderr
0 commit comments