@@ -112,7 +112,7 @@ impl Actor for ParameterServerActor {
112
112
113
113
async fn new ( _params : Self :: Params ) -> Result < Self , anyhow:: Error > {
114
114
let ( owner_ref, worker_world_size) = _params;
115
- println ! ( "creating parameter server actor" ) ;
115
+ tracing :: info !( "creating parameter server actor" ) ;
116
116
let weights_data = vec ! [ 0u8 ; BUFFER_SIZE ] . into_boxed_slice ( ) ;
117
117
let grad_buffer_data =
118
118
vec ! [ vec![ 0u8 ; BUFFER_SIZE ] . into_boxed_slice( ) ; worker_world_size] . into_boxed_slice ( ) ;
@@ -203,7 +203,7 @@ impl Handler<PsUpdate> for ParameterServerActor {
203
203
}
204
204
grad. fill ( 0 ) ;
205
205
}
206
- println ! ( "[parameter server actor] updated" ) ;
206
+ tracing :: info !( "[parameter server actor] updated" ) ;
207
207
reply. send ( cx, true ) ?;
208
208
Ok ( ( ) )
209
209
}
@@ -213,9 +213,10 @@ impl Handler<PsUpdate> for ParameterServerActor {
213
213
impl Handler < Log > for ParameterServerActor {
214
214
/// Logs the server's weights and gradient buffer
215
215
async fn handle ( & mut self , _this_ : & Context < Self > , _msg_ : Log ) -> Result < ( ) , anyhow:: Error > {
216
- println ! (
216
+ tracing :: info !(
217
217
"[parameter server actor] weights: {:?}, grad_buffer: {:?}" ,
218
- self . weights_data, self . grad_buffer_data,
218
+ self . weights_data,
219
+ self . grad_buffer_data,
219
220
) ;
220
221
Ok ( ( ) )
221
222
}
@@ -305,7 +306,7 @@ impl Handler<WorkerInit> for WorkerActor {
305
306
) -> Result < ( ) , anyhow:: Error > {
306
307
let ( rank, _) = cx. cast_info ( ) ;
307
308
308
- println ! ( "[worker_actor_{}] initializing" , rank) ;
309
+ tracing :: info !( "[worker_actor_{}] initializing" , rank) ;
309
310
310
311
let client = cx. mailbox_for_py ( ) ;
311
312
let ( handle, receiver) = client. open_once_port :: < ( RdmaBuffer , RdmaBuffer ) > ( ) ;
@@ -345,9 +346,10 @@ impl Handler<WorkerStep> for WorkerActor {
345
346
{
346
347
* grad_value = grad_value. wrapping_add ( * weight + 1 ) ;
347
348
}
348
- println ! (
349
+ tracing :: info !(
349
350
"[worker_actor_{}] pushing gradients {:?}" ,
350
- rank, self . local_gradients
351
+ rank,
352
+ self . local_gradients
351
353
) ;
352
354
353
355
let owner_ref = self
@@ -387,9 +389,10 @@ impl Handler<WorkerUpdate> for WorkerActor {
387
389
) -> Result < ( ) , anyhow:: Error > {
388
390
let ( rank, _) = cx. cast_info ( ) ;
389
391
390
- println ! (
392
+ tracing :: info !(
391
393
"[worker_actor_{}] pulling new weights from parameter server (before: {:?})" ,
392
- rank, self . weights_data,
394
+ rank,
395
+ self . weights_data,
393
396
) ;
394
397
let /*mut*/ lbuffer = self
395
398
. rdma_manager
@@ -419,7 +422,7 @@ impl Handler<Log> for WorkerActor {
419
422
/// Logs the worker's weights
420
423
async fn handle ( & mut self , cx : & Context < Self > , _: Log ) -> Result < ( ) , anyhow:: Error > {
421
424
let ( rank, _) = cx. cast_info ( ) ;
422
- println ! ( "[worker_actor_{}] weights: {:?}" , rank, self . weights_data) ;
425
+ tracing :: info !( "[worker_actor_{}] weights: {:?}" , rank, self . weights_data) ;
423
426
Ok ( ( ) )
424
427
}
425
428
}
@@ -456,7 +459,7 @@ pub async fn run(num_workers: usize, num_steps: usize) -> Result<(), anyhow::Err
456
459
} ;
457
460
} else {
458
461
// For other configurations, use default settings (parameter server + workers all use the same ibv device)
459
- println ! (
462
+ tracing :: info !(
460
463
"using default IbverbsConfig as {} devices were found (expected > 4 for H100)" ,
461
464
devices. len( )
462
465
) ;
@@ -465,10 +468,10 @@ pub async fn run(num_workers: usize, num_steps: usize) -> Result<(), anyhow::Err
465
468
}
466
469
467
470
// As normal, create a proc mesh for the parameter server.
468
- println ! ( "creating parameter server proc mesh..." ) ;
471
+ tracing :: info !( "creating parameter server proc mesh..." ) ;
469
472
470
473
let mut alloc = ProcessAllocator :: new ( Command :: new (
471
- buck_resources:: get ( "monarch/monarch_rdma/examples/bootstrap" ) . unwrap ( ) ,
474
+ buck_resources:: get ( "monarch/monarch_rdma/examples/parameter_server/ bootstrap" ) . unwrap ( ) ,
472
475
) ) ;
473
476
474
477
let ps_proc_mesh = ProcMesh :: allocate (
@@ -481,7 +484,7 @@ pub async fn run(num_workers: usize, num_steps: usize) -> Result<(), anyhow::Err
481
484
)
482
485
. await ?;
483
486
484
- println ! (
487
+ tracing :: info !(
485
488
"creating parameter server's RDMA manager with config: {}" ,
486
489
ps_ibv_config
487
490
) ;
@@ -496,7 +499,7 @@ pub async fn run(num_workers: usize, num_steps: usize) -> Result<(), anyhow::Err
496
499
. unwrap ( ) ;
497
500
498
501
// Create a proc mesh for workers, where each worker is assigned to its own GPU.
499
- println ! ( "creating worker proc mesh ({} workers)..." , num_workers) ;
502
+ tracing :: info !( "creating worker proc mesh ({} workers)..." , num_workers) ;
500
503
let worker_proc_mesh = ProcMesh :: allocate (
501
504
alloc
502
505
. allocate ( AllocSpec {
@@ -507,7 +510,7 @@ pub async fn run(num_workers: usize, num_steps: usize) -> Result<(), anyhow::Err
507
510
)
508
511
. await ?;
509
512
510
- println ! (
513
+ tracing :: info !(
511
514
"creating worker's RDMA manager with config: {}" ,
512
515
worker_ibv_config
513
516
) ;
@@ -517,7 +520,7 @@ pub async fn run(num_workers: usize, num_steps: usize) -> Result<(), anyhow::Err
517
520
. await
518
521
. unwrap ( ) ;
519
522
520
- println ! ( "spawning parameter server" ) ;
523
+ tracing :: info !( "spawning parameter server" ) ;
521
524
let ps_actor_mesh: RootActorMesh < ' _ , ParameterServerActor > = ps_proc_mesh
522
525
. spawn (
523
526
"parameter_server" ,
@@ -529,7 +532,7 @@ pub async fn run(num_workers: usize, num_steps: usize) -> Result<(), anyhow::Err
529
532
// The parameter server is a single actor, we can just grab it and call it directly.
530
533
let ps_actor = ps_actor_mesh. iter ( ) . next ( ) . unwrap ( ) ;
531
534
532
- println ! ( "spawning worker actors" ) ;
535
+ tracing :: info !( "spawning worker actors" ) ;
533
536
let worker_actor_mesh: RootActorMesh < ' _ , WorkerActor > =
534
537
worker_proc_mesh. spawn ( "worker_actors" , & ( ) ) . await . unwrap ( ) ;
535
538
@@ -539,7 +542,7 @@ pub async fn run(num_workers: usize, num_steps: usize) -> Result<(), anyhow::Err
539
542
// We intentionally decouple spawning with initialization, which is fairly common in Ray workloads
540
543
// In this case, we use it for dual purpose - be able to use the cast APIs to assign rank (Monarch specific) and
541
544
// to get access to return values for error messaging (applies to both Monarch and Ray)
542
- println ! ( "initializing worker actor mesh" ) ;
545
+ tracing :: info !( "initializing worker actor mesh" ) ;
543
546
worker_actor_mesh
544
547
. cast (
545
548
worker_proc_mesh. client ( ) ,
@@ -549,9 +552,9 @@ pub async fn run(num_workers: usize, num_steps: usize) -> Result<(), anyhow::Err
549
552
)
550
553
. unwrap ( ) ;
551
554
552
- println ! ( "starting training loop" ) ;
555
+ tracing :: info !( "starting training loop" ) ;
553
556
for step in 0 ..num_steps {
554
- println ! ( "===== starting step {} =====" , step) ;
557
+ tracing :: info !( "===== starting step {} =====" , step) ;
555
558
worker_actor_mesh
556
559
. cast (
557
560
worker_proc_mesh. client ( ) ,
0 commit comments