@@ -70,6 +70,10 @@ impl<A: Referable> ActorMesh<A> {
70
70
current_ref,
71
71
}
72
72
}
73
+
74
+ pub fn name ( & self ) -> & Name {
75
+ & self . name
76
+ }
73
77
}
74
78
75
79
impl < A : Referable > Deref for ActorMesh < A > {
@@ -491,7 +495,7 @@ mod tests {
491
495
}
492
496
493
497
#[ async_timed_test( timeout_secs = 30 ) ]
494
- async fn test_actor_states ( ) {
498
+ async fn test_actor_states_with_panic ( ) {
495
499
hyperactor_telemetry:: initialize_logging_for_test ( ) ;
496
500
497
501
let instance = testing:: instance ( ) . await ;
@@ -526,22 +530,20 @@ mod tests {
526
530
// status such that when a process switches to unhealthy it sets a
527
531
// supervision event.
528
532
let supervision_task = tokio:: spawn ( async move {
529
- match actor_mesh. actor_states ( & instance) . await {
530
- Ok ( events) => {
531
- for state in events. values ( ) {
532
- supervisor. send ( instance, state. clone ( ) ) . unwrap ( ) ;
533
- }
534
- }
535
- Err ( e) => {
536
- println ! ( "error: {:?}" , e) ;
537
- }
538
- } ;
533
+ let events = actor_mesh. actor_states ( & instance) . await . unwrap ( ) ;
534
+ for state in events. values ( ) {
535
+ supervisor. send ( instance, state. clone ( ) ) . unwrap ( ) ;
536
+ }
539
537
} ) ;
540
538
// Make sure the task completes first without a panic.
541
539
supervision_task. await . unwrap ( ) ;
542
540
543
541
for _ in 0 ..num_replicas {
544
- let state = supervision_receiver. recv ( ) . await . unwrap ( ) ;
542
+ let state = RealClock
543
+ . timeout ( Duration :: from_secs ( 10 ) , supervision_receiver. recv ( ) )
544
+ . await
545
+ . expect ( "timeout" )
546
+ . unwrap ( ) ;
545
547
if let resource:: Status :: Failed ( s) = state. status {
546
548
assert ! ( s. contains( "supervision events" ) ) ;
547
549
} else {
@@ -558,6 +560,143 @@ mod tests {
558
560
}
559
561
}
560
562
563
+ #[ async_timed_test( timeout_secs = 30 ) ]
564
+ async fn test_actor_states_with_process_exit ( ) {
565
+ hyperactor_telemetry:: initialize_logging_for_test ( ) ;
566
+
567
+ let instance = testing:: instance ( ) . await ;
568
+ // Listen for supervision events sent to the parent instance.
569
+ let ( supervision_port, mut supervision_receiver) =
570
+ instance. open_port :: < resource:: State < ActorState > > ( ) ;
571
+ let supervisor = supervision_port. bind ( ) ;
572
+ let num_replicas = 4 ;
573
+ let meshes = testing:: proc_meshes ( instance, extent ! ( replicas = num_replicas) ) . await ;
574
+ let proc_mesh = & meshes[ 1 ] ;
575
+ let child_name = Name :: new ( "child" ) ;
576
+
577
+ let actor_mesh = proc_mesh
578
+ . spawn_with_name :: < testactor:: TestActor > ( instance, child_name. clone ( ) , & ( ) )
579
+ . await
580
+ . unwrap ( ) ;
581
+
582
+ actor_mesh
583
+ . cast (
584
+ instance,
585
+ testactor:: CauseSupervisionEvent ( testactor:: SupervisionEventType :: ProcessExit ( 1 ) ) ,
586
+ )
587
+ . unwrap ( ) ;
588
+
589
+ // Wait for the casted message to cause a process exit on all actors.
590
+ // We can't use a reply port because the handler for the message will
591
+ // by definition not complete and send a reply.
592
+ #[ allow( clippy:: disallowed_methods) ]
593
+ tokio:: time:: sleep ( tokio:: time:: Duration :: from_secs ( 5 ) ) . await ;
594
+
595
+ // Now that all ranks have completed, set up a continuous poll of the
596
+ // status such that when a process switches to unhealthy it sets a
597
+ // supervision event.
598
+ let supervision_task = tokio:: spawn ( async move {
599
+ let events = actor_mesh. actor_states ( & instance) . await . unwrap ( ) ;
600
+ for state in events. values ( ) {
601
+ supervisor. send ( instance, state. clone ( ) ) . unwrap ( ) ;
602
+ }
603
+ } ) ;
604
+ // Make sure the task completes first without a panic.
605
+ RealClock
606
+ . timeout ( Duration :: from_secs ( 10 ) , supervision_task)
607
+ . await
608
+ . expect ( "timeout" )
609
+ . unwrap ( ) ;
610
+
611
+ for _ in 0 ..num_replicas {
612
+ let state = RealClock
613
+ . timeout ( Duration :: from_secs ( 10 ) , supervision_receiver. recv ( ) )
614
+ . await
615
+ . expect ( "timeout" )
616
+ . unwrap ( ) ;
617
+ assert_matches ! ( state. status, resource:: Status :: Stopped ) ;
618
+ let events = state
619
+ . state
620
+ . expect ( "state should be present" )
621
+ . supervision_events ;
622
+ assert_eq ! ( events. len( ) , 1 ) ;
623
+ assert_eq ! ( events[ 0 ] . actor_status, ActorStatus :: Stopped ) ;
624
+ }
625
+ }
626
+
627
+ #[ async_timed_test( timeout_secs = 30 ) ]
628
+ async fn test_actor_states_on_sliced_mesh ( ) {
629
+ hyperactor_telemetry:: initialize_logging_for_test ( ) ;
630
+
631
+ let instance = testing:: instance ( ) . await ;
632
+ // Listen for supervision events sent to the parent instance.
633
+ let ( supervision_port, mut supervision_receiver) =
634
+ instance. open_port :: < resource:: State < ActorState > > ( ) ;
635
+ let supervisor = supervision_port. bind ( ) ;
636
+ let num_replicas = 4 ;
637
+ let meshes = testing:: proc_meshes ( instance, extent ! ( replicas = num_replicas) ) . await ;
638
+ let proc_mesh = & meshes[ 1 ] ;
639
+ let child_name = Name :: new ( "child" ) ;
640
+
641
+ let actor_mesh = proc_mesh
642
+ . spawn_with_name :: < testactor:: TestActor > ( instance, child_name. clone ( ) , & ( ) )
643
+ . await
644
+ . unwrap ( ) ;
645
+ let sliced = actor_mesh
646
+ . range ( "replicas" , 1 ..3 )
647
+ . expect ( "slice should be valid" ) ;
648
+ let sliced_replicas = sliced. len ( ) ;
649
+
650
+ sliced
651
+ . cast (
652
+ instance,
653
+ testactor:: CauseSupervisionEvent ( testactor:: SupervisionEventType :: Panic ) ,
654
+ )
655
+ . unwrap ( ) ;
656
+
657
+ // Wait for the casted message to cause a process exit on all actors.
658
+ // We can't use a reply port because the handler for the message will
659
+ // by definition not complete and send a reply.
660
+ #[ allow( clippy:: disallowed_methods) ]
661
+ tokio:: time:: sleep ( tokio:: time:: Duration :: from_secs ( 5 ) ) . await ;
662
+
663
+ // Now that all ranks have completed, set up a continuous poll of the
664
+ // status such that when a process switches to unhealthy it sets a
665
+ // supervision event.
666
+ let supervision_task = tokio:: spawn ( async move {
667
+ let events = sliced. actor_states ( & instance) . await . unwrap ( ) ;
668
+ for state in events. values ( ) {
669
+ supervisor. send ( instance, state. clone ( ) ) . unwrap ( ) ;
670
+ }
671
+ } ) ;
672
+ // Make sure the task completes first without a panic.
673
+ RealClock
674
+ . timeout ( Duration :: from_secs ( 10 ) , supervision_task)
675
+ . await
676
+ . expect ( "timeout" )
677
+ . unwrap ( ) ;
678
+
679
+ for _ in 0 ..sliced_replicas {
680
+ let state = RealClock
681
+ . timeout ( Duration :: from_secs ( 10 ) , supervision_receiver. recv ( ) )
682
+ . await
683
+ . expect ( "timeout" )
684
+ . unwrap ( ) ;
685
+ if let resource:: Status :: Failed ( s) = state. status {
686
+ assert ! ( s. contains( "supervision events" ) ) ;
687
+ } else {
688
+ panic ! ( "Not failed: {:?}" , state. status) ;
689
+ }
690
+ if let Some ( ref inner) = state. state {
691
+ assert ! ( !inner. supervision_events. is_empty( ) ) ;
692
+ for event in & inner. supervision_events {
693
+ assert_eq ! ( event. actor_id. name( ) , format!( "{}" , child_name. clone( ) ) ) ;
694
+ assert_matches ! ( event. actor_status, ActorStatus :: Failed ( _) ) ;
695
+ }
696
+ }
697
+ }
698
+ }
699
+
561
700
#[ async_timed_test( timeout_secs = 30 ) ]
562
701
async fn test_cast ( ) {
563
702
let config = hyperactor:: config:: global:: lock ( ) ;
0 commit comments