@@ -22,12 +22,15 @@ use chrono::DateTime;
22
22
use chrono:: Local ;
23
23
use hyperactor:: Actor ;
24
24
use hyperactor:: ActorRef ;
25
+ use hyperactor:: Bind ;
25
26
use hyperactor:: Context ;
26
27
use hyperactor:: HandleClient ;
27
28
use hyperactor:: Handler ;
28
29
use hyperactor:: Instance ;
29
30
use hyperactor:: Named ;
31
+ use hyperactor:: OncePortRef ;
30
32
use hyperactor:: RefClient ;
33
+ use hyperactor:: Unbind ;
31
34
use hyperactor:: channel;
32
35
use hyperactor:: channel:: ChannelAddr ;
33
36
use hyperactor:: channel:: ChannelRx ;
@@ -39,9 +42,6 @@ use hyperactor::channel::TxStatus;
39
42
use hyperactor:: clock:: Clock ;
40
43
use hyperactor:: clock:: RealClock ;
41
44
use hyperactor:: data:: Serialized ;
42
- use hyperactor:: message:: Bind ;
43
- use hyperactor:: message:: Bindings ;
44
- use hyperactor:: message:: Unbind ;
45
45
use hyperactor_telemetry:: env;
46
46
use hyperactor_telemetry:: log_file_path;
47
47
use serde:: Deserialize ;
@@ -235,6 +235,24 @@ impl fmt::Display for Aggregator {
235
235
}
236
236
}
237
237
238
+ /// Messages that can be sent to the LogClientActor remotely.
239
+ #[ derive(
240
+ Debug ,
241
+ Clone ,
242
+ Serialize ,
243
+ Deserialize ,
244
+ Named ,
245
+ Handler ,
246
+ HandleClient ,
247
+ RefClient ,
248
+ Bind ,
249
+ Unbind
250
+ ) ]
251
+ pub enum LogFlushMessage {
252
+ /// Flush the log
253
+ ForceSyncFlush { } ,
254
+ }
255
+
238
256
/// Messages that can be sent to the LogClientActor remotely.
239
257
#[ derive(
240
258
Debug ,
@@ -260,7 +278,10 @@ pub enum LogMessage {
260
278
} ,
261
279
262
280
/// Flush the log
263
- Flush { } ,
281
+ Flush {
282
+ /// If true, force a flush sync barrier across all procs
283
+ synced : bool ,
284
+ } ,
264
285
}
265
286
266
287
/// Messages that can be sent to the LogClient locally.
@@ -279,6 +300,14 @@ pub enum LogClientMessage {
279
300
/// The time window in seconds to aggregate logs. If None, aggregation is disabled.
280
301
aggregate_window_sec : Option < u64 > ,
281
302
} ,
303
+
304
+ /// Synchronously flush all the logs from all the procs. This is for the client to call.
305
+ StartSyncFlush {
306
+ /// Expect this many procs to ack the flush message.
307
+ expected_procs : usize ,
308
+ /// Return once we have received the acks from all the procs
309
+ reply : OncePortRef < ( ) > ,
310
+ } ,
282
311
}
283
312
284
313
/// Trait for sending logs
@@ -352,7 +381,7 @@ impl LogSender for LocalLogSender {
352
381
// send will make sure message is delivered
353
382
if TxStatus :: Active == * self . status . borrow ( ) {
354
383
// Do not use tx.send, it will block the allocator as the child process state is unknown.
355
- self . tx . post ( LogMessage :: Flush { } ) ;
384
+ self . tx . post ( LogMessage :: Flush { synced : false } ) ;
356
385
} else {
357
386
tracing:: debug!(
358
387
"log sender {} is not active, skip sending flush message" ,
@@ -547,7 +576,9 @@ impl<T: LogSender + Unpin + 'static, S: io::AsyncWrite + Send + Unpin + 'static>
547
576
Named ,
548
577
Handler ,
549
578
HandleClient ,
550
- RefClient
579
+ RefClient ,
580
+ Bind ,
581
+ Unbind
551
582
) ]
552
583
pub enum LogForwardMessage {
553
584
/// Receive the log from the parent process and forward it to the client.
@@ -557,18 +588,6 @@ pub enum LogForwardMessage {
557
588
SetMode { stream_to_client : bool } ,
558
589
}
559
590
560
- impl Bind for LogForwardMessage {
561
- fn bind ( & mut self , _bindings : & mut Bindings ) -> anyhow:: Result < ( ) > {
562
- Ok ( ( ) )
563
- }
564
- }
565
-
566
- impl Unbind for LogForwardMessage {
567
- fn unbind ( & self , _bindings : & mut Bindings ) -> anyhow:: Result < ( ) > {
568
- Ok ( ( ) )
569
- }
570
- }
571
-
572
591
/// A log forwarder that receives the log from its parent process and forwards it back to the client
573
592
#[ derive( Debug ) ]
574
593
#[ hyperactor:: export(
@@ -636,17 +655,28 @@ impl Actor for LogForwardActor {
636
655
#[ hyperactor:: forward( LogForwardMessage ) ]
637
656
impl LogForwardMessageHandler for LogForwardActor {
638
657
async fn forward ( & mut self , ctx : & Context < Self > ) -> Result < ( ) , anyhow:: Error > {
639
- if let Ok ( LogMessage :: Log {
640
- hostname,
641
- pid,
642
- output_target,
643
- payload,
644
- } ) = self . rx . recv ( ) . await
645
- {
646
- if self . stream_to_client {
647
- self . logging_client_ref
648
- . log ( ctx, hostname, pid, output_target, payload)
649
- . await ?;
658
+ match self . rx . recv ( ) . await {
659
+ Ok ( LogMessage :: Flush { synced } ) => {
660
+ if synced {
661
+ self . logging_client_ref . flush ( ctx, true ) . await ?;
662
+ } else {
663
+ // no need to do anything. The previous messages have already been sent to the client.
664
+ }
665
+ }
666
+ Ok ( LogMessage :: Log {
667
+ hostname,
668
+ pid,
669
+ output_target,
670
+ payload,
671
+ } ) => {
672
+ if self . stream_to_client {
673
+ self . logging_client_ref
674
+ . log ( ctx, hostname, pid, output_target, payload)
675
+ . await ?;
676
+ }
677
+ }
678
+ Err ( e) => {
679
+ return Err ( e. into ( ) ) ;
650
680
}
651
681
}
652
682
@@ -685,6 +715,54 @@ fn deserialize_message_lines(
685
715
anyhow:: bail!( "Failed to deserialize message as either String or Vec<u8>" )
686
716
}
687
717
718
+ /// An actor that send flush message to the log forwarder actor.
719
+ /// The reason we need an extra actor instead of reusing the log forwarder actor
720
+ /// is because the log forwarder can be blocked on the rx.recv() that listens on the new log lines.
721
+ /// Thus, we need to create anew channel as a tx to send the flush message to the log forwarder
722
+ /// So we do not get into a deadlock.
723
+ #[ derive( Debug ) ]
724
+ #[ hyperactor:: export(
725
+ spawn = true ,
726
+ handlers = [ LogFlushMessage { cast = true } ] ,
727
+ ) ]
728
+ pub struct LogFlushActor {
729
+ tx : ChannelTx < LogMessage > ,
730
+ }
731
+
732
+ #[ async_trait]
733
+ impl Actor for LogFlushActor {
734
+ type Params = ( ) ;
735
+
736
+ async fn new ( _: ( ) ) -> Result < Self , anyhow:: Error > {
737
+ let log_channel: ChannelAddr = match std:: env:: var ( BOOTSTRAP_LOG_CHANNEL ) {
738
+ Ok ( channel) => channel. parse ( ) ?,
739
+ Err ( err) => {
740
+ tracing:: debug!(
741
+ "log forwarder actor failed to read env var {}: {}" ,
742
+ BOOTSTRAP_LOG_CHANNEL ,
743
+ err
744
+ ) ;
745
+ // TODO: this should error out; it can only happen with local proc; we need to fix it.
746
+ ChannelAddr :: any ( ChannelTransport :: Unix )
747
+ }
748
+ } ;
749
+ let tx = channel:: dial :: < LogMessage > ( log_channel) ?;
750
+
751
+ Ok ( Self { tx } )
752
+ }
753
+ }
754
+
755
+ #[ async_trait]
756
+ #[ hyperactor:: forward( LogFlushMessage ) ]
757
+ impl LogFlushMessageHandler for LogFlushActor {
758
+ async fn force_sync_flush ( & mut self , _cx : & Context < Self > ) -> Result < ( ) , anyhow:: Error > {
759
+ self . tx
760
+ . send ( LogMessage :: Flush { synced : true } )
761
+ . await
762
+ . map_err ( anyhow:: Error :: from)
763
+ }
764
+ }
765
+
688
766
/// A client to receive logs from remote processes
689
767
#[ derive( Debug ) ]
690
768
#[ hyperactor:: export(
@@ -696,6 +774,8 @@ pub struct LogClientActor {
696
774
aggregators : HashMap < OutputTarget , Aggregator > ,
697
775
last_flush_time : SystemTime ,
698
776
next_flush_deadline : Option < SystemTime > ,
777
+ ongoing_flush_port : Option < OncePortRef < ( ) > > ,
778
+ unflushed_procs : usize ,
699
779
}
700
780
701
781
impl LogClientActor {
@@ -725,6 +805,12 @@ impl LogClientActor {
725
805
OutputTarget :: Stderr => eprintln ! ( "{}" , message) ,
726
806
}
727
807
}
808
+
809
+ fn flush_internal ( & mut self ) {
810
+ self . print_aggregators ( ) ;
811
+ self . last_flush_time = RealClock . system_time_now ( ) ;
812
+ self . next_flush_deadline = None ;
813
+ }
728
814
}
729
815
730
816
#[ async_trait]
@@ -743,6 +829,8 @@ impl Actor for LogClientActor {
743
829
aggregators,
744
830
last_flush_time : RealClock . system_time_now ( ) ,
745
831
next_flush_deadline : None ,
832
+ ongoing_flush_port : None ,
833
+ unflushed_procs : 0 ,
746
834
} )
747
835
}
748
836
}
@@ -794,20 +882,23 @@ impl LogMessageHandler for LogClientActor {
794
882
let new_deadline = self . last_flush_time + Duration :: from_secs ( window) ;
795
883
let now = RealClock . system_time_now ( ) ;
796
884
if new_deadline <= now {
797
- self . flush ( cx ) . await ? ;
885
+ self . flush_internal ( ) ;
798
886
} else {
799
887
let delay = new_deadline. duration_since ( now) ?;
800
888
match self . next_flush_deadline {
801
889
None => {
802
890
self . next_flush_deadline = Some ( new_deadline) ;
803
- cx. self_message_with_delay ( LogMessage :: Flush { } , delay) ?;
891
+ cx. self_message_with_delay ( LogMessage :: Flush { synced : false } , delay) ?;
804
892
}
805
893
Some ( deadline) => {
806
894
// Some early log lines have already triggered the flush.
807
895
if new_deadline < deadline {
808
896
// This can happen if the user has adjusted the aggregation window.
809
897
self . next_flush_deadline = Some ( new_deadline) ;
810
- cx. self_message_with_delay ( LogMessage :: Flush { } , delay) ?;
898
+ cx. self_message_with_delay (
899
+ LogMessage :: Flush { synced : false } ,
900
+ delay,
901
+ ) ?;
811
902
}
812
903
}
813
904
}
@@ -818,10 +909,21 @@ impl LogMessageHandler for LogClientActor {
818
909
Ok ( ( ) )
819
910
}
820
911
821
- async fn flush ( & mut self , _cx : & Context < Self > ) -> Result < ( ) , anyhow:: Error > {
822
- self . print_aggregators ( ) ;
823
- self . last_flush_time = RealClock . system_time_now ( ) ;
824
- self . next_flush_deadline = None ;
912
+ async fn flush ( & mut self , cx : & Context < Self > , synced : bool ) -> Result < ( ) , anyhow:: Error > {
913
+ if synced {
914
+ if self . unflushed_procs == 0 || self . ongoing_flush_port . is_none ( ) {
915
+ anyhow:: bail!( "found no ongoing flush request" ) ;
916
+ }
917
+ self . unflushed_procs -= 1 ;
918
+ if self . unflushed_procs == 0 {
919
+ self . flush_internal ( ) ;
920
+ let reply = self . ongoing_flush_port . take ( ) . unwrap ( ) ;
921
+ self . ongoing_flush_port = None ;
922
+ reply. send ( cx, ( ) ) . map_err ( anyhow:: Error :: from) ?;
923
+ }
924
+ } else {
925
+ self . flush_internal ( ) ;
926
+ }
825
927
826
928
Ok ( ( ) )
827
929
}
@@ -842,6 +944,21 @@ impl LogClientMessageHandler for LogClientActor {
842
944
self . aggregate_window_sec = aggregate_window_sec;
843
945
Ok ( ( ) )
844
946
}
947
+
948
+ async fn start_sync_flush (
949
+ & mut self ,
950
+ _cx : & Context < Self > ,
951
+ expected_procs_flushed : usize ,
952
+ reply : OncePortRef < ( ) > ,
953
+ ) -> Result < ( ) , anyhow:: Error > {
954
+ if self . unflushed_procs > 0 || self . ongoing_flush_port . is_some ( ) {
955
+ anyhow:: bail!( "forcing a flush while the ongoing flush has not finished yet" ) ;
956
+ }
957
+
958
+ self . ongoing_flush_port = Some ( reply. clone ( ) ) ;
959
+ self . unflushed_procs = expected_procs_flushed;
960
+ Ok ( ( ) )
961
+ }
845
962
}
846
963
847
964
#[ cfg( test) ]
0 commit comments