583583 return results , nil
584584}
585585
// MonitorNodeSkipped represents a node whose status was not checked.
// Monitor emits it for nodes on which no store directory was found
// when MonitorOpts.IgnoreEmptyNodes is set; such nodes are not probed.
type MonitorNodeSkipped struct{}
588+
// MonitorNodeRunning represents the cockroach process running on a
// node.
type MonitorNodeRunning struct {
	// PID is the process ID of the cockroach process, kept as the raw
	// string parsed from the monitor script's output.
	PID string
}
594+
// MonitorNodeDead represents the cockroach process dying on a node.
type MonitorNodeDead struct {
	// ExitCode is the exit status reported by the monitor script, kept
	// as the raw string parsed from its output. It is not necessarily
	// numeric: the script reports "unknown" when the actual exit status
	// of the old process was lost.
	ExitCode string
}
599+
// MonitorError represents an error that occurred while trying to
// monitor a node (for example, failing to start or read from the
// monitoring session). Such errors typically indicate networking
// issues or nodes that have (physically) shut down.
type MonitorError struct {
	// Err is the underlying error.
	Err error
}
603+
// NodeMonitorInfo is a message describing a cockroach process' status.
type NodeMonitorInfo struct {
	// The index of the node (in a SyncedCluster) at which the message originated.
	Node Node
	// Event describes what happened to the node. It is one of:
	//   - MonitorNodeSkipped, when no store directory was found;
	//   - MonitorNodeRunning, when cockroach is running on the node;
	//   - MonitorNodeDead, when the cockroach process stops running on
	//     the node;
	//   - MonitorError, which typically indicates networking issues or
	//     a node that has (physically) shut down.
	Event interface{}
}
616+
617+ func (nmi NodeMonitorInfo ) String () string {
618+ var status string
619+
620+ switch event := nmi .Event .(type ) {
621+ case MonitorNodeRunning :
622+ status = fmt .Sprintf ("cockroach process is running (PID: %s)" , event .PID )
623+ case MonitorNodeSkipped :
624+ status = "node skipped"
625+ case MonitorNodeDead :
626+ status = fmt .Sprintf ("cockroach process died (exit code %s)" , event .ExitCode )
627+ case MonitorError :
628+ status = fmt .Sprintf ("error: %s" , event .Err .Error ())
629+ }
630+
631+ return fmt .Sprintf ("n%d: %s" , nmi .Node , status )
600632}
601633
602634// MonitorOpts is used to pass the options needed by Monitor.
@@ -606,16 +638,16 @@ type MonitorOpts struct {
606638}
607639
608640// Monitor writes NodeMonitorInfo for the cluster nodes to the returned channel.
609- // Infos sent to the channel always have the Index and exactly one of Msg or Err
610- // set .
641+ // Infos sent to the channel always have the Node the event refers to, and the
642+ // event itself. See documentation for NodeMonitorInfo for possible event types .
611643//
612- // If oneShot is true, infos are retrieved only once for each node and the
644+ // If OneShot is true, infos are retrieved only once for each node and the
613645// channel is subsequently closed; otherwise the process continues indefinitely
614646// (emitting new information as the status of the cockroach process changes).
615647//
616- // If ignoreEmptyNodes is true, nodes on which no CockroachDB data is found
617- // (in {store-dir}) will not be probed and single message, "skipped", will
618- // be emitted for them.
// If IgnoreEmptyNodes is true, nodes on which no CockroachDB data is found
// (in {store-dir}) will not be probed and a single event, MonitorNodeSkipped,
// will be emitted for them.
619651func (c * SyncedCluster ) Monitor (
620652 l * logger.Logger , ctx context.Context , opts MonitorOpts ,
621653) chan NodeMonitorInfo {
@@ -624,10 +656,30 @@ func (c *SyncedCluster) Monitor(
624656 var wg sync.WaitGroup
625657 monitorCtx , cancel := context .WithCancel (ctx )
626658
659+ // sendEvent sends the NodeMonitorInfo passed through the channel
660+ // that is listened to by the caller. Bails if the context is
661+ // canceled.
662+ sendEvent := func (info NodeMonitorInfo ) {
663+ select {
664+ case ch <- info :
665+ // We were able to send the info through the channel.
666+ case <- monitorCtx .Done ():
667+ // Don't block trying to send the info.
668+ }
669+ }
670+
671+ const (
672+ separator = "|"
673+ skippedMsg = "skipped"
674+ runningMsg = "running"
675+ deadMsg = "dead"
676+ )
677+
627678 for i := range nodes {
628679 wg .Add (1 )
629680 go func (i int ) {
630681 defer wg .Done ()
682+
631683 node := nodes [i ]
632684
633685 // On each monitored node, we loop looking for a cockroach process.
@@ -637,18 +689,30 @@ func (c *SyncedCluster) Monitor(
637689 Store string
638690 Port int
639691 Local bool
692+ Separator string
693+ SkippedMsg string
694+ RunningMsg string
695+ DeadMsg string
640696 }{
641697 OneShot : opts .OneShot ,
642698 IgnoreEmpty : opts .IgnoreEmptyNodes ,
643699 Store : c .NodeDir (node , 1 /* storeIndex */ ),
644700 Port : c .NodePort (node ),
645701 Local : c .IsLocal (),
702+ Separator : separator ,
703+ SkippedMsg : skippedMsg ,
704+ RunningMsg : runningMsg ,
705+ DeadMsg : deadMsg ,
646706 }
647707
708+ // NB.: we parse the output of every line this script
709+ // prints. Every call to `echo` must match the parsing logic
710+ // down below in order to produce structured results to the
711+ // caller.
648712 snippet := `
649713{{ if .IgnoreEmpty }}
650714if ! ls {{.Store}}/marker.* 1> /dev/null 2>&1; then
651- echo "skipped "
715+ echo "{{.SkippedMsg}} "
652716 exit 0
653717fi
654718{{- end}}
@@ -682,10 +746,10 @@ while :; do
682746 # the new incarnation. We lost the actual exit status of the old PID.
683747 status="unknown"
684748 fi
685- echo "dead (exit status ${status}) "
749+ echo "{{.DeadMsg}}{{.Separator}} ${status}"
686750 fi
687751 if [ "${pid}" != 0 ]; then
688- echo "${pid}"
752+ echo "{{.RunningMsg}}{{.Separator}} ${pid}"
689753 fi
690754 lastpid=${pid}
691755 fi
704768 t := template .Must (template .New ("script" ).Parse (snippet ))
705769 var buf bytes.Buffer
706770 if err := t .Execute (& buf , data ); err != nil {
707- ch <- NodeMonitorInfo {Node : node , Err : err }
771+ err := errors .Wrap (err , "failed to execute template" )
772+ sendEvent (NodeMonitorInfo {Node : node , Event : MonitorError {err }})
708773 return
709774 }
710775
@@ -713,14 +778,16 @@ done
713778
714779 p , err := sess .StdoutPipe ()
715780 if err != nil {
716- ch <- NodeMonitorInfo {Node : node , Err : err }
781+ err := errors .Wrap (err , "failed to read stdout pipe" )
782+ sendEvent (NodeMonitorInfo {Node : node , Event : MonitorError {err }})
717783 wg .Done ()
718784 return
719785 }
720786 // Request a PTY so that the script will receive a SIGPIPE when the
721787 // session is closed.
722788 if err := sess .RequestPty (); err != nil {
723- ch <- NodeMonitorInfo {Node : node , Err : err }
789+ err := errors .Wrap (err , "failed to request PTY" )
790+ sendEvent (NodeMonitorInfo {Node : node , Event : MonitorError {err }})
724791 return
725792 }
726793
@@ -734,12 +801,31 @@ done
734801 if err == io .EOF {
735802 return
736803 }
737- ch <- NodeMonitorInfo {Node : node , Msg : string (line )}
804+ if err != nil {
805+ err := errors .Wrap (err , "error reading from session" )
806+ sendEvent (NodeMonitorInfo {Node : node , Event : MonitorError {err }})
807+ }
808+
809+ parts := strings .Split (string (line ), separator )
810+ switch parts [0 ] {
811+ case skippedMsg :
812+ sendEvent (NodeMonitorInfo {Node : node , Event : MonitorNodeSkipped {}})
813+ case runningMsg :
814+ pid := parts [1 ]
815+ sendEvent (NodeMonitorInfo {Node : node , Event : MonitorNodeRunning {pid }})
816+ case deadMsg :
817+ exitCode := parts [1 ]
818+ sendEvent (NodeMonitorInfo {Node : node , Event : MonitorNodeDead {exitCode }})
819+ default :
820+ err := fmt .Errorf ("internal error: unrecognized output from monitor: %s" , line )
821+ sendEvent (NodeMonitorInfo {Node : node , Event : MonitorError {err }})
822+ }
738823 }
739824 }(p )
740825
741826 if err := sess .Start (); err != nil {
742- ch <- NodeMonitorInfo {Node : node , Err : err }
827+ err := errors .Wrap (err , "failed to start session" )
828+ sendEvent (NodeMonitorInfo {Node : node , Event : MonitorError {err }})
743829 return
744830 }
745831
755841 // pipe. Otherwise it can be closed under us, causing the reader to loop
756842 // infinitely receiving a non-`io.EOF` error.
757843 if err := sess .Wait (); err != nil {
758- ch <- NodeMonitorInfo {Node : node , Err : err }
844+ err := errors .Wrap (err , "failed to wait for session" )
845+ sendEvent (NodeMonitorInfo {Node : node , Event : MonitorError {err }})
759846 return
760847 }
761848 }(i )
0 commit comments