@@ -50,7 +50,7 @@ var Flags AgentCmdFlags
5050const schemaVersion string = "v2.0.0"
5151
5252// Run starts the agent process
53- func Run (cmd * cobra.Command , args []string ) error {
53+ func Run (cmd * cobra.Command , args []string ) ( returnErr error ) {
5454 ctx , cancel := context .WithCancel (cmd .Context ())
5555 defer cancel ()
5656 log := klog .FromContext (ctx ).WithName ("Run" )
@@ -82,12 +82,7 @@ func Run(cmd *cobra.Command, args []string) error {
8282 group , gctx := errgroup .WithContext (ctx )
8383 defer func () {
8484 cancel ()
85- if groupErr := group .Wait (); groupErr != nil {
86- err = multierror .Append (
87- err ,
88- fmt .Errorf ("failed to wait for controller-runtime component to stop: %v" , groupErr ),
89- )
90- }
85+ returnErr = errors .Join (returnErr , group .Wait ())
9186 }()
9287
9388 {
@@ -123,8 +118,12 @@ func Run(cmd *cobra.Command, args []string) error {
123118 w .WriteHeader (http .StatusOK )
124119 })
125120
126- group .Go (func () error {
127- err := listenAndServe (
121+ group .Go (func () (err error ) {
122+ log .Info ("Starting" )
123+ defer func () {
124+ log .Info ("Stopped" , "reason" , err )
125+ }()
126+ err = listenAndServe (
128127 klog .NewContext (gctx , log ),
129128 & http.Server {
130129 Addr : serverAddress ,
@@ -137,20 +136,28 @@ func Run(cmd *cobra.Command, args []string) error {
137136 if err != nil {
138137 return fmt .Errorf ("APIServer: %s" , err )
139138 }
139+ if err := context .Cause (gctx ); err == nil {
140+ return fmt .Errorf ("APIServer exited unexpectedly" )
141+ }
140142 return nil
141143 })
142144 }
143145
144146 _ , isVenConn := preflightClient .(* client.VenConnClient )
145147 if isVenConn {
146- group .Go (func () error {
147- err := preflightClient .(manager.Runnable ).Start (gctx )
148+ group .Go (func () (err error ) {
149+ log := log .WithName ("VenConnClient" )
150+ log .Info ("Starting" )
151+ defer func () {
152+ log .Info ("Stopped" , "reason" , err )
153+ }()
154+ err = preflightClient .(manager.Runnable ).Start (gctx )
148155 if err != nil {
149156 return fmt .Errorf ("failed to start a controller-runtime component: %v" , err )
150157 }
151-
152- // The agent must stop if the controller-runtime component stops.
153- cancel ()
158+ if err := context . Cause ( gctx ); err == nil {
159+ return fmt . Errorf ( "VenConnClient exited unexpectedly" )
160+ }
154161 return nil
155162 })
156163 }
@@ -177,16 +184,17 @@ func Run(cmd *cobra.Command, args []string) error {
177184 return fmt .Errorf ("failed to instantiate %q data gatherer %q: %v" , kind , dgConfig .Name , err )
178185 }
179186
180- log .Info ("Starting datagatherer" , "gatherer" , dgConfig .Name )
181-
182187 // start the data gatherers and wait for the cache sync
183- group .Go (func () error {
188+ group .Go (func () (err error ) {
189+ log := log .WithName ("DataGatherer.Run" ).WithValues ("DataGatherer.name" , dgConfig .Name )
190+ log .V (1 ).Info ("Starting" )
191+ defer func () {
192+ log .V (1 ).Info ("Stopped" , "reason" , err )
193+ }()
184194 if err := newDg .Run (gctx .Done ()); err != nil {
185195 return fmt .Errorf ("failed to start %q data gatherer %q: %v" , kind , dgConfig .Name , err )
186196 }
187- // The agent must stop if any of the data gatherers stops
188- cancel ()
189- return nil
197+ return context .Cause (gctx )
190198 })
191199
192200 // regardless of success, this data gatherer has been given a
@@ -225,10 +233,28 @@ func Run(cmd *cobra.Command, args []string) error {
225233 // TODO(wallrj): Pass a context to gatherAndOutputData, so that we don't
226234 // have to wait for it to finish before exiting the process.
227235 for {
228- if err := gatherAndOutputData (ctx , eventf , config , preflightClient , dataGatherers ); err != nil {
229- return err
236+ timeLimit := time .Second * 5
237+ timeoutCTX , cancelTimeout := context .WithTimeoutCause (gctx , time .Second * 5 , fmt .Errorf ("timeout after %s" , timeLimit ))
238+ defer cancelTimeout ()
239+
240+ cancelCTX , cancelCause := context .WithCancelCause (timeoutCTX )
241+ go func () {
242+ err := gatherAndOutputData (cancelCTX , eventf , config , preflightClient , dataGatherers )
243+ cancelCause (err )
244+ }()
245+
246+ select {
247+ case <- cancelCTX .Done ():
248+ err := context .Cause (cancelCTX )
249+ if err != nil && ! errors .Is (err , context .Canceled ) {
250+ return fmt .Errorf ("gatherAndOutputData: %s" , err )
251+ }
252+ case <- timeoutCTX .Done ():
253+ return fmt .Errorf ("gatherAndOutputData: %s" , context .Cause (timeoutCTX ))
230254 }
231255
256+ cancelTimeout ()
257+
232258 if config .OneShot {
233259 break
234260 }
@@ -346,7 +372,7 @@ func gatherData(ctx context.Context, config CombinedConfig, dataGatherers map[st
346372 continue
347373 }
348374
349- log .Info ("Successfully gathered data" , "gatherer" , k , "count" , count )
375+ log .V ( 1 ). Info ("Successfully gathered data" , "gatherer" , k , "count" , count )
350376
351377 readings = append (readings , & api.DataReading {
352378 ClusterID : config .ClusterID ,
0 commit comments