@@ -181,7 +181,7 @@ func NewOTelManager(
181181 beatMonitoringConfigGetter : beatMonitoringConfigGetter ,
182182 errCh : make (chan error , 1 ), // holds at most one error
183183 collectorStatusCh : make (chan * status.AggregateStatus , 1 ),
184- componentStateCh : make (chan []runtime.ComponentComponentState , 1 ),
184+ componentStateCh : make (chan []runtime.ComponentComponentState ),
185185 updateCh : make (chan configUpdate , 1 ),
186186 doneChan : make (chan struct {}),
187187 execution : exec ,
@@ -241,10 +241,6 @@ func (m *OTelManager) Run(ctx context.Context) error {
241241 if m .proc != nil {
242242 m .proc .Stop (m .stopTimeout )
243243 m .proc = nil
244- updateErr := m .reportOtelStatusUpdate (ctx , nil )
245- if updateErr != nil {
246- reportErr (ctx , m .errCh , updateErr )
247- }
248244 }
249245
250246 if m .mergedCollectorCfg == nil {
@@ -284,12 +280,6 @@ func (m *OTelManager) Run(ctx context.Context) error {
284280 if m .proc != nil {
285281 m .proc .Stop (m .stopTimeout )
286282 m .proc = nil
287- // don't wait here for <-collectorRunErr, already occurred
288- // clear status, no longer running
289- updateErr := m .reportOtelStatusUpdate (ctx , nil )
290- if updateErr != nil {
291- err = errors .Join (err , updateErr )
292- }
293283 }
294285 // pass the error to the errCh for the coordinator, unless it's a cancel error
295285 if ! errors .Is (err , context .Canceled ) {
@@ -429,30 +419,19 @@ func (m *OTelManager) applyMergedConfig(ctx context.Context, collectorStatusCh c
429419 if m .proc != nil {
430420 m .proc .Stop (m .stopTimeout )
431421 m .proc = nil
422+ // We wait here for the collector to exit before possibly starting a new one. The execution indicates this
423+ // by sending an error over the appropriate channel. It will also send a nil status that we'll either process
424+ // after exiting from this function and going back to the main loop, or it will be overridden by the status
425+ // from the newly started collector.
426+ // This is the only blocking wait inside the main loop involving channels, so we need to be extra careful not to
427+ // deadlock.
428+ // TODO: Verify if we need to wait for the error at all. Stop() is already blocking.
432429 select {
433430 case <- collectorRunErr :
434431 case <- ctx .Done ():
435432 // our caller ctx is Done
436433 return ctx .Err ()
437434 }
438- // drain the internal status update channel
439- // this status handling is normally done in the main loop, but in this case we want to ensure that we emit a
440- // nil status after the collector has stopped
441- select {
442- case statusCh := <- collectorStatusCh :
443- updateErr := m .reportOtelStatusUpdate (ctx , statusCh )
444- if updateErr != nil {
445- m .logger .Error ("failed to update otel status" , zap .Error (updateErr ))
446- }
447- case <- ctx .Done ():
448- // our caller ctx is Done
449- return ctx .Err ()
450- default :
451- }
452- err := m .reportOtelStatusUpdate (ctx , nil )
453- if err != nil {
454- return err
455- }
456435 }
457436
458437 if m .mergedCollectorCfg == nil {
@@ -625,18 +604,10 @@ func (m *OTelManager) maybeUpdateMergedConfig(mergedCfg *confmap.Conf) (updated
625604 return ! bytes .Equal (mergedCfgHash , previousConfigHash ) || err != nil , err
626605}
627606
628- // reportComponentStateUpdates sends component state updates to the component watch channel. It first drains
629- // the channel to ensure that only the most recent status is kept, as intermediate statuses can be safely discarded.
630- // This ensures the receiver always observes the latest reported status .
607+ // reportComponentStateUpdates sends component state updates to the component watch channel. It is synchronous and
608+ // blocking - the update must be received before this function returns. We are not allowed to drop older updates
609+ // in favor of newer ones here, as the coordinator expects incremental updates.
631610func (m * OTelManager ) reportComponentStateUpdates (ctx context.Context , componentUpdates []runtime.ComponentComponentState ) {
632- select {
633- case <- ctx .Done ():
634- // context is already done
635- return
636- case <- m .componentStateCh :
637- // drain the channel first
638- default :
639- }
640611 select {
641612 case m .componentStateCh <- componentUpdates :
642613 case <- ctx .Done ():
0 commit comments