@@ -72,6 +72,8 @@ type Monitor struct {
7272
7373 // Limit for items per pagination query
7474 queryLimit int
75+ // Limit for goroutines per cluster Monitor instance
76+ parallelism int
7577}
7678
7779func NewMonitor (log * logrus.Entry , restConfig * rest.Config , oc * api.OpenShiftCluster , env env.Interface , tenantID string , m metrics.Emitter , hourlyRun bool ) (monitoring.Monitor , error ) {
@@ -160,6 +162,7 @@ func NewMonitor(log *logrus.Entry, restConfig *rest.Config, oc *api.OpenShiftClu
160162 ocpclientset : clienthelper .NewWithClient (log , ocpclientset ),
161163 namespacesToMonitor : []string {},
162164 queryLimit : 50 ,
165+ parallelism : MONITOR_GOROUTINES_PER_CLUSTER ,
163166 }
164167 mon .collectors = []collectorFunc {
165168 mon .emitAroOperatorHeartbeat ,
@@ -196,8 +199,16 @@ func NewMonitor(log *logrus.Entry, restConfig *rest.Config, oc *api.OpenShiftClu
196199}
197200
198201func (mon * Monitor ) timeCall (ctx context.Context , f func (context.Context ) error ) (err error ) {
199- innerNow := time .Now ()
200202 collectorName := steps .ShortName (f )
203+
204+ // Don't run collectors if the context is already done (timed out or cancelled)
205+ if ctx .Err () != nil {
206+ mon .log .Debugf ("skipping %s because %s:" , collectorName , ctx .Err ())
207+ mon .emitMonitorCollectorSkipped (collectorName )
208+ return & failureToRunClusterCollector {collectorName : collectorName , inner : ctx .Err ()}
209+ }
210+
211+ innerNow := time .Now ()
201212 mon .log .Debugf ("running %s" , collectorName )
202213
203214 // If the collector panics we should return the error (so that it bubbles
@@ -272,10 +283,10 @@ func (mon *Monitor) Monitor(ctx context.Context) (_err error) {
272283 return errors .Join (errs ... )
273284 }
274285
275- // Run up to MONITOR_GOROUTINES_PER_CLUSTER goroutines for collecting
276- // metrics
286+ // Run up to mon.parallelism (default: MONITOR_GOROUTINES_PER_CLUSTER)
287+ // goroutines for collecting metrics
277288 wg := new (errgroup.Group )
278- wg .SetLimit (MONITOR_GOROUTINES_PER_CLUSTER )
289+ wg .SetLimit (mon . parallelism )
279290
280291 // Create a channel capable of buffering one error from every collector
281292 errChan := make (chan error , len (mon .collectors ))
@@ -305,7 +316,7 @@ func (mon *Monitor) Monitor(ctx context.Context) (_err error) {
305316 }
306317
307318 // emit a metric with how long we took when we have no errors and the context is not done
308- if len (errs ) == 0 {
319+ if len (errs ) == 0 && ctx . Err () == nil {
309320 mon .emitFloat ("monitor.cluster.duration" , time .Since (now ).Seconds (), map [string ]string {})
310321 }
311322
@@ -316,6 +327,10 @@ func (mon *Monitor) emitMonitorCollectorError(collectorName string) {
316327 emitter .EmitGauge (mon .m , "monitor.cluster.collector.error" , 1 , mon .dims , map [string ]string {"collector" : collectorName })
317328}
318329
330+ func (mon * Monitor ) emitMonitorCollectorSkipped (collectorName string ) {
331+ emitter .EmitGauge (mon .m , "monitor.cluster.collector.skipped" , 1 , mon .dims , map [string ]string {"collector" : collectorName })
332+ }
333+
319334func (mon * Monitor ) emitMonitorCollectionTiming (collectorName string , duration float64 ) {
320335 emitter .EmitFloat (mon .m , "monitor.cluster.collector.duration" , duration , mon .dims , map [string ]string {"collector" : collectorName })
321336}
0 commit comments