@@ -43,8 +43,11 @@ const (
4343 // maxCheckinMisses is the maximum number of check-in misses a component can miss before it is killed
4444 // and restarted.
4545 maxCheckinMisses = 3
46- // diagnosticTimeout is the maximum amount of time to wait for a diagnostic response from a unit.
47- diagnosticTimeout = time .Minute
46+ // diagnosticTimeoutCPU is the maximum amount of time to wait for a diagnostic response from a unit while collecting CPU profiles
47+ diagnosticTimeoutCPU = time .Minute
48+
49+ // diagnosticTimeout is the maximum amount of time to wait for a diagnostic response from a unit
50+ diagnosticTimeout = time .Second * 20
4851
4952 // stopCheckRetryPeriod is the idle time between checks for component stopped state
5053 stopCheckRetryPeriod = 200 * time .Millisecond
@@ -944,7 +947,15 @@ func (m *Manager) getListenAddr() string {
944947// performDiagAction creates a diagnostic ActionRequest and executes it against the runtime that's mapped to the specified component.
945948// If the specified actionLevel is ActionRequest_COMPONENT, the unit field is ignored.
946949func (m * Manager ) performDiagAction (ctx context.Context , comp component.Component , unit component.Unit , actionLevel proto.ActionRequest_Level , params client.DiagnosticParams ) ([]* proto.ActionDiagnosticUnitResult , error ) {
947- ctx , cancel := context .WithTimeout (ctx , diagnosticTimeout )
950+ // if we're gathering CPU diagnostics, request a longer timeout; CPU diag collection requires the diagnostic hook to sit and gather a CPU profile.
951+ finalDiagnosticTime := diagnosticTimeout
952+ for _ , tag := range params .AdditionalMetrics {
953+ if tag == "CPU" {
954+ finalDiagnosticTime = diagnosticTimeoutCPU
955+ break
956+ }
957+ }
958+ ctx , cancel := context .WithTimeout (ctx , finalDiagnosticTime )
948959 defer cancel ()
949960
950961 id , err := uuid .NewV4 ()
@@ -966,7 +977,7 @@ func (m *Manager) performDiagAction(ctx context.Context, comp component.Componen
966977 }
967978
968979 if len (params .AdditionalMetrics ) > 0 {
969- m .logger .Debugf ("Performing diagnostic action with params: %v" , params .AdditionalMetrics )
980+ m .logger .Debugf ("Performing diagnostic action with params: %v; will wait %s " , params .AdditionalMetrics , finalDiagnosticTime )
970981 }
971982 marshalParams , err := json .Marshal (params )
972983 if err != nil {
@@ -989,7 +1000,7 @@ func (m *Manager) performDiagAction(ctx context.Context, comp component.Componen
9891000 // the only way this can return an error is the context being Done(), so be sure to make that explicit.
9901001 if err != nil {
9911002 if errors .Is (context .DeadlineExceeded , err ) {
992- return nil , fmt .Errorf ("diagnostic action timed out, deadline is %s: %w" , diagnosticTimeout , err )
1003+ return nil , fmt .Errorf ("diagnostic action timed out, deadline is %s: %w" , finalDiagnosticTime , err )
9931004 }
9941005 return nil , fmt .Errorf ("error running performAction: %w" , err )
9951006 }
0 commit comments