Use different timeout values for CPU and non-CPU diagnostic requests (elastic#3794)

fearful-symmetry · pierrehilbert · web-flow · commit 3afa073fdc44 · 2023-11-27T08:17:12.000-08:00
* use different timeout values for diagnostics

* update readme, add break

---------

Co-authored-by: Pierre HILBERT &lt;pierre.hilbert@elastic.co&gt;
diff --git a/changelog/fragments/1700520158-dynamic-timeout-for-diags.yaml b/changelog/fragments/1700520158-dynamic-timeout-for-diags.yaml
@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: enhancement
+
+# Change summary; a 80ish characters long description of the change.
+summary: Use shorter timeouts for diagnostic requests unless CPU diagnostics are requested
+
+# Long description; in case the summary is not enough to describe the change
+# this field accommodate a description without length limits.
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
+description: Use different timeout values for diagnostics requests, depending on if CPU diagnostics are requested
+
+# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
+component: diagnostics
+
+# PR URL; optional; the PR number that added the changeset.
+# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+#pr: https://github.com/owner/repo/1234
+
+# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present is automatically filled by the tooling with the issue linked to the PR number.
+issue: https://github.com/elastic/elastic-agent/issues/3197
diff --git a/pkg/component/runtime/manager.go b/pkg/component/runtime/manager.go
@@ -43,8 +43,11 @@ const (
 	// maxCheckinMisses is the maximum number of check-in misses a component can miss before it is killed
 	// and restarted.
 	maxCheckinMisses = 3
-	// diagnosticTimeout is the maximum amount of time to wait for a diagnostic response from a unit.
-	diagnosticTimeout = time.Minute
+	// diagnosticTimeoutCPU is the maximum amount of time to wait for a diagnostic response from a unit while collecting CPU profiles
+	diagnosticTimeoutCPU = time.Minute
+
+	// diagnosticTimeout is the maximum amount of time to wait for a diagnostic response from a unit
+	diagnosticTimeout = time.Second * 20
 
 	// stopCheckRetryPeriod is a idle time between checks for component stopped state
 	stopCheckRetryPeriod = 200 * time.Millisecond
@@ -944,7 +947,15 @@ func (m *Manager) getListenAddr() string {
 // performDiagAction creates a diagnostic ActionRequest and executes it against the runtime that's mapped to the specified component.
 // if the specified actionLevel is ActionRequest_COMPONENT, the unit field is ignored.
 func (m *Manager) performDiagAction(ctx context.Context, comp component.Component, unit component.Unit, actionLevel proto.ActionRequest_Level, params client.DiagnosticParams) ([]*proto.ActionDiagnosticUnitResult, error) {
-	ctx, cancel := context.WithTimeout(ctx, diagnosticTimeout)
+	// if we're gathering CPU diagnostics, request a longer timeout; CPU diag collection requires the diagnostic hook to sit and gather a CPU profile.
+	finalDiagnosticTime := diagnosticTimeout
+	for _, tag := range params.AdditionalMetrics {
+		if tag == "CPU" {
+			finalDiagnosticTime = diagnosticTimeoutCPU
+			break
+		}
+	}
+	ctx, cancel := context.WithTimeout(ctx, finalDiagnosticTime)
 	defer cancel()
 
 	id, err := uuid.NewV4()
@@ -966,7 +977,7 @@ func (m *Manager) performDiagAction(ctx context.Context, comp component.Componen
 	}
 
 	if len(params.AdditionalMetrics) > 0 {
-		m.logger.Debugf("Performing diagnostic action with params: %v", params.AdditionalMetrics)
+		m.logger.Debugf("Performing diagnostic action with params: %v; will wait %s", params.AdditionalMetrics, finalDiagnosticTime)
 	}
 	marshalParams, err := json.Marshal(params)
 	if err != nil {
@@ -989,7 +1000,7 @@ func (m *Manager) performDiagAction(ctx context.Context, comp component.Componen
 	// the only way this can return an error is a context Done(), be sure to make that explicit.
 	if err != nil {
 		if errors.Is(context.DeadlineExceeded, err) {
-			return nil, fmt.Errorf("diagnostic action timed out, deadline is %s: %w", diagnosticTimeout, err)
+			return nil, fmt.Errorf("diagnostic action timed out, deadline is %s: %w", finalDiagnosticTime, err)
 		}
 		return nil, fmt.Errorf("error running performAction: %w", err)
 	}