Skip to content

Commit 3afa073

Browse files
Use different timeout values for CPU and non-CPU diagnostic requests (elastic#3794)
* use different timeout values for diagnostics * update readme, add break --------- Co-authored-by: Pierre HILBERT <[email protected]>
1 parent 80c29c0 commit 3afa073

File tree

2 files changed

+48
-5
lines changed

2 files changed

+48
-5
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Kind can be one of:
2+
# - breaking-change: a change to previously-documented behavior
3+
# - deprecation: functionality that is being removed in a later release
4+
# - bug-fix: fixes a problem in a previous version
5+
# - enhancement: extends functionality but does not break or fix existing behavior
6+
# - feature: new functionality
7+
# - known-issue: problems that we are aware of in a given version
8+
# - security: impacts on the security of a product or a user’s deployment.
9+
# - upgrade: important information for someone upgrading from a prior version
10+
# - other: does not fit into any of the other categories
11+
kind: enhancement
12+
13+
# Change summary; a 80ish characters long description of the change.
14+
summary: Use shorter timeouts for diagnostic requests unless CPU diagnostics are requested
15+
16+
# Long description; in case the summary is not enough to describe the change
17+
# this field accommodate a description without length limits.
18+
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
19+
description: Use different timeout values for diagnostics requests, depending on if CPU diagnostics are requested
20+
21+
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
22+
component: diagnostics
23+
24+
# PR URL; optional; the PR number that added the changeset.
25+
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
26+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
27+
# Please provide it if you are adding a fragment for a different PR.
28+
#pr: https://github.com/owner/repo/1234
29+
30+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
31+
# If not present is automatically filled by the tooling with the issue linked to the PR number.
32+
issue: https://github.com/elastic/elastic-agent/issues/3197

pkg/component/runtime/manager.go

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,11 @@ const (
4343
// maxCheckinMisses is the maximum number of check-in misses a component can miss before it is killed
4444
// and restarted.
4545
maxCheckinMisses = 3
46-
// diagnosticTimeout is the maximum amount of time to wait for a diagnostic response from a unit.
47-
diagnosticTimeout = time.Minute
46+
// diagnosticTimeoutCPU is the maximum amount of time to wait for a diagnostic response from a unit while collecting CPU profiles
47+
diagnosticTimeoutCPU = time.Minute
48+
49+
// diagnosticTimeout is the maximum amount of time to wait for a diagnostic response from a unit
50+
diagnosticTimeout = time.Second * 20
4851

4952
// stopCheckRetryPeriod is a idle time between checks for component stopped state
5053
stopCheckRetryPeriod = 200 * time.Millisecond
@@ -944,7 +947,15 @@ func (m *Manager) getListenAddr() string {
944947
// performDiagAction creates a diagnostic ActionRequest and executes it against the runtime that's mapped to the specified component.
945948
// if the specified actionLevel is ActionRequest_COMPONENT, the unit field is ignored.
946949
func (m *Manager) performDiagAction(ctx context.Context, comp component.Component, unit component.Unit, actionLevel proto.ActionRequest_Level, params client.DiagnosticParams) ([]*proto.ActionDiagnosticUnitResult, error) {
947-
ctx, cancel := context.WithTimeout(ctx, diagnosticTimeout)
950+
// if we're gathering CPU diagnostics, request a longer timeout; CPU diag collection requires the diagnostic hook to sit and gather a CPU profile.
951+
finalDiagnosticTime := diagnosticTimeout
952+
for _, tag := range params.AdditionalMetrics {
953+
if tag == "CPU" {
954+
finalDiagnosticTime = diagnosticTimeoutCPU
955+
break
956+
}
957+
}
958+
ctx, cancel := context.WithTimeout(ctx, finalDiagnosticTime)
948959
defer cancel()
949960

950961
id, err := uuid.NewV4()
@@ -966,7 +977,7 @@ func (m *Manager) performDiagAction(ctx context.Context, comp component.Componen
966977
}
967978

968979
if len(params.AdditionalMetrics) > 0 {
969-
m.logger.Debugf("Performing diagnostic action with params: %v", params.AdditionalMetrics)
980+
m.logger.Debugf("Performing diagnostic action with params: %v; will wait %s", params.AdditionalMetrics, finalDiagnosticTime)
970981
}
971982
marshalParams, err := json.Marshal(params)
972983
if err != nil {
@@ -989,7 +1000,7 @@ func (m *Manager) performDiagAction(ctx context.Context, comp component.Componen
9891000
// the only way this can return an error is a context Done(), be sure to make that explicit.
9901001
if err != nil {
9911002
if errors.Is(context.DeadlineExceeded, err) {
992-
return nil, fmt.Errorf("diagnostic action timed out, deadline is %s: %w", diagnosticTimeout, err)
1003+
return nil, fmt.Errorf("diagnostic action timed out, deadline is %s: %w", finalDiagnosticTime, err)
9931004
}
9941005
return nil, fmt.Errorf("error running performAction: %w", err)
9951006
}

0 commit comments

Comments
 (0)