Skip to content

Commit 6217a92

Browse files
Fix flaky integration tests for the monitoring endpoint (#10004) (#10017)
* Fix monitoring reloading tests

  Ensure we verify the policy update was actually applied before querying the monitoring endpoint. Add a retry on said request as well.

* Use the explicit policy revision returned by Kibana

(cherry picked from commit 5372f24)

Co-authored-by: Mikołaj Świątek <[email protected]>
1 parent c7a11ac commit 6217a92

File tree

2 files changed

+38
-33
lines changed

2 files changed

+38
-33
lines changed

testing/integration/ess/monitoring_probe_preserve_text_cfg_test.go

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,13 @@
77
package ess
88

99
import (
10-
"bytes"
1110
"context"
12-
"encoding/json"
13-
"fmt"
1411
"net/http"
1512
"testing"
1613
"time"
1714

1815
"github.com/gofrs/uuid/v5"
16+
"github.com/stretchr/testify/assert"
1917
"github.com/stretchr/testify/require"
2018
"github.com/stretchr/testify/suite"
2119

@@ -67,6 +65,7 @@ type MonitoringTextRunner struct {
6765
suite.Suite
6866
info *define.Info
6967
agentFixture *atesting.Fixture
68+
agentID string
7069

7170
ESHost string
7271

@@ -121,11 +120,12 @@ func (runner *MonitoringTextRunner) SetupSuite() {
121120
err = runner.agentFixture.WriteFileToWorkDir(ctx, defaultTextCfg, "elastic-agent.yml")
122121
require.NoError(runner.T(), err)
123122

124-
policyResp, _, err := tools.InstallAgentWithPolicy(ctx, runner.T(), installOpts, runner.agentFixture, runner.info.KibanaClient, basePolicy)
123+
policyResp, agentID, err := tools.InstallAgentWithPolicy(ctx, runner.T(), installOpts, runner.agentFixture, runner.info.KibanaClient, basePolicy)
125124
require.NoError(runner.T(), err)
126125

127126
runner.policyID = policyResp.ID
128127
runner.policyName = basePolicy.Name
128+
runner.agentID = agentID
129129

130130
_, err = tools.InstallPackageFromDefaultFile(ctx, runner.info.KibanaClient, "system",
131131
integration.PreinstalledPackages["system"], "testdata/system_integration_setup.json", uuid.Must(uuid.NewV4()).String(), policyResp.ID)
@@ -149,10 +149,10 @@ func (runner *MonitoringTextRunner) TestMonitoringLiveness() {
149149
require.Equal(runner.T(), http.StatusOK, initResp.StatusCode)
150150

151151
// use the fleet override API to change the port that we're running on.
152-
override := map[string]interface{}{
153-
"name": runner.policyName,
154-
"namespace": "default",
155-
"overrides": map[string]interface{}{
152+
overrideUpdateRequest := kibana.AgentPolicyUpdateRequest{
153+
Name: runner.policyName,
154+
Namespace: "default",
155+
Overrides: map[string]interface{}{
156156
"agent": map[string]interface{}{
157157
"monitoring": map[string]interface{}{
158158
"http": map[string]interface{}{
@@ -165,20 +165,23 @@ func (runner *MonitoringTextRunner) TestMonitoringLiveness() {
165165
},
166166
}
167167

168-
raw, err := json.Marshal(override)
168+
policyResponse, err := runner.info.KibanaClient.UpdatePolicy(ctx, runner.policyID, overrideUpdateRequest)
169169
require.NoError(runner.T(), err)
170-
reader := bytes.NewBuffer(raw)
171-
overrideEndpoint := fmt.Sprintf("/api/fleet/agent_policies/%s", runner.policyID)
172-
statusCode, overrideResp, err := runner.info.KibanaClient.Request("PUT", overrideEndpoint, nil, nil, reader)
173-
require.NoError(runner.T(), err)
174-
require.Equal(runner.T(), http.StatusOK, statusCode, "non-200 status code; got response: %s", string(overrideResp))
170+
171+
// verify the new policy revision was applied
172+
require.Eventually(
173+
runner.T(),
174+
tools.IsPolicyRevision(ctx, runner.T(), runner.info.KibanaClient, runner.agentID, policyResponse.Revision),
175+
5*time.Minute, time.Second)
175176

176177
runner.AllComponentsHealthy(ctx)
177178

178179
updatedEndpoint := "http://localhost:6792/processes"
179180
// second stage: ensure the HTTP config has updated
180-
req, err = http.NewRequestWithContext(ctx, "GET", updatedEndpoint, nil)
181-
require.NoError(runner.T(), err)
181+
require.EventuallyWithT(runner.T(), func(collect *assert.CollectT) {
182+
req, err = http.NewRequestWithContext(ctx, "GET", updatedEndpoint, nil)
183+
require.NoError(collect, err)
184+
}, time.Minute, time.Second)
182185

183186
initResp, err = client.Do(req)
184187
require.NoError(runner.T(), err)

testing/integration/ess/monitoring_probe_reload_test.go

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,14 @@
77
package ess
88

99
import (
10-
"bytes"
1110
"context"
12-
"encoding/json"
1311
"fmt"
1412
"net/http"
1513
"testing"
1614
"time"
1715

1816
"github.com/gofrs/uuid/v5"
17+
"github.com/stretchr/testify/assert"
1918
"github.com/stretchr/testify/require"
2019
"github.com/stretchr/testify/suite"
2120

@@ -31,6 +30,7 @@ type MonitoringRunner struct {
3130
suite.Suite
3231
info *define.Info
3332
agentFixture *atesting.Fixture
33+
agentID string
3434

3535
ESHost string
3636

@@ -81,11 +81,12 @@ func (runner *MonitoringRunner) SetupSuite() {
8181
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
8282
defer cancel()
8383

84-
policyResp, _, err := tools.InstallAgentWithPolicy(ctx, runner.T(), installOpts, runner.agentFixture, runner.info.KibanaClient, basePolicy)
84+
policyResp, agentID, err := tools.InstallAgentWithPolicy(ctx, runner.T(), installOpts, runner.agentFixture, runner.info.KibanaClient, basePolicy)
8585
require.NoError(runner.T(), err)
8686

8787
runner.policyID = policyResp.ID
8888
runner.policyName = basePolicy.Name
89+
runner.agentID = agentID
8990

9091
_, err = tools.InstallPackageFromDefaultFile(ctx, runner.info.KibanaClient, "system",
9192
integration.PreinstalledPackages["system"], "testdata/system_integration_setup.json", uuid.Must(uuid.NewV4()).String(), policyResp.ID)
@@ -107,12 +108,10 @@ func (runner *MonitoringRunner) TestMonitoringLiveness() {
107108
_, err = client.Do(req)
108109
require.Error(runner.T(), err)
109110

110-
// use the fleet override API to enable http monitoring.
111-
// This tests both the http server itself, and tests that the agent reloader actually reloads the agent config.
112-
override := map[string]interface{}{
113-
"name": runner.policyName,
114-
"namespace": "default",
115-
"overrides": map[string]interface{}{
111+
overrideUpdateRequest := kibana.AgentPolicyUpdateRequest{
112+
Name: runner.policyName,
113+
Namespace: "default",
114+
Overrides: map[string]interface{}{
116115
"agent": map[string]interface{}{
117116
"monitoring": map[string]interface{}{
118117
"http": map[string]interface{}{
@@ -125,19 +124,22 @@ func (runner *MonitoringRunner) TestMonitoringLiveness() {
125124
},
126125
}
127126

128-
raw, err := json.Marshal(override)
127+
policyResponse, err := runner.info.KibanaClient.UpdatePolicy(ctx, runner.policyID, overrideUpdateRequest)
129128
require.NoError(runner.T(), err)
130-
reader := bytes.NewBuffer(raw)
131-
overrideEndpoint := fmt.Sprintf("/api/fleet/agent_policies/%s", runner.policyID)
132-
statusCode, overrideResp, err := runner.info.KibanaClient.Request("PUT", overrideEndpoint, nil, nil, reader)
133-
require.NoError(runner.T(), err)
134-
require.Equal(runner.T(), http.StatusOK, statusCode, "non-200 status code; got response: %s", string(overrideResp))
129+
130+
// verify the new policy revision was applied
131+
require.Eventually(
132+
runner.T(),
133+
tools.IsPolicyRevision(ctx, runner.T(), runner.info.KibanaClient, runner.agentID, policyResponse.Revision),
134+
5*time.Minute, time.Second)
135135

136136
runner.AllComponentsHealthy(ctx)
137137

138138
// check to make sure that we now have a liveness probe response
139-
req, err = http.NewRequestWithContext(ctx, "GET", endpoint, nil)
140-
require.NoError(runner.T(), err)
139+
require.EventuallyWithT(runner.T(), func(collect *assert.CollectT) {
140+
req, err = http.NewRequestWithContext(ctx, "GET", endpoint, nil)
141+
require.NoError(collect, err)
142+
}, time.Minute, time.Second)
141143

142144
// second check: the /liveness endpoint should now be responding
143145
runner.CheckResponse(ctx, endpoint)

0 commit comments

Comments
 (0)