Commit 490f419

Add temporary fix for Fleet policy issue (#17682)
1 parent 010fc43 commit 490f419

2 files changed: +68 -40 lines changed

integrationservertest/internal/gen/generator.go

Lines changed: 68 additions & 39 deletions
@@ -19,7 +19,9 @@ package gen
 
 import (
     "context"
+    "crypto/rand"
     "encoding/json"
+    "errors"
     "fmt"
     "io"
     "net/http"
@@ -54,14 +56,64 @@ func New(url, apikey string, kbc *kibana.Client, logger *zap.Logger) *Generator
     }
 }
 
-func (g *Generator) waitForAPMToBePublishReady(ctx context.Context, maxWaitDuration time.Duration) error {
+// RunBlockingWait runs the underlying generator in blocking mode and waits for all in-flight
+// data to be flushed before proceeding. This allows the caller to ensure than 1m aggregation
+// metrics are ingested immediately after raw data ingestion, without variable delays.
+// This may lead to data loss if the final flush takes more than 30s, which may happen if the
+// quantity of data ingested with runBlocking gets too big. The current quantity does not
+// trigger this behavior.
+func (g *Generator) RunBlockingWait(ctx context.Context, version ech.Version, integrations bool) error {
+    g.logger.Info("wait for apm server to be ready")
+    if err := g.waitForAPMToBePublishReady(ctx); err != nil {
+        // If the APM server is not ready, we likely ran into an issue.
+        // For example, see https://github.com/elastic/apm-server/issues/17605.
+        // We can try to temporarily resolve it by re-applying the Elastic APM policy,
+        // and wait again.
+        //
+        // NOTE: This retry only works if there is integrations server, otherwise
+        // simply do nothing.
+        if !integrations {
+            return fmt.Errorf("failed to wait for apm server: %w", err)
+        }
+        if err = g.reapplyAPMPolicy(ctx, version); err != nil {
+            return fmt.Errorf("failed to re-apply apm policy: %w", err)
+        }
+        if err = g.waitForAPMToBePublishReady(ctx); err != nil {
+            return fmt.Errorf("failed to wait for apm server: %w", err)
+        }
+    }
+
+    g.logger.Info("ingest data")
+    if err := g.runBlocking(ctx, version); err != nil {
+        return fmt.Errorf("cannot run generator: %w", err)
+    }
+
+    // With Fleet managed APM server, we can trigger metrics flush.
+    if integrations {
+        g.logger.Info("flush apm metrics")
+        if err := g.flushAPMMetrics(ctx, version); err != nil {
+            return fmt.Errorf("cannot flush apm metrics: %w", err)
+        }
+        return nil
+    }
+
+    // With standalone, we don't have Fleet, so simply just wait for some arbitrary time.
+    time.Sleep(180 * time.Second)
+    return nil
+}
+
+// waitForAPMToBePublishReady waits for APM server to be publish-ready by querying the server.
+func (g *Generator) waitForAPMToBePublishReady(ctx context.Context) error {
+    maxWaitDuration := 60 * time.Second
     timer := time.NewTimer(maxWaitDuration)
     defer timer.Stop()
 
     for {
         select {
+        case <-ctx.Done():
+            return errors.New("apm server not ready but context done")
         case <-timer.C:
-            return fmt.Errorf("apm server not yet ready after %s", maxWaitDuration)
+            return fmt.Errorf("apm server not ready after %s", maxWaitDuration)
         default:
             info, err := queryAPMInfo(ctx, g.apmServerURL, g.apmAPIKey)
             if err != nil {
@@ -72,15 +124,14 @@ func (g *Generator) waitForAPMToBePublishReady(ctx context.Context, maxWaitDurat
                 return nil
             }
 
-            time.Sleep(1 * time.Second)
+            time.Sleep(10 * time.Second)
         }
     }
 }
 
 // runBlocking runs the underlying generator in blocking mode.
 func (g *Generator) runBlocking(ctx context.Context, version ech.Version) error {
     eventRate := "1000/s"
-
     cfg := telemetrygen.DefaultConfig()
     cfg.APIKey = g.apmAPIKey
     cfg.TargetStackVersion = supportedstacks.TargetStackVersionLatest
@@ -104,53 +155,31 @@ func (g *Generator) runBlocking(ctx context.Context, version ech.Version) error
         return fmt.Errorf("cannot create telemetrygen generator: %w", err)
     }
 
-    g.logger.Info("wait for apm server to be ready")
-    if err = g.waitForAPMToBePublishReady(ctx, 30*time.Second); err != nil {
-        return err
-    }
-
-    g.logger.Info("ingest data")
     gen.Logger = g.logger
     return gen.RunBlocking(ctx)
 }
 
-// RunBlockingWait runs the underlying generator in blocking mode and waits for all in-flight
-// data to be flushed before proceeding. This allows the caller to ensure than 1m aggregation
-// metrics are ingested immediately after raw data ingestion, without variable delays.
-// This may lead to data loss if the final flush takes more than 30s, which may happen if the
-// quantity of data ingested with runBlocking gets too big. The current quantity does not
-// trigger this behavior.
-func (g *Generator) RunBlockingWait(ctx context.Context, version ech.Version, integrations bool) error {
-    if err := g.runBlocking(ctx, version); err != nil {
-        return fmt.Errorf("cannot run generator: %w", err)
-    }
+func (g *Generator) reapplyAPMPolicy(ctx context.Context, version ech.Version) error {
+    policyID := "elastic-cloud-apm"
+    description := fmt.Sprintf("%s %s", version, rand.Text()[5:])
 
-    // With Fleet managed APM server, we can trigger metrics flush.
-    if integrations {
-        if err := flushAPMMetrics(ctx, g.kbc, version); err != nil {
-            return fmt.Errorf("cannot flush apm metrics: %w", err)
-        }
-        return nil
+    if err := g.kbc.UpdatePackagePolicyDescriptionByID(ctx, policyID, version, description); err != nil {
+        return fmt.Errorf(
+            "cannot update %s package policy description: %w",
+            policyID, err,
+        )
     }
 
-    // With standalone, we don't have Fleet, so simply just wait for some arbitrary time.
-    time.Sleep(180 * time.Second)
     return nil
 }
 
 // flushAPMMetrics sends an update to the Fleet APM package policy in order
 // to trigger the flushing of in-flight APM metrics.
-func flushAPMMetrics(ctx context.Context, kbc *kibana.Client, version ech.Version) error {
-    policyID := "elastic-cloud-apm"
-    description := fmt.Sprintf("Integration server test %s", version)
-
-    // Sending an update with modifying the description is enough to trigger
-    // final aggregations in APM Server and flush of in-flight metrics.
-    if err := kbc.UpdatePackagePolicyDescriptionByID(ctx, policyID, version, description); err != nil {
-        return fmt.Errorf(
-            "cannot update %s package policy description to flush aggregation metrics: %w",
-            policyID, err,
-        )
+func (g *Generator) flushAPMMetrics(ctx context.Context, version ech.Version) error {
+    // Re-applying the Elastic APM policy is enough to trigger final aggregations
+    // in APM Server and flush of in-flight metrics.
+    if err := g.reapplyAPMPolicy(ctx, version); err != nil {
+        return err
     }
 
     // APM Server needs some time to flush all metrics, and we don't have any
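For orientation, the sketch below shows how the reworked entry point might be driven from a test. It is a hypothetical example, not code from this commit: the helper name, import paths, and test wiring are assumptions, and only gen.New and the RunBlockingWait signature are taken from the diff above.

package integrationservertest

import (
    "context"
    "testing"

    "go.uber.org/zap"

    // Assumed import paths, inferred from the file layout in this commit.
    "github.com/elastic/apm-server/integrationservertest/internal/ech"
    "github.com/elastic/apm-server/integrationservertest/internal/gen"
    "github.com/elastic/apm-server/integrationservertest/internal/kibana"
)

// ingestAndFlush is a hypothetical helper: RunBlockingWait now owns the
// readiness wait (including the policy re-apply retry), the ingestion, and the
// metrics flush (or the fixed 180s standalone wait), so a caller makes a single
// call instead of sequencing those steps itself.
func ingestAndFlush(t *testing.T, ctx context.Context, apmURL, apmAPIKey string,
    kbc *kibana.Client, version ech.Version, integrations bool) {
    t.Helper()
    g := gen.New(apmURL, apmAPIKey, kbc, zap.NewNop())
    if err := g.RunBlockingWait(ctx, version, integrations); err != nil {
        t.Fatalf("RunBlockingWait failed: %v", err)
    }
}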

integrationservertest/steps.go

Lines changed: 0 additions & 1 deletion
@@ -166,7 +166,6 @@ type ingestStep struct {
 }
 
 func (i ingestStep) Step(t *testing.T, ctx context.Context, e *testStepEnv) {
-
     if e.currentVersion().Major < 8 {
         t.Fatal("ingest step should only be used for versions >= 8.0")
     }
