Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions muted-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -504,9 +504,6 @@ tests:
- class: org.elasticsearch.xpack.ml.integration.RevertModelSnapshotIT
method: testRevertModelSnapshot
issue: https://github.com/elastic/elasticsearch/issues/132733
- class: org.elasticsearch.repositories.SnapshotMetricsIT
method: testSnapshotAPMMetrics
issue: https://github.com/elastic/elasticsearch/issues/132731
- class: org.elasticsearch.index.codec.vectors.cluster.HierarchicalKMeansTests
method: testHKmeans
issue: https://github.com/elastic/elasticsearch/issues/132771
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

package org.elasticsearch.repositories;

import org.elasticsearch.action.ActionFuture;
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse;
import org.elasticsearch.action.admin.indices.stats.IndexStats;
import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse;
import org.elasticsearch.action.admin.indices.stats.ShardStats;
Expand Down Expand Up @@ -114,11 +116,12 @@ public void testSnapshotAPMMetrics() throws Exception {
blockAllDataNodes(repositoryName);
final String snapshotName = randomIdentifier();
final long beforeCreateSnapshotNanos = System.nanoTime();
final ActionFuture<CreateSnapshotResponse> snapshotFuture;
try {
clusterAdmin().prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, snapshotName)
snapshotFuture = clusterAdmin().prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, snapshotName)
.setIndices(indexName)
.setWaitForCompletion(false)
.get();
.setWaitForCompletion(true)
.execute();

waitForBlockOnAnyDataNode(repositoryName);
collectMetrics();
Expand All @@ -132,7 +135,7 @@ public void testSnapshotAPMMetrics() throws Exception {
}

// wait for snapshot to finish to test the other metrics
awaitNumberOfSnapshotsInProgress(0);
safeGet(snapshotFuture);
Copy link
Contributor Author

@nicktindall nicktindall Aug 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if we wait for the response to be received that should mean the metrics should be written? (note waitForCompletion=true above now)

I'm running in a loop to try and confirm that now

Copy link
Contributor Author

@nicktindall nicktindall Aug 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This introduced a source of flakiness in the tension between the throttling and safeGet. If the setting was too high, we exceeded the safeGet timeout, if the setting was too low, we didn't see any throttling.

For this reason I've moved out the test for the throttling metrics to another test. It has more complexity to ensure we see throttling and we don't run too long.

final TimeValue snapshotElapsedTime = TimeValue.timeValueNanos(System.nanoTime() - beforeCreateSnapshotNanos);
collectMetrics();

Expand Down Expand Up @@ -237,11 +240,13 @@ public void testByStateCounts_InitAndQueuedShards() throws Exception {
blockAllDataNodes(repositoryName);

final String snapshotName = randomIdentifier();
final ActionFuture<CreateSnapshotResponse> firstSnapshotFuture;
final ActionFuture<CreateSnapshotResponse> secondSnapshotFuture;
try {
clusterAdmin().prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, snapshotName)
firstSnapshotFuture = clusterAdmin().prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, snapshotName)
.setIndices(indexName)
.setWaitForCompletion(false)
.get();
.setWaitForCompletion(true)
.execute();

waitForBlockOnAnyDataNode(repositoryName);

Expand All @@ -252,10 +257,12 @@ public void testByStateCounts_InitAndQueuedShards() throws Exception {
assertThat(snapshotStates.get(SnapshotsInProgress.State.STARTED), equalTo(1L));

// Queue up another snapshot
clusterAdmin().prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, randomIdentifier())
secondSnapshotFuture = clusterAdmin().prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, randomIdentifier())
.setIndices(indexName)
.setWaitForCompletion(false)
.get();
.setWaitForCompletion(true)
.execute();

awaitNumberOfSnapshotsInProgress(2);

// Should be {numShards} in QUEUED and INIT states, and 2 STARTED snapshots
shardStates = getShardStates();
Expand All @@ -268,7 +275,8 @@ public void testByStateCounts_InitAndQueuedShards() throws Exception {
}

// All statuses should return to zero when the snapshots complete
awaitNumberOfSnapshotsInProgress(0);
safeGet(firstSnapshotFuture);
safeGet(secondSnapshotFuture);
getShardStates().forEach((key, value) -> assertThat(value, equalTo(0L)));
getSnapshotStates().forEach((key, value) -> assertThat(value, equalTo(0L)));

Expand Down Expand Up @@ -309,12 +317,13 @@ public void testByStateCounts_PausedForRemovalShards() throws Exception {
blockNodeOnAnyFiles(repositoryName, nodeForRemoval);

final ClusterService clusterService = internalCluster().getCurrentMasterNodeInstance(ClusterService.class);
final ActionFuture<CreateSnapshotResponse> snapshotFuture;
try {
// Kick off a snapshot
clusterAdmin().prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, randomIdentifier())
snapshotFuture = clusterAdmin().prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, randomIdentifier())
.setIndices(indexName)
.setWaitForCompletion(false)
.get();
.setWaitForCompletion(true)
.execute();

// Wait till we're blocked
waitForBlock(nodeForRemoval, repositoryName);
Expand All @@ -337,7 +346,7 @@ public void testByStateCounts_PausedForRemovalShards() throws Exception {
clearShutdownMetadata(clusterService);

// All statuses should return to zero when the snapshot completes
awaitNumberOfSnapshotsInProgress(0);
safeGet(snapshotFuture);
getShardStates().forEach((key, value) -> assertThat(value, equalTo(0L)));
getSnapshotStates().forEach((key, value) -> assertThat(value, equalTo(0L)));

Expand Down Expand Up @@ -395,10 +404,14 @@ public void testByStateCounts_WaitingShards() throws Exception {
safeAwait(handoffRequestBarrier);

// Kick off a snapshot
clusterAdmin().prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, randomIdentifier())
.setIndices(indexName)
.setWaitForCompletion(false)
.get();
ActionFuture<CreateSnapshotResponse> snapshotFuture = clusterAdmin().prepareCreateSnapshot(
TEST_REQUEST_TIMEOUT,
repositoryName,
randomIdentifier()
).setIndices(indexName).setWaitForCompletion(true).execute();

// Wait for the snapshot to start
awaitNumberOfSnapshotsInProgress(1);

// Wait till we see a shard in WAITING state
createSnapshotInStateListener(clusterService(), repositoryName, indexName, 1, SnapshotsInProgress.ShardState.WAITING);
Expand All @@ -413,7 +426,7 @@ public void testByStateCounts_WaitingShards() throws Exception {
safeAwait(handoffRequestBarrier);

// All statuses should return to zero when the snapshot completes
awaitNumberOfSnapshotsInProgress(0);
safeGet(snapshotFuture);
getShardStates().forEach((key, value) -> assertThat(value, equalTo(0L)));
getSnapshotStates().forEach((key, value) -> assertThat(value, equalTo(0L)));

Expand Down