Randomize and update per-node limit in stress IT

ywangd · ywangd · commit 8a95b994818d · 2025-08-11T15:51:38.000+10:00
diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStressTestsIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStressTestsIT.java
@@ -60,6 +60,8 @@
 import org.elasticsearch.threadpool.ScalingExecutorBuilder;
 import org.elasticsearch.threadpool.TestThreadPool;
 import org.elasticsearch.threadpool.ThreadPool;
+import org.junit.After;
+import org.junit.Before;
 
 import java.nio.file.Path;
 import java.util.ArrayList;
@@ -92,7 +94,25 @@
 @LuceneTestCase.SuppressFileSystems(value = "HandleLimitFS") // we sometimes have >2048 open files
 public class SnapshotStressTestsIT extends AbstractSnapshotIntegTestCase {
 
+    private int initialShardSnapshotPerNodeLimit; // chosen in the @Before method below; passed to nodes via nodeSettings()
+
+    @Before
+    public void randomInitialShardSnapshotPerNodeLimit() {
+        initialShardSnapshotPerNodeLimit = between(0, 10); // feeds nodeSettings(); NOTE(review): confirm it is set before nodes start
+    }
+
+    @After
+    public void clearShardSnapshotPerNodeLimitSetting() {
+        // Remove the persistent per-node limit the test may have set; the teardown does not tolerate leftover persistent settings.
+        safeGet(
+            clusterAdmin().prepareUpdateSettings(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT)
+                .setPersistentSettings(Settings.builder().putNull(SnapshotsService.SHARD_SNAPSHOT_PER_NODE_LIMIT_SETTING.getKey()))
+                .execute()
+        );
+    }
+
     public void testRandomActivities() throws InterruptedException {
+        logger.info("--> initial shard snapshot per node limit: [{}]", initialShardSnapshotPerNodeLimit);
         final DiscoveryNodes discoveryNodes = clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT)
             .clear()
             .setNodes(true)
@@ -108,7 +128,7 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
         return Settings.builder()
             .put(super.nodeSettings(nodeOrdinal, otherSettings))
             .put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.ALL)
-            .put(SnapshotsService.SHARD_SNAPSHOT_PER_NODE_LIMIT_SETTING.getKey(), 5) // more aggressive limit
+            .put(SnapshotsService.SHARD_SNAPSHOT_PER_NODE_LIMIT_SETTING.getKey(), initialShardSnapshotPerNodeLimit)
             .build();
     }
 
@@ -341,6 +361,10 @@ public void run() throws InterruptedException {
                 startAllocationFiltering();
             }
 
+            if (randomBoolean()) {
+                startUpdateShardSnapshotPerNodeLimit();
+            }
+
             if (completedSnapshotLatch.await(30, TimeUnit.SECONDS)) {
                 logger.info("--> completed target snapshot count, finishing test");
             } else {
@@ -1394,6 +1418,43 @@ private void pollForAllocationFilterCompletion(
             }));
         }
 
+        private void startUpdateShardSnapshotPerNodeLimit() {
+            enqueueAction(() -> {
+                boolean rerun = true; // cleared only once the settings update has been submitted
+                try (TransferableReleasables localReleasables = new TransferableReleasables()) {
+                    if (usually()) { // skip this round; the finally block re-enqueues the action
+                        return;
+                    }
+
+                    if (localReleasables.add(blockNodeRestarts()) == null) { // null: node restarts could not be blocked
+                        return;
+                    }
+
+                    final Releasable releaseAll = localReleasables.transfer(); // closed by the response listener
+
+                    final int newLimit = between(0, 10);
+                    logger.info("--> updating shard snapshot per node limit to [{}]", newLimit);
+
+                    clusterAdmin().prepareUpdateSettings(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT)
+                        .setPersistentSettings(
+                            Settings.builder().put(SnapshotsService.SHARD_SNAPSHOT_PER_NODE_LIMIT_SETTING.getKey(), newLimit)
+                        )
+                        .execute(mustSucceed(response -> {
+                            assertTrue(response.isAcknowledged());
+                            logger.info("--> updated shard snapshot per node limit to [{}]", newLimit);
+                            Releasables.close(releaseAll);
+                            startUpdateShardSnapshotPerNodeLimit(); // schedule the next update
+                        }));
+
+                    rerun = false; // the listener above takes over rescheduling
+                } finally {
+                    if (rerun) {
+                        startUpdateShardSnapshotPerNodeLimit();
+                    }
+                }
+            });
+        }
+
         @Nullable // if we couldn't block node restarts
         private Releasable blockNodeRestarts() {
             try (TransferableReleasables localReleasables = new TransferableReleasables()) {
diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
@@ -258,7 +258,9 @@ public SnapshotsService(
             maxConcurrentOperations = MAX_CONCURRENT_SNAPSHOT_OPERATIONS_SETTING.get(settings);
             clusterService.getClusterSettings()
                 .addSettingsUpdateConsumer(MAX_CONCURRENT_SNAPSHOT_OPERATIONS_SETTING, i -> maxConcurrentOperations = i);
-            shardSnapshotPerNodeLimit = SHARD_SNAPSHOT_PER_NODE_LIMIT_SETTING.get(settings);
+            clusterService.getClusterSettings()
+                .initializeAndWatch(SHARD_SNAPSHOT_PER_NODE_LIMIT_SETTING, i -> shardSnapshotPerNodeLimit = i);
+
         }
         this.systemIndices = systemIndices;
         this.serializeProjectMetadata = serializeProjectMetadata;