From 95c409be9eef7e2f9c5b9d659a81a7107e3ae4b1 Mon Sep 17 00:00:00 2001
From: Yang Wang <yang.wang@elastic.co>
Date: Mon, 14 Jul 2025 12:04:38 +1000
Subject: [PATCH 1/3] [Test] Wait on master node for shard started

The shard-started state may not yet be visible on the master node if the
wait is performed on a data node. In that case, the DiskThreshold
monitor may use a stale cluster state when releasing read-only blocks.
This PR fixes it by waiting on the master node, which was the behaviour
before #129872.

Resolves: #131146
---
 .../allocation/DiskThresholdMonitorIT.java    | 21 +++++++------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorIT.java
index a4533a674fe70..1a37cd4e2fa00 100644
--- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorIT.java
+++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorIT.java
@@ -13,6 +13,7 @@
 import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse;
 import org.elasticsearch.cluster.DiskUsageIntegTestCase;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
+import org.elasticsearch.cluster.metadata.ProjectId;
 import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.cluster.routing.ShardRoutingState;
@@ -86,20 +87,12 @@ public void testFloodStageExceeded() throws Exception {
         // Verify that we can still move shards around even while blocked
         final String newDataNodeName = internalCluster().startDataOnlyNode();
         final String newDataNodeId = clusterAdmin().prepareNodesInfo(newDataNodeName).get().getNodes().get(0).getNode().getId();
-        assertBusy(() -> {
-            final ShardRouting primaryShard = clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT)
-                .clear()
-                .setRoutingTable(true)
-                .setNodes(true)
-                .setIndices(indexName)
-                .get()
-                .getState()
-                .routingTable()
-                .index(indexName)
-                .shard(0)
-                .primaryShard();
-            assertThat(primaryShard.state(), equalTo(ShardRoutingState.STARTED));
-            assertThat(primaryShard.currentNodeId(), equalTo(newDataNodeId));
+        awaitClusterState(state -> {
+            final ShardRouting primaryShard = state.routingTable(ProjectId.DEFAULT).index(indexName).shard(0).primaryShard();
+            if (primaryShard.state() != ShardRoutingState.STARTED) {
+                return false;
+            }
+            return newDataNodeId.equals(primaryShard.currentNodeId());
         });
 
         // Verify that the block is removed once the shard migration is complete

From acd89698c3ad2fd60370f765edcb195c2561773c Mon Sep 17 00:00:00 2001
From: Yang Wang <ywangd@gmail.com>
Date: Mon, 14 Jul 2025 17:33:41 +1000
Subject: [PATCH 2/3] Update
 server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorIT.java

Co-authored-by: David Turner <david.turner@elastic.co>
---
 .../cluster/routing/allocation/DiskThresholdMonitorIT.java   | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorIT.java
index 1a37cd4e2fa00..bb19c9d477a45 100644
--- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorIT.java
+++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorIT.java
@@ -89,10 +89,7 @@ public void testFloodStageExceeded() throws Exception {
         final String newDataNodeId = clusterAdmin().prepareNodesInfo(newDataNodeName).get().getNodes().get(0).getNode().getId();
         awaitClusterState(state -> {
             final ShardRouting primaryShard = state.routingTable(ProjectId.DEFAULT).index(indexName).shard(0).primaryShard();
-            if (primaryShard.state() != ShardRoutingState.STARTED) {
-                return false;
-            }
-            return newDataNodeId.equals(primaryShard.currentNodeId());
+            return primaryShard.state() == ShardRoutingState.STARTED && newDataNodeId.equals(primaryShard.currentNodeId());
         });
 
         // Verify that the block is removed once the shard migration is complete

From b713053f0f0eda01c11464b2d0252b665d86802d Mon Sep 17 00:00:00 2001
From: Yang Wang <yang.wang@elastic.co>
Date: Tue, 15 Jul 2025 09:05:24 +1000
Subject: [PATCH 3/3] unmute

---
 muted-tests.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/muted-tests.yml b/muted-tests.yml
index 5487810e2fe1e..6b52f2456dc1e 100644
--- a/muted-tests.yml
+++ b/muted-tests.yml
@@ -535,9 +535,6 @@ tests:
 - class: org.elasticsearch.test.rest.yaml.RcsCcsCommonYamlTestSuiteIT
   method: test {p0=field_caps/40_time_series/Get simple time series field caps}
   issue: https://github.com/elastic/elasticsearch/issues/131225
-- class: org.elasticsearch.cluster.routing.allocation.DiskThresholdMonitorIT
-  method: testFloodStageExceeded
-  issue: https://github.com/elastic/elasticsearch/issues/131146
 - class: org.elasticsearch.packaging.test.DockerTests
   method: test090SecurityCliPackaging
   issue: https://github.com/elastic/elasticsearch/issues/131107