Do not treat replica as unassigned if new and below time threshold. (#112066)

parkertimmins · web-flow · commit b776cf6460c1 · 2024-08-28T14:18:50.000-05:00
Changes the way we calculate if all replicas are unassigned when primary is recently created.
This change will only be used in serverless, not in stateful. When a primary is new, if the primary
is active but the replica has been unassigned for less than a buffer time period, do not treat it as unassigned.
Control time period through health.shards_availability.replica_unassigned_buffer_time setting.
diff --git a/docs/changelog/112066.yaml b/docs/changelog/112066.yaml
@@ -0,0 +1,6 @@
+pr: 112066
+summary: Do not treat replica as unassigned if primary recently created and unassigned
+  time is below a threshold
+area: Health
+type: enhancement
+issues: []
diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java
@@ -40,9 +40,11 @@
 import org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.settings.ClusterSettings;
+import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.set.Sets;
 import org.elasticsearch.core.Nullable;
+import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.health.Diagnosis;
 import org.elasticsearch.health.HealthIndicatorDetails;
 import org.elasticsearch.health.HealthIndicatorImpact;
@@ -56,6 +58,7 @@
 import org.elasticsearch.snapshots.SearchableSnapshotsSettings;
 import org.elasticsearch.snapshots.SnapshotShardSizeInfo;
 
+import java.time.Instant;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -108,11 +111,29 @@ public class ShardsAvailabilityHealthIndicatorService implements HealthIndicator
 
     private static final String DATA_TIER_ALLOCATION_DECIDER_NAME = "data_tier";
 
+    /**
+     * Changes the behavior of isNewlyCreatedAndInitializingReplica so that the
+     * shard_availability health indicator returns YELLOW if a primary
+     * is STARTED, but a replica is still INITIALIZING and the replica has been
+     * unassigned for less than the value of this setting. This function is
+     * only used in serverless, so this setting has no effect in stateful.
+     */
+    public static final Setting<TimeValue> REPLICA_UNASSIGNED_BUFFER_TIME = Setting.timeSetting(
+        "health.shards_availability.replica_unassigned_buffer_time",
+        TimeValue.timeValueSeconds(3),
+        TimeValue.timeValueSeconds(0),
+        TimeValue.timeValueSeconds(20),
+        Setting.Property.NodeScope,
+        Setting.Property.Dynamic
+    );
+
     private final ClusterService clusterService;
     private final AllocationService allocationService;
 
     private final SystemIndices systemIndices;
 
+    private volatile TimeValue replicaUnassignedBufferTime = TimeValue.timeValueSeconds(0);
+
     public ShardsAvailabilityHealthIndicatorService(
         ClusterService clusterService,
         AllocationService allocationService,
@@ -121,6 +142,11 @@ public ShardsAvailabilityHealthIndicatorService(
         this.clusterService = clusterService;
         this.allocationService = allocationService;
         this.systemIndices = systemIndices;
+        clusterService.getClusterSettings().addSettingsUpdateConsumer(REPLICA_UNASSIGNED_BUFFER_TIME, this::setReplicaUnassignedBufferTime);
+    }
+
+    private void setReplicaUnassignedBufferTime(TimeValue replicaUnassignedBufferTime) {
+        this.replicaUnassignedBufferTime = replicaUnassignedBufferTime;
     }
 
     @Override
@@ -144,7 +170,7 @@ public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResources
         var state = clusterService.state();
         var shutdown = state.getMetadata().custom(NodesShutdownMetadata.TYPE, NodesShutdownMetadata.EMPTY);
         var status = createNewStatus(state.getMetadata());
-        updateShardAllocationStatus(status, state, shutdown, verbose);
+        updateShardAllocationStatus(status, state, shutdown, verbose, replicaUnassignedBufferTime);
         return createIndicator(
             status.getStatus(),
             status.getSymptom(),
@@ -158,14 +184,15 @@ static void updateShardAllocationStatus(
         ShardAllocationStatus status,
         ClusterState state,
         NodesShutdownMetadata shutdown,
-        boolean verbose
+        boolean verbose,
+        TimeValue replicaUnassignedBufferTime
     ) {
         for (IndexRoutingTable indexShardRouting : state.routingTable()) {
             for (int i = 0; i < indexShardRouting.size(); i++) {
                 IndexShardRoutingTable shardRouting = indexShardRouting.shard(i);
                 status.addPrimary(shardRouting.primaryShard(), state, shutdown, verbose);
                 for (ShardRouting replicaShard : shardRouting.replicaShards()) {
-                    status.addReplica(replicaShard, state, shutdown, verbose);
+                    status.addReplica(replicaShard, state, shutdown, verbose, replicaUnassignedBufferTime);
                 }
             }
         }
@@ -438,11 +465,18 @@ public class ShardAllocationCounts {
         public SearchableSnapshotsState searchableSnapshotsState = new SearchableSnapshotsState();
         final Map<Diagnosis.Definition, Set<String>> diagnosisDefinitions = new HashMap<>();
 
-        public void increment(ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean verbose) {
+        public void increment(
+            ShardRouting routing,
+            ClusterState state,
+            NodesShutdownMetadata shutdowns,
+            boolean verbose,
+            TimeValue replicaUnassignedBufferTime
+        ) {
             boolean isNew = isUnassignedDueToNewInitialization(routing, state);
             boolean isRestarting = isUnassignedDueToTimelyRestart(routing, shutdowns);
+            long replicaUnassignedCutoffTime = Instant.now().toEpochMilli() - replicaUnassignedBufferTime.millis();
             boolean allUnavailable = areAllShardsOfThisTypeUnavailable(routing, state)
-                && isNewlyCreatedAndInitializingReplica(routing, state) == false;
+                && isNewlyCreatedAndInitializingReplica(routing, state, replicaUnassignedCutoffTime) == false;
             if (allUnavailable) {
                 indicesWithAllShardsUnavailable.add(routing.getIndexName());
             }
@@ -520,18 +554,23 @@ boolean areAllShardsOfThisTypeUnavailable(ShardRouting routing, ClusterState sta
      * (a newly created index having unassigned replicas for example), we don't want the cluster
      * to turn "unhealthy" for the tiny amount of time before the shards are allocated.
      */
-    static boolean isNewlyCreatedAndInitializingReplica(ShardRouting routing, ClusterState state) {
+    static boolean isNewlyCreatedAndInitializingReplica(ShardRouting routing, ClusterState state, long replicaUnassignedCutoffTime) {
         if (routing.active()) {
             return false;
         }
         if (routing.primary()) {
             return false;
         }
         ShardRouting primary = state.routingTable().shardRoutingTable(routing.shardId()).primaryShard();
-        if (primary.active()) {
-            return false;
+        if (primary.active() == false) {
+            return ClusterShardHealth.getInactivePrimaryHealth(primary) == ClusterHealthStatus.YELLOW;
         }
-        return ClusterShardHealth.getInactivePrimaryHealth(primary) == ClusterHealthStatus.YELLOW;
+
+        Optional<UnassignedInfo> ui = Optional.ofNullable(routing.unassignedInfo());
+        return ui.filter(info -> info.failedAllocations() == 0)
+            .filter(info -> info.lastAllocationStatus() != UnassignedInfo.AllocationStatus.DECIDERS_NO)
+            .filter(info -> info.unassignedTimeMillis() > replicaUnassignedCutoffTime)
+            .isPresent();
     }
 
     private static boolean isUnassignedDueToTimelyRestart(ShardRouting routing, NodesShutdownMetadata shutdowns) {
@@ -910,11 +949,17 @@ public ShardAllocationStatus(Metadata clusterMetadata) {
         }
 
         void addPrimary(ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean verbose) {
-            primaries.increment(routing, state, shutdowns, verbose);
+            primaries.increment(routing, state, shutdowns, verbose, TimeValue.MINUS_ONE);
         }
 
-        void addReplica(ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean verbose) {
-            replicas.increment(routing, state, shutdowns, verbose);
+        void addReplica(
+            ShardRouting routing,
+            ClusterState state,
+            NodesShutdownMetadata shutdowns,
+            boolean verbose,
+            TimeValue replicaUnassignedBufferTime
+        ) {
+            replicas.increment(routing, state, shutdowns, verbose, replicaUnassignedBufferTime);
         }
 
         void updateSearchableSnapshotsOfAvailableIndices() {
diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java
@@ -55,6 +55,7 @@
 import org.elasticsearch.cluster.routing.allocation.decider.SameShardAllocationDecider;
 import org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider;
 import org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider;
+import org.elasticsearch.cluster.routing.allocation.shards.ShardsAvailabilityHealthIndicatorService;
 import org.elasticsearch.cluster.service.ClusterApplierService;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.cluster.service.MasterService;
@@ -598,6 +599,7 @@ public void apply(Settings value, Settings current, Settings previous) {
         MergePolicyConfig.DEFAULT_MAX_TIME_BASED_MERGED_SEGMENT_SETTING,
         TransportService.ENABLE_STACK_OVERFLOW_AVOIDANCE,
         DataStreamGlobalRetentionSettings.DATA_STREAMS_DEFAULT_RETENTION_SETTING,
-        DataStreamGlobalRetentionSettings.DATA_STREAMS_MAX_RETENTION_SETTING
+        DataStreamGlobalRetentionSettings.DATA_STREAMS_MAX_RETENTION_SETTING,
+        ShardsAvailabilityHealthIndicatorService.REPLICA_UNASSIGNED_BUFFER_TIME
     );
 }
diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ShardsAvailabilityActionGuideTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ShardsAvailabilityActionGuideTests.java
@@ -10,6 +10,7 @@
 
 import org.elasticsearch.cluster.routing.allocation.shards.ShardsAvailabilityHealthIndicatorService;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.settings.ClusterSettings;
 import org.elasticsearch.indices.SystemIndices;
 import org.elasticsearch.test.ESTestCase;
 
@@ -33,14 +34,17 @@
 import static org.elasticsearch.cluster.routing.allocation.shards.ShardsAvailabilityHealthIndicatorService.TIER_CAPACITY_ACTION_GUIDE;
 import static org.hamcrest.Matchers.is;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
 
 public class ShardsAvailabilityActionGuideTests extends ESTestCase {
 
-    private final ShardsAvailabilityHealthIndicatorService service = new ShardsAvailabilityHealthIndicatorService(
-        mock(ClusterService.class),
-        mock(AllocationService.class),
-        mock(SystemIndices.class)
-    );
+    private final ShardsAvailabilityHealthIndicatorService service;
+
+    public ShardsAvailabilityActionGuideTests() {
+        ClusterService clusterService = mock(ClusterService.class);
+        when(clusterService.getClusterSettings()).thenReturn(ClusterSettings.createBuiltInClusterSettings());
+        service = new ShardsAvailabilityHealthIndicatorService(clusterService, mock(AllocationService.class), mock(SystemIndices.class));
+    }
 
     public void testRestoreFromSnapshotAction() {
         assertThat(ACTION_RESTORE_FROM_SNAPSHOT.helpURL(), is(RESTORE_FROM_SNAPSHOT_ACTION_GUIDE));
diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java