Fork to WRITE thread when failing shard (#84606)

DaveCTurner · DaveCTurner · commit af7d32ce2c4d · 2022-03-03T10:49:18.000Z
If a replication operation fails then the primary will try to fail the replica. If this operation also fails (i.e. another shard copy has been promoted to primary) then the primary must fail itself. Today it does this on a transport thread. This is a problem because failing the shard needs to acquire a lock which may be held by another operation that is performing some potentially long-running IO. With this commit we add an assertion that the engine is never failed on a transport thread, and adjust `ReplicationOperation` to fork the call to `failShard` to the `WRITE` threadpool. Without the change to `ReplicationOperation` the assertion is tripped by `testAckedIndexing`. Closes #84602
diff --git a/docs/changelog/84606.yaml b/docs/changelog/84606.yaml
@@ -0,0 +1,6 @@
+pr: 84606
+summary: Fork to WRITE thread when failing shard
+area: Engine
+type: bug
+issues:
+ - 84602
diff --git a/server/src/main/java/org/elasticsearch/action/support/replication/ReplicationOperation.java b/server/src/main/java/org/elasticsearch/action/support/replication/ReplicationOperation.java
@@ -10,7 +10,6 @@
 import org.apache.logging.log4j.Logger;
 import org.apache.logging.log4j.message.ParameterizedMessage;
 import org.apache.lucene.store.AlreadyClosedException;
-import org.elasticsearch.Assertions;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.action.ActionListener;
@@ -23,6 +22,7 @@
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.common.breaker.CircuitBreakingException;
 import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.util.concurrent.AbstractRunnable;
 import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
 import org.elasticsearch.core.Nullable;
 import org.elasticsearch.core.TimeValue;
@@ -317,26 +317,55 @@ private void updateCheckPoints(ShardRouting shard, LongSupplier localCheckpointS
     private void onNoLongerPrimary(Exception failure) {
         final Throwable cause = ExceptionsHelper.unwrapCause(failure);
         final boolean nodeIsClosing = cause instanceof NodeClosedException;
-        final String message;
         if (nodeIsClosing) {
-            message = String.format(
-                Locale.ROOT,
-                "node with primary [%s] is shutting down while failing replica shard",
-                primary.routingEntry()
-            );
             // We prefer not to fail the primary to avoid unnecessary warning log
             // when the node with the primary shard is gracefully shutting down.
+            finishAsFailed(
+                new RetryOnPrimaryException(
+                    primary.routingEntry().shardId(),
+                    String.format(
+                        Locale.ROOT,
+                        "node with primary [%s] is shutting down while failing replica shard",
+                        primary.routingEntry()
+                    ),
+                    failure
+                )
+            );
         } else {
-            if (Assertions.ENABLED) {
-                if (failure instanceof ShardStateAction.NoLongerPrimaryShardException == false) {
-                    throw new AssertionError("unexpected failure", failure);
+            assert failure instanceof ShardStateAction.NoLongerPrimaryShardException : failure;
+            threadPool.executor(ThreadPool.Names.WRITE).execute(new AbstractRunnable() {
+                @Override
+                protected void doRun() {
+                    // we are no longer the primary, fail ourselves and start over
+                    final var message = String.format(
+                        Locale.ROOT,
+                        "primary shard [%s] was demoted while failing replica shard",
+                        primary.routingEntry()
+                    );
+                    primary.failShard(message, failure);
+                    finishAsFailed(new RetryOnPrimaryException(primary.routingEntry().shardId(), message, failure));
                 }
-            }
-            // we are no longer the primary, fail ourselves and start over
-            message = String.format(Locale.ROOT, "primary shard [%s] was demoted while failing replica shard", primary.routingEntry());
-            primary.failShard(message, failure);
+
+                @Override
+                public boolean isForceExecution() {
+                    return true;
+                }
+
+                @Override
+                public void onFailure(Exception e) {
+                    e.addSuppressed(failure);
+                    assert false : e;
+                    logger.error(new ParameterizedMessage("unexpected failure while failing primary [{}]", primary.routingEntry()), e);
+                    finishAsFailed(
+                        new RetryOnPrimaryException(
+                            primary.routingEntry().shardId(),
+                            String.format(Locale.ROOT, "unexpected failure while failing primary [%s]", primary.routingEntry()),
+                            e
+                        )
+                    );
+                }
+            });
         }
-        finishAsFailed(new RetryOnPrimaryException(primary.routingEntry().shardId(), message, failure));
     }
 
     /**
diff --git a/server/src/main/java/org/elasticsearch/index/engine/Engine.java b/server/src/main/java/org/elasticsearch/index/engine/Engine.java
@@ -61,6 +61,7 @@
 import org.elasticsearch.index.translog.Translog;
 import org.elasticsearch.index.translog.TranslogStats;
 import org.elasticsearch.search.suggest.completion.CompletionStats;
+import org.elasticsearch.transport.Transports;
 
 import java.io.Closeable;
 import java.io.IOException;
@@ -1099,6 +1100,7 @@ private void maybeDie(final String maybeMessage, final Throwable maybeFatal) {
      * The underlying store is marked corrupted iff failure is caused by index corruption
      */
     public void failEngine(String reason, @Nullable Exception failure) {
+        assert Transports.assertNotTransportThread("failEngine can block on IO");
         if (failure != null) {
             maybeDie(reason, failure);
         }