@@ -1400,7 +1400,11 @@ public CommitStats commitStats() {
14001400     * @throws AlreadyClosedException if shard is closed 
14011401     */ 
14021402    public  SeqNoStats  seqNoStats () {
1403-         return  getEngine ().getSeqNoStats (replicationTracker .getGlobalCheckpoint ());
1403+         return  seqNoStats (false );
1404+     }
1405+ 
1406+     public  SeqNoStats  seqNoStats (boolean  skipAssertions ) {
1407+         return  getEngine (skipAssertions ).getSeqNoStats (replicationTracker .getGlobalCheckpoint ());
14041408    }
14051409
14061410    public  IndexingStats  indexingStats () {
@@ -1463,11 +1467,12 @@ public StoreStats storeStats() {
14631467    }
14641468
14651469    public  MergeStats  mergeStats () {
1466-         final  Engine  engine  = getEngineOrNull ();
1467-         if  (engine  == null ) {
1468-             return  new  MergeStats ();
1469-         }
1470-         return  engine .getMergeStats ();
1470+         return  tryWithEngineOrNull (engine  -> {
1471+             if  (engine  == null ) {
1472+                 return  new  MergeStats ();
1473+             }
1474+             return  engine .getMergeStats ();
1475+         });
14711476    }
14721477
14731478    public  SegmentsStats  segmentStats (boolean  includeSegmentFileSizes , boolean  includeUnloadedSegments ) {
@@ -1707,14 +1712,14 @@ public Engine.SearcherSupplier acquireSearcherSupplier() {
17071712    public  Engine .SearcherSupplier  acquireSearcherSupplier (Engine .SearcherScope  scope ) {
17081713        readAllowed ();
17091714        markSearcherAccessed ();
1710-         final  Engine  engine  = getEngine (); 
1715+         final  Engine  engine  = getEngine (true );  // should primarily happen on search nodes 
17111716        return  engine .acquireSearcherSupplier (this ::wrapSearcher , scope );
17121717    }
17131718
17141719    public  Engine .Searcher  acquireSearcher (String  source ) {
17151720        readAllowed ();
17161721        markSearcherAccessed ();
1717-         final  Engine  engine  = getEngine (); 
1722+         final  Engine  engine  = getEngine (true );  // should primarily happen on search nodes 
17181723        return  engine .acquireSearcher (source , Engine .SearcherScope .EXTERNAL , this ::wrapSearcher );
17191724    }
17201725
@@ -2661,7 +2666,35 @@ boolean shouldRollTranslogGeneration() {
26612666    public  void  onSettingsChanged () {
26622667        engineResetLock .readLock ().lock ();
26632668        try  {
2664-             var  engine  = getCurrentEngine (true );
2669+             // TODO this may be called by a cluster state update thread and we need to consider whether it is an issue 
2670+             // java.lang.AssertionError: Expected current thread 
2671+             // [Thread[#97,elasticsearch[node_t3][clusterApplierService#updateTask][T#1],5,TGRP-StatelessIT]] to not be the cluster state 
2672+             // update thread. Reason: [method IndexShard#getCurrentEngine (or one of its variant) can block] 
2673+             // at __randomizedtesting.SeedInfo.seed([6244501B70969C37]:0) 
2674+             // at org.elasticsearch.cluster.service.ClusterApplierService.assertNotClusterStateUpdateThread(ClusterApplierService.java:386) 
2675+             // at org.elasticsearch.index.shard.IndexShard.assertCurrentThreadWithEngine(IndexShard.java:3550) 
2676+             // at org.elasticsearch.index.shard.IndexShard.getCurrentEngine(IndexShard.java:3430) 
2677+             // at org.elasticsearch.index.shard.IndexShard.onSettingsChanged(IndexShard.java:2673) 
2678+             // at org.elasticsearch.index.IndexService.updateMetadata(IndexService.java:1013) 
2679+             // at org.elasticsearch.indices.cluster.IndicesClusterStateService.updateIndices(IndicesClusterStateService.java:662) 
2680+             // at org.elasticsearch.indices.cluster.IndicesClusterStateService.doApplyClusterState(IndicesClusterStateService.java:322) 
2681+             // at org.elasticsearch.indices.cluster.IndicesClusterStateService.applyClusterState(IndicesClusterStateService.java:278) 
2682+             // at org.elasticsearch.cluster.service.ClusterApplierService.callClusterStateAppliers(ClusterApplierService.java:572) 
2683+             // at org.elasticsearch.cluster.service.ClusterApplierService.callClusterStateAppliers(ClusterApplierService.java:558) 
2684+             // at org.elasticsearch.cluster.service.ClusterApplierService.applyChanges(ClusterApplierService.java:531) 
2685+             // at org.elasticsearch.cluster.service.ClusterApplierService.runTask(ClusterApplierService.java:460) 
2686+             // at org.elasticsearch.cluster.service.ClusterApplierService$UpdateTask.run(ClusterApplierService.java:159) 
2687+             // at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:1000) 
2688+             // at 
2689+             // org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.runAndClean 
2690+             // (PrioritizedEsThreadPoolExecutor.java:218) 
2691+             // at 
2692+             // org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.run 
2693+             // (PrioritizedEsThreadPoolExecutor.java:184) 
2694+             // at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1095) 
2695+             // at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:619) 
2696+             // at java.base/java.lang.Thread.run(Thread.java:1447) 
2697+             var  engine  = getCurrentEngine (true , true );
26652698            if  (engine  != null ) {
26662699                engine .onSettingsChanged ();
26672700            }
@@ -2674,15 +2707,18 @@ public void onSettingsChanged() {
26742707     * Acquires a lock on Lucene soft-deleted documents to prevent them from being trimmed 
26752708     */ 
26762709    public  Closeable  acquireHistoryRetentionLock () {
2677-         return  getEngine ().acquireHistoryRetentionLock ();
2710+         // Skip assertions since this is called either during recovery, or under a primary permit (thus during ingestion) which is not 
2711+         // something that can happen concurrently with hollowing (primary relocation holds permits) or unhollowing (we block ingestion). 
2712+         return  getEngine (true ).acquireHistoryRetentionLock ();
26782713    }
26792714
26802715    /** 
26812716     * Checks if we have a completed history of operations since the given starting seqno (inclusive). 
26822717     * This method should be called after acquiring the retention lock; See {@link #acquireHistoryRetentionLock()} 
26832718     */ 
26842719    public  boolean  hasCompleteHistoryOperations (String  reason , long  startingSeqNo ) {
2685-         return  getEngine ().hasCompleteOperationHistory (reason , startingSeqNo );
2720+         // Skip assertions since this is called during recovery. 
2721+         return  getEngine (true ).hasCompleteOperationHistory (reason , startingSeqNo );
26862722    }
26872723
26882724    /** 
@@ -2691,7 +2727,8 @@ public boolean hasCompleteHistoryOperations(String reason, long startingSeqNo) {
26912727     * @return the minimum retained sequence number 
26922728     */ 
26932729    public  long  getMinRetainedSeqNo () {
2694-         return  getEngine ().getMinRetainedSeqNo ();
2730+         // Skip assertions since this is called under a primary permit 
2731+         return  getEngine (true ).getMinRetainedSeqNo ();
26952732    }
26962733
26972734    /** 
@@ -3045,7 +3082,11 @@ public void markAllocationIdAsInSync(final String allocationId, final long local
30453082     * @return the local checkpoint 
30463083     */ 
30473084    public  long  getLocalCheckpoint () {
3048-         return  getEngine ().getPersistedLocalCheckpoint ();
3085+         return  getLocalCheckpoint (false );
3086+     }
3087+ 
3088+     public  long  getLocalCheckpoint (boolean  skipAssertions ) {
3089+         return  getEngine (skipAssertions ).getPersistedLocalCheckpoint ();
30493090    }
30503091
30513092    /** 
@@ -3087,7 +3128,10 @@ public void maybeSyncGlobalCheckpoint(final String reason) {
30873128        }
30883129        assert  assertPrimaryMode ();
30893130        // only sync if there are no operations in flight, or when using async durability 
3090-         final  SeqNoStats  stats  = getEngine ().getSeqNoStats (replicationTracker .getGlobalCheckpoint ());
3131+         // Skip assertions because this is called either from an async thread (so OK to block) or from the post operation of a replication 
3132+         // action under a primary permit -- which would not be concurrent with either hollowing/unhollowing (since ingestion is blocked 
3133+         // before hollowing, due to the primary relocation, and due to our own blocker during unhollowing). 
3134+         final  SeqNoStats  stats  = getEngine (true ).getSeqNoStats (replicationTracker .getGlobalCheckpoint ());
30913135        final  boolean  asyncDurability  = indexSettings ().getTranslogDurability () == Translog .Durability .ASYNC ;
30923136        if  (stats .getMaxSeqNo () == stats .getGlobalCheckpoint () || asyncDurability ) {
30933137            final  var  trackedGlobalCheckpointsNeedSync  = replicationTracker .trackedGlobalCheckpointsNeedSync ();
@@ -3150,7 +3194,7 @@ public PendingReplicationActions getPendingReplicationActions() {
31503194     */ 
31513195    public  void  updateGlobalCheckpointOnReplica (final  long  globalCheckpoint , final  String  reason ) {
31523196        assert  assertReplicationTarget ();
3153-         final  long  localCheckpoint  = getLocalCheckpoint ();
3197+         final  long  localCheckpoint  = getLocalCheckpoint (true );
31543198        if  (globalCheckpoint  > localCheckpoint ) {
31553199            /* 
31563200             * This can happen during recovery when the shard has started its engine but recovery is not finalized and is receiving global 
@@ -3390,9 +3434,14 @@ private void doCheckIndex() throws IOException {
33903434     */ 
33913435    @ Deprecated 
33923436    Engine  getEngine () {
3437+         return  getEngine (false );
3438+     }
3439+ 
3440+     @ Deprecated 
3441+     Engine  getEngine (boolean  skipAssertions ) {
33933442        engineResetLock .readLock ().lock ();
33943443        try  {
3395-             return  getCurrentEngine (false );
3444+             return  getCurrentEngine (false ,  skipAssertions );
33963445        } finally  {
33973446            engineResetLock .readLock ().unlock ();
33983447        }
@@ -3408,13 +3457,20 @@ Engine getEngine() {
34083457    public  Engine  getEngineOrNull () {
34093458        engineResetLock .readLock ().lock ();
34103459        try  {
3411-             return  getCurrentEngine (true );
3460+             return  getCurrentEngine (true ,  false );
34123461        } finally  {
34133462            engineResetLock .readLock ().unlock ();
34143463        }
34153464    }
34163465
3417-     private  Engine  getCurrentEngine (boolean  allowNoEngine ) {
3466+     private  Engine  getCurrentEngine (boolean  allowNoEngine , boolean  skipAssertions ) {
3467+         // We only reset a shard when it's relocating (primary relocation) or started (unhollowing) 
3468+         boolean  shardRoutingEngineResettable  = shardRouting .started () || shardRouting .relocating ();
3469+         assert  skipAssertions 
3470+             || shardRouting .primary () == false  // exclude non-primary shards. We only reset primary shards. 
3471+             || shardRoutingEngineResettable  == false 
3472+             || state () != IndexShardState .STARTED  // exclude getting the engine when not started. We only reset started shards. 
3473+             || assertCurrentThreadWithEngine ("method IndexShard#getCurrentEngine (or one of its variant) can block" );
34183474        assert  engineResetLock .isReadLockedByCurrentThread () || engineResetLock .isWriteLockedByCurrentThread () /* for resets */ ;
34193475        var  engine  = currentEngine .get ();
34203476        if  (engine  == null  && allowNoEngine  == false ) {
@@ -3482,12 +3538,12 @@ public <R> R withEngine(Function<Engine, R> operation) {
34823538     * @throws              AlreadyClosedException if the current engine instance is {@code null}. 
34833539     */ 
34843540    public  <R , E  extends  Exception > R  withEngineException (CheckedFunction <Engine , R , E > operation ) throws  E  {
3485-         assert  assertCurrentThreadWithEngine ();
3541+         assert  assertCurrentThreadWithEngine ("method IndexShard#withEngineException (or one of its variant) can block" );
34863542        assert  operation  != null ;
34873543
34883544        engineResetLock .readLock ().lock ();
34893545        try  {
3490-             var  engine  = getCurrentEngine (false );
3546+             var  engine  = getCurrentEngine (false ,  false );
34913547            return  operation .apply (engine );
34923548        } finally  {
34933549            engineResetLock .readLock ().unlock ();
@@ -3513,7 +3569,8 @@ public <R, E extends Exception> R withEngineException(CheckedFunction<Engine, R,
35133569     */ 
35143570    private  <R > R  withEngine (Function <Engine , R > operation , boolean  allowNoEngine , boolean  blockIfResetting ) {
35153571        assert  operation  != null ;
3516-         assert  blockIfResetting  == false  || assertCurrentThreadWithEngine (); // assert current thread can block on engine resets 
3572+         // assert current thread can block on engine resets 
3573+         assert  blockIfResetting  == false  || assertCurrentThreadWithEngine ("method IndexShard#withEngine (or one of its variant) can block" );
35173574        boolean  locked  = true ;
35183575        if  (blockIfResetting ) {
35193576            engineResetLock .readLock ().lock ();
@@ -3526,7 +3583,7 @@ private <R> R withEngine(Function<Engine, R> operation, boolean allowNoEngine, b
35263583        }
35273584        if  (locked ) {
35283585            try  {
3529-                 var  engine  = getCurrentEngine (allowNoEngine );
3586+                 var  engine  = getCurrentEngine (allowNoEngine ,  blockIfResetting  ==  false );
35303587                return  operation .apply (engine );
35313588            } finally  {
35323589                engineResetLock .readLock ().unlock ();
@@ -3536,8 +3593,7 @@ private <R> R withEngine(Function<Engine, R> operation, boolean allowNoEngine, b
35363593        }
35373594    }
35383595
3539-     private  static  boolean  assertCurrentThreadWithEngine () {
3540-         var  message  = "method IndexShard#withEngine (or one of its variant) can block" ;
3596+     private  static  boolean  assertCurrentThreadWithEngine (String  message ) {
35413597        assert  ClusterApplierService .assertNotClusterStateUpdateThread (message );
35423598        assert  MasterService .assertNotMasterUpdateThread (message );
35433599        assert  Transports .assertNotTransportThread (message );
@@ -4600,7 +4656,7 @@ public void resetEngine(Consumer<Engine> postResetNewEngineConsumer) {
46004656                try  {
46014657                    engineResetLock .writeLock ().lock ();
46024658                    try  {
4603-                         var  engine  = getCurrentEngine (false );
4659+                         var  engine  = getCurrentEngine (false ,  true );
46044660                        engine .prepareForEngineReset ();
46054661                        var  newEngine  = createEngine (newEngineConfig (replicationTracker ));
46064662                        getAndSetCurrentEngine (newEngine );
0 commit comments