@@ -761,7 +761,7 @@ private void startCloning(Repository repository, SnapshotsInProgress.Entry clone
761
761
endingSnapshots .add (targetSnapshot );
762
762
initializingClones .remove (targetSnapshot );
763
763
logger .info (() -> new ParameterizedMessage ("Failed to start snapshot clone [{}]" , cloneEntry ), e );
764
- removeFailedSnapshotFromClusterState (targetSnapshot , e , null , null );
764
+ removeFailedSnapshotFromClusterState (targetSnapshot , e , null , null , ShardGenerations . EMPTY );
765
765
};
766
766
767
767
// Step 1: load SnapshotInfo to make sure that the source snapshot was successful for the indices we want to clone
@@ -1194,7 +1194,8 @@ public void onFailure(String source, Exception e) {
1194
1194
snapshot .snapshot (),
1195
1195
e ,
1196
1196
null ,
1197
- new CleanupAfterErrorListener (userCreateSnapshotListener , e )
1197
+ new CleanupAfterErrorListener (userCreateSnapshotListener , e ),
1198
+ ShardGenerations .EMPTY
1198
1199
);
1199
1200
}
1200
1201
@@ -1238,7 +1239,8 @@ public void onFailure(Exception e) {
1238
1239
snapshot .snapshot (),
1239
1240
e ,
1240
1241
null ,
1241
- new CleanupAfterErrorListener (userCreateSnapshotListener , e )
1242
+ new CleanupAfterErrorListener (userCreateSnapshotListener , e ),
1243
+ ShardGenerations .EMPTY
1242
1244
);
1243
1245
}
1244
1246
});
@@ -1876,14 +1878,21 @@ private void endSnapshot(SnapshotsInProgress.Entry entry, Metadata metadata, @Nu
1876
1878
entry .snapshot (),
1877
1879
new SnapshotException (snapshot , "Aborted on initialization" ),
1878
1880
repositoryData ,
1879
- null
1881
+ null ,
1882
+ ShardGenerations .EMPTY
1880
1883
);
1881
1884
return ;
1882
1885
}
1883
1886
if (entry .isClone () && entry .state () == State .FAILED ) {
1884
1887
logger .debug ("Removing failed snapshot clone [{}] from cluster state" , entry );
1885
1888
if (newFinalization ) {
1886
- removeFailedSnapshotFromClusterState (snapshot , new SnapshotException (snapshot , entry .failure ()), null , null );
1889
+ removeFailedSnapshotFromClusterState (
1890
+ snapshot ,
1891
+ new SnapshotException (snapshot , entry .failure ()),
1892
+ null ,
1893
+ null ,
1894
+ ShardGenerations .EMPTY
1895
+ );
1887
1896
}
1888
1897
return ;
1889
1898
}
@@ -2055,13 +2064,30 @@ private void finalizeSnapshotEntry(Snapshot snapshot, Metadata metadata, Reposit
2055
2064
completeListenersIgnoringException (endAndGetListenersToResolve (writtenSnapshotInfo .snapshot ()), result );
2056
2065
logger .info ("snapshot [{}] completed with state [{}]" , snapshot , writtenSnapshotInfo .state ());
2057
2066
runNextQueuedOperation (result .v1 (), repository , true );
2058
- }, e -> handleFinalizationFailure (e , snapshot , repositoryData ))
2067
+ },
2068
+ e -> handleFinalizationFailure (
2069
+ e ,
2070
+ snapshot ,
2071
+ repositoryData ,
2072
+ // we might have written the new root blob before failing here, so we must use the updated shardGenerations
2073
+ shardGenerations
2074
+ )
2075
+ )
2059
2076
)
2060
2077
);
2061
- }, e -> handleFinalizationFailure (e , snapshot , repositoryData ));
2078
+ },
2079
+ e -> handleFinalizationFailure (
2080
+ e ,
2081
+ snapshot ,
2082
+ repositoryData ,
2083
+ // a failure here means the root blob was not updated, but the updated shard generation blobs are all in place so we can
2084
+ // use the updated shardGenerations for all pending shard snapshots
2085
+ shardGenerations
2086
+ )
2087
+ );
2062
2088
} catch (Exception e ) {
2063
2089
assert false : new AssertionError (e );
2064
- handleFinalizationFailure (e , snapshot , repositoryData );
2090
+ handleFinalizationFailure (e , snapshot , repositoryData , ShardGenerations . EMPTY );
2065
2091
}
2066
2092
}
2067
2093
@@ -2113,7 +2139,12 @@ private List<ActionListener<Tuple<RepositoryData, SnapshotInfo>>> endAndGetListe
2113
2139
* @param snapshot snapshot that failed to finalize
2114
2140
* @param repositoryData current repository data for the snapshot's repository
2115
2141
*/
2116
- private void handleFinalizationFailure (Exception e , Snapshot snapshot , RepositoryData repositoryData ) {
2142
+ private void handleFinalizationFailure (
2143
+ Exception e ,
2144
+ Snapshot snapshot ,
2145
+ RepositoryData repositoryData ,
2146
+ ShardGenerations shardGenerations
2147
+ ) {
2117
2148
if (ExceptionsHelper .unwrap (e , NotMasterException .class , FailedToCommitClusterStateException .class ) != null ) {
2118
2149
// Failure due to not being master any more; don't try to remove the snapshot from the cluster state, since the next master
2119
2150
// will try ending this snapshot again
@@ -2125,7 +2156,7 @@ private void handleFinalizationFailure(Exception e, Snapshot snapshot, Repositor
2125
2156
failAllListenersOnMasterFailOver (e );
2126
2157
} else {
2127
2158
logger .warn (() -> new ParameterizedMessage ("[{}] failed to finalize snapshot" , snapshot ), e );
2128
- removeFailedSnapshotFromClusterState (snapshot , e , repositoryData , null );
2159
+ removeFailedSnapshotFromClusterState (snapshot , e , repositoryData , null , shardGenerations );
2129
2160
}
2130
2161
}
2131
2162
@@ -2251,7 +2282,7 @@ private static Tuple<ClusterState, List<SnapshotDeletionsInProgress.Entry>> read
2251
2282
* @param snapshot snapshot for which to remove the snapshot operation
2252
2283
* @return updated cluster state
2253
2284
*/
2254
- public static ClusterState stateWithoutSnapshot (ClusterState state , Snapshot snapshot ) {
2285
+ public static ClusterState stateWithoutSnapshot (ClusterState state , Snapshot snapshot , ShardGenerations shardGenerations ) {
2255
2286
final SnapshotsInProgress snapshots = state .custom (SnapshotsInProgress .TYPE , SnapshotsInProgress .EMPTY );
2256
2287
ClusterState result = state ;
2257
2288
int indexOfEntry = -1 ;
@@ -2312,7 +2343,8 @@ public static ClusterState stateWithoutSnapshot(ClusterState state, Snapshot sna
2312
2343
final ShardSnapshotStatus shardState = finishedShardEntry .value ;
2313
2344
final RepositoryShardId repositoryShardId = finishedShardEntry .key ;
2314
2345
if (shardState .state () != ShardState .SUCCESS
2315
- || previousEntry .shardsByRepoShardId ().containsKey (repositoryShardId ) == false ) {
2346
+ || previousEntry .shardsByRepoShardId ().containsKey (repositoryShardId ) == false
2347
+ || shardGenerations .hasShardGen (finishedShardEntry .key ) == false ) {
2316
2348
continue ;
2317
2349
}
2318
2350
updatedShardAssignments = maybeAddUpdatedAssignment (
@@ -2329,7 +2361,8 @@ public static ClusterState stateWithoutSnapshot(ClusterState state, Snapshot sna
2329
2361
.shardsByRepoShardId ()) {
2330
2362
final ShardSnapshotStatus shardState = finishedShardEntry .value ;
2331
2363
if (shardState .state () == ShardState .SUCCESS
2332
- && previousEntry .shardsByRepoShardId ().containsKey (finishedShardEntry .key )) {
2364
+ && previousEntry .shardsByRepoShardId ().containsKey (finishedShardEntry .key )
2365
+ && shardGenerations .hasShardGen (finishedShardEntry .key )) {
2333
2366
updatedShardAssignments = maybeAddUpdatedAssignment (
2334
2367
updatedShardAssignments ,
2335
2368
shardState ,
@@ -2417,14 +2450,15 @@ private void removeFailedSnapshotFromClusterState(
2417
2450
Snapshot snapshot ,
2418
2451
Exception failure ,
2419
2452
@ Nullable RepositoryData repositoryData ,
2420
- @ Nullable CleanupAfterErrorListener listener
2453
+ @ Nullable CleanupAfterErrorListener listener ,
2454
+ ShardGenerations shardGenerations
2421
2455
) {
2422
2456
assert failure != null : "Failure must be supplied" ;
2423
2457
clusterService .submitStateUpdateTask ("remove snapshot metadata" , new ClusterStateUpdateTask () {
2424
2458
2425
2459
@ Override
2426
2460
public ClusterState execute (ClusterState currentState ) {
2427
- final ClusterState updatedState = stateWithoutSnapshot (currentState , snapshot );
2461
+ final ClusterState updatedState = stateWithoutSnapshot (currentState , snapshot , shardGenerations );
2428
2462
assert updatedState == currentState || endingSnapshots .contains (snapshot )
2429
2463
: "did not track [" + snapshot + "] in ending snapshots while removing it from the cluster state" ;
2430
2464
// now check if there are any delete operations that refer to the just failed snapshot and remove the snapshot from them
0 commit comments