Skip to content

Commit 99b5ed8

Browse files
authored
Check latest repoData before applying workaround for missing shardGen file (#112979)
It is expected that the old master may attempt to read a shardGen file that is deleted by the new master. This PR checks the latest repo data before applying the workaround (or throwing AssertionError) for missing shardGen files. Relates: #112337 Resolves: #112811
1 parent 56e35c9 commit 99b5ed8

File tree

2 files changed

+27
-5
lines changed

2 files changed

+27
-5
lines changed

muted-tests.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -204,9 +204,6 @@ tests:
204204
- class: org.elasticsearch.script.StatsSummaryTests
205205
method: testEqualsAndHashCode
206206
issue: https://github.com/elastic/elasticsearch/issues/112439
207-
- class: org.elasticsearch.snapshots.ConcurrentSnapshotsIT
208-
method: testMasterFailoverOnFinalizationLoop
209-
issue: https://github.com/elastic/elasticsearch/issues/112811
210207
- class: org.elasticsearch.xpack.ml.integration.MlJobIT
211208
method: testDeleteJobAfterMissingAliases
212209
issue: https://github.com/elastic/elasticsearch/issues/112823

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -596,7 +596,13 @@ public void cloneShardSnapshot(
596596
existingSnapshots = tuple.v1();
597597
} else {
598598
newGen = ShardGeneration.newGeneration();
599-
existingSnapshots = buildBlobStoreIndexShardSnapshots(index, Collections.emptySet(), shardContainer, shardGeneration).v1();
599+
existingSnapshots = buildBlobStoreIndexShardSnapshots(
600+
index,
601+
shardNum,
602+
Collections.emptySet(),
603+
shardContainer,
604+
shardGeneration
605+
).v1();
600606
existingShardGen = shardGeneration;
601607
}
602608
SnapshotFiles existingTargetFiles = null;
@@ -1309,6 +1315,7 @@ protected void doRun() throws Exception {
13091315
newGen = -1L;
13101316
blobStoreIndexShardSnapshots = buildBlobStoreIndexShardSnapshots(
13111317
indexId,
1318+
shardId,
13121319
originalShardBlobs,
13131320
shardContainer,
13141321
originalRepositoryData.shardGenerations().getShardGen(indexId, shardId)
@@ -3203,6 +3210,7 @@ private void doSnapshotShard(SnapshotShardContext context) {
32033210
snapshotStatus.ensureNotAborted();
32043211
Tuple<BlobStoreIndexShardSnapshots, ShardGeneration> tuple = buildBlobStoreIndexShardSnapshots(
32053212
context.indexId(),
3213+
shardId.id(),
32063214
blobs,
32073215
shardContainer,
32083216
generation
@@ -3848,14 +3856,15 @@ public BlobStoreIndexShardSnapshots getBlobStoreIndexShardSnapshots(IndexId inde
38483856
blobs = shardContainer.listBlobsByPrefix(OperationPurpose.SNAPSHOT_METADATA, SNAPSHOT_INDEX_PREFIX).keySet();
38493857
}
38503858

3851-
return buildBlobStoreIndexShardSnapshots(indexId, blobs, shardContainer, shardGen).v1();
3859+
return buildBlobStoreIndexShardSnapshots(indexId, shardId, blobs, shardContainer, shardGen).v1();
38523860
}
38533861

38543862
/**
38553863
* Loads all available snapshots in the repository using the given {@code generation} or falling back to trying to determine it from
38563864
* the given list of blobs in the shard container.
38573865
*
38583866
* @param indexId {@link IndexId} identifying the corresponding index
3867+
* @param shardId The 0-based shard id, see also {@link ShardId#id()}
38593868
* @param blobs list of blobs in repository
38603869
* @param generation shard generation or {@code null} in case there was no shard generation tracked in the {@link RepositoryData} for
38613870
* this shard because its snapshot was created in a version older than
@@ -3864,6 +3873,7 @@ public BlobStoreIndexShardSnapshots getBlobStoreIndexShardSnapshots(IndexId inde
38643873
*/
38653874
private Tuple<BlobStoreIndexShardSnapshots, ShardGeneration> buildBlobStoreIndexShardSnapshots(
38663875
IndexId indexId,
3876+
int shardId,
38673877
Set<String> blobs,
38683878
BlobContainer shardContainer,
38693879
@Nullable ShardGeneration generation
@@ -3883,6 +3893,21 @@ private Tuple<BlobStoreIndexShardSnapshots, ShardGeneration> buildBlobStoreIndex
38833893
generation
38843894
);
38853895
} catch (NoSuchFileException noSuchFileException) {
3896+
// Master may have concurrently mutated the shard generation. This can happen when master fails over
3897+
// which is "expected". We do not need to apply the following workaround for missing file in this case.
3898+
final RepositoryData currentRepositoryData;
3899+
try {
3900+
final long latestGeneration = latestIndexBlobId();
3901+
currentRepositoryData = getRepositoryData(latestGeneration);
3902+
} catch (Exception e) {
3903+
noSuchFileException.addSuppressed(e);
3904+
throw noSuchFileException;
3905+
}
3906+
final ShardGeneration latestShardGen = currentRepositoryData.shardGenerations().getShardGen(indexId, shardId);
3907+
if (latestShardGen == null || latestShardGen.equals(generation) == false) {
3908+
throw noSuchFileException;
3909+
}
3910+
38863911
// This shouldn't happen (absent an external force deleting blobs from the repo) but in practice we've found bugs in the way
38873912
// we manipulate shard generation UUIDs under concurrent snapshot load which can lead to incorrectly deleting a referenced
38883913
// shard-level `index-UUID` blob during finalization. We definitely want to treat this as a test failure (see the `assert`

0 commit comments

Comments
 (0)