Skip to content

Commit 98ab7f8

Browse files
authored
Add workaround for missing shard gen blob (#112337)
It is currently very painful to recover from a bug which incorrectly removes a shard-level `index-UUID` blob from the repository. This commit introduces a fallback mechanism that attempts to reconstruct the missing data from other blobs in the repository. Needing this mechanism still indicates a bug, but in many cases it will allow the repository to keep working without any manual surgery on its contents.
1 parent 1dfb440 commit 98ab7f8

File tree

6 files changed

+273
-40
lines changed

6 files changed

+273
-40
lines changed

docs/changelog/112337.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 112337
2+
summary: Add workaround for missing shard gen blob
3+
area: Snapshot/Restore
4+
type: enhancement
5+
issues: []

server/src/internalClusterTest/java/org/elasticsearch/repositories/blobstore/BlobStoreCorruptionIT.java

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
package org.elasticsearch.repositories.blobstore;
1010

11+
import org.apache.logging.log4j.Level;
1112
import org.elasticsearch.ElasticsearchException;
1213
import org.elasticsearch.action.ActionListener;
1314
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
@@ -22,9 +23,11 @@
2223
import org.elasticsearch.repositories.fs.FsRepository;
2324
import org.elasticsearch.snapshots.AbstractSnapshotIntegTestCase;
2425
import org.elasticsearch.snapshots.SnapshotState;
26+
import org.elasticsearch.test.MockLog;
2527
import org.elasticsearch.test.hamcrest.ElasticsearchAssertions;
2628
import org.junit.Before;
2729

30+
import java.nio.file.Files;
2831
import java.util.ArrayList;
2932
import java.util.List;
3033

@@ -65,17 +68,51 @@ public void testCorruptionDetection() throws Exception {
6568

6669
// detect corruption by taking another snapshot
6770
if (corruptedFileType == RepositoryFileType.SHARD_GENERATION) {
68-
corruptionDetectors.add(exceptionListener -> {
69-
logger.info("--> taking another snapshot");
70-
client().admin()
71-
.cluster()
72-
.prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, randomIdentifier())
73-
.setWaitForCompletion(true)
74-
.execute(exceptionListener.map(createSnapshotResponse -> {
75-
assertNotEquals(SnapshotState.SUCCESS, createSnapshotResponse.getSnapshotInfo().state());
76-
return new ElasticsearchException("create-snapshot failed as expected");
77-
}));
78-
});
71+
if (Files.exists(corruptedFile)) {
72+
corruptionDetectors.add(exceptionListener -> {
73+
logger.info("--> taking another snapshot");
74+
client().admin()
75+
.cluster()
76+
.prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, randomIdentifier())
77+
.setWaitForCompletion(true)
78+
.execute(exceptionListener.map(createSnapshotResponse -> {
79+
assertNotEquals(SnapshotState.SUCCESS, createSnapshotResponse.getSnapshotInfo().state());
80+
return new ElasticsearchException("create-snapshot failed as expected");
81+
}));
82+
});
83+
} else {
84+
corruptionDetectors.add(exceptionListener -> {
85+
logger.info("--> taking another snapshot");
86+
final var mockLog = MockLog.capture(BlobStoreRepository.class);
87+
mockLog.addExpectation(
88+
new MockLog.SeenEventExpectation(
89+
"fallback message",
90+
"org.elasticsearch.repositories.blobstore.BlobStoreRepository",
91+
Level.ERROR,
92+
"index [*] shard generation [*] in ["
93+
+ repositoryName
94+
+ "][*] not found - falling back to reading all shard snapshots"
95+
)
96+
);
97+
mockLog.addExpectation(
98+
new MockLog.SeenEventExpectation(
99+
"shard blobs list",
100+
"org.elasticsearch.repositories.blobstore.BlobStoreRepository",
101+
Level.ERROR,
102+
"read shard snapshots [*] due to missing shard generation [*] for index [*] in [" + repositoryName + "][*]"
103+
)
104+
);
105+
client().admin()
106+
.cluster()
107+
.prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, randomIdentifier())
108+
.setWaitForCompletion(true)
109+
.execute(ActionListener.releaseAfter(exceptionListener.map(createSnapshotResponse -> {
110+
assertEquals(SnapshotState.SUCCESS, createSnapshotResponse.getSnapshotInfo().state());
111+
mockLog.assertAllExpectationsMatched();
112+
return new ElasticsearchException("create-snapshot logged errors as expected");
113+
}), mockLog));
114+
});
115+
}
79116
}
80117

81118
// detect corruption by restoring the snapshot

server/src/internalClusterTest/java/org/elasticsearch/snapshots/CorruptedBlobStoreRepositoryIT.java

Lines changed: 112 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,26 @@
77
*/
88
package org.elasticsearch.snapshots;
99

10+
import org.apache.logging.log4j.Level;
1011
import org.elasticsearch.action.ActionRequestBuilder;
1112
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse;
1213
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
1314
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse;
1415
import org.elasticsearch.action.index.IndexRequestBuilder;
1516
import org.elasticsearch.client.internal.Client;
1617
import org.elasticsearch.cluster.ClusterState;
18+
import org.elasticsearch.cluster.SnapshotsInProgress;
1719
import org.elasticsearch.cluster.metadata.Metadata;
1820
import org.elasticsearch.cluster.metadata.RepositoriesMetadata;
21+
import org.elasticsearch.cluster.service.ClusterService;
1922
import org.elasticsearch.common.bytes.BytesReference;
2023
import org.elasticsearch.common.settings.Settings;
2124
import org.elasticsearch.common.unit.ByteSizeUnit;
2225
import org.elasticsearch.common.util.concurrent.EsExecutors;
2326
import org.elasticsearch.core.IOUtils;
2427
import org.elasticsearch.core.Strings;
2528
import org.elasticsearch.index.IndexVersion;
29+
import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshotsIntegritySuppressor;
2630
import org.elasticsearch.repositories.IndexId;
2731
import org.elasticsearch.repositories.IndexMetaDataGenerations;
2832
import org.elasticsearch.repositories.Repository;
@@ -32,6 +36,8 @@
3236
import org.elasticsearch.repositories.ShardGenerations;
3337
import org.elasticsearch.repositories.blobstore.BlobStoreRepository;
3438
import org.elasticsearch.repositories.fs.FsRepository;
39+
import org.elasticsearch.test.ClusterServiceUtils;
40+
import org.elasticsearch.test.MockLog;
3541
import org.elasticsearch.xcontent.XContentFactory;
3642

3743
import java.nio.channels.SeekableByteChannel;
@@ -52,6 +58,7 @@
5258
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
5359
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFileExists;
5460
import static org.hamcrest.Matchers.containsString;
61+
import static org.hamcrest.Matchers.empty;
5562
import static org.hamcrest.Matchers.equalTo;
5663
import static org.hamcrest.Matchers.greaterThan;
5764
import static org.hamcrest.Matchers.hasSize;
@@ -767,6 +774,13 @@ public void testSnapshotWithMissingShardLevelIndexFile() throws Exception {
767774
.setWaitForCompletion(true)
768775
.setIndices("test-idx-*")
769776
.get();
777+
final boolean repairWithDelete = randomBoolean();
778+
if (repairWithDelete || randomBoolean()) {
779+
clusterAdmin().prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, "test-repo", "snap-for-deletion")
780+
.setWaitForCompletion(true)
781+
.setIndices("test-idx-1")
782+
.get();
783+
}
770784

771785
logger.info("--> deleting shard level index file");
772786
final Path indicesPath = repo.resolve("indices");
@@ -780,28 +794,111 @@ public void testSnapshotWithMissingShardLevelIndexFile() throws Exception {
780794
Files.delete(shardGen);
781795
}
782796

783-
logger.info("--> creating another snapshot");
797+
if (randomBoolean()) {
798+
logger.info("""
799+
--> restoring the snapshot, the repository should not have lost any shard data despite deleting index-*, \
800+
because it uses snap-*.dat files and not the index-* to determine what files to restore""");
801+
indicesAdmin().prepareDelete("test-idx-1", "test-idx-2").get();
802+
RestoreSnapshotResponse restoreSnapshotResponse = clusterAdmin().prepareRestoreSnapshot(
803+
TEST_REQUEST_TIMEOUT,
804+
"test-repo",
805+
"test-snap-1"
806+
).setWaitForCompletion(true).get();
807+
assertEquals(0, restoreSnapshotResponse.getRestoreInfo().failedShards());
808+
ensureGreen("test-idx-1", "test-idx-2");
809+
}
810+
811+
logger.info("--> creating another snapshot, which should re-create the missing file");
812+
try (
813+
var ignored = new BlobStoreIndexShardSnapshotsIntegritySuppressor();
814+
var mockLog = MockLog.capture(BlobStoreRepository.class)
815+
) {
816+
mockLog.addExpectation(
817+
new MockLog.SeenEventExpectation(
818+
"fallback message",
819+
"org.elasticsearch.repositories.blobstore.BlobStoreRepository",
820+
Level.ERROR,
821+
"index [test-idx-1/*] shard generation [*] in [test-repo][*] not found - falling back to reading all shard snapshots"
822+
)
823+
);
824+
mockLog.addExpectation(
825+
new MockLog.SeenEventExpectation(
826+
"shard blobs list",
827+
"org.elasticsearch.repositories.blobstore.BlobStoreRepository",
828+
Level.ERROR,
829+
"read shard snapshots [*] due to missing shard generation [*] for index [test-idx-1/*] in [test-repo][*]"
830+
)
831+
);
832+
if (repairWithDelete) {
833+
clusterAdmin().prepareDeleteSnapshot(TEST_REQUEST_TIMEOUT, "test-repo", "snap-for-deletion").get();
834+
} else if (randomBoolean()) {
835+
CreateSnapshotResponse createSnapshotResponse = clusterAdmin().prepareCreateSnapshot(
836+
TEST_REQUEST_TIMEOUT,
837+
"test-repo",
838+
"test-snap-2"
839+
).setWaitForCompletion(true).setIndices("test-idx-1").get();
840+
assertEquals(
841+
createSnapshotResponse.getSnapshotInfo().totalShards(),
842+
createSnapshotResponse.getSnapshotInfo().successfulShards()
843+
);
844+
} else {
845+
clusterAdmin().prepareCloneSnapshot(TEST_REQUEST_TIMEOUT, "test-repo", "test-snap-1", "test-snap-2")
846+
.setIndices("test-idx-1")
847+
.get();
848+
safeAwait(
849+
ClusterServiceUtils.addTemporaryStateListener(
850+
internalCluster().getInstance(ClusterService.class),
851+
cs -> SnapshotsInProgress.get(cs).isEmpty()
852+
)
853+
);
854+
assertThat(
855+
clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, "test-repo")
856+
.setSnapshots("test-snap-2")
857+
.get()
858+
.getSnapshots()
859+
.get(0)
860+
.shardFailures(),
861+
empty()
862+
);
863+
}
864+
mockLog.assertAllExpectationsMatched();
865+
866+
try (
867+
Stream<Path> shardFiles = Files.list(
868+
indicesPath.resolve(getRepositoryData("test-repo").resolveIndexId("test-idx-1").getId()).resolve("0")
869+
)
870+
) {
871+
assertTrue(shardFiles.anyMatch(file -> file.getFileName().toString().startsWith(BlobStoreRepository.INDEX_FILE_PREFIX)));
872+
}
873+
}
874+
875+
if (randomBoolean()) {
876+
indicesAdmin().prepareDelete("test-idx-1").get();
877+
RestoreSnapshotResponse restoreSnapshotResponse2 = clusterAdmin().prepareRestoreSnapshot(
878+
TEST_REQUEST_TIMEOUT,
879+
"test-repo",
880+
repairWithDelete ? "test-snap-1" : randomFrom("test-snap-1", "test-snap-2")
881+
).setIndices("test-idx-1").setWaitForCompletion(true).get();
882+
assertEquals(0, restoreSnapshotResponse2.getRestoreInfo().failedShards());
883+
ensureGreen("test-idx-1", "test-idx-2");
884+
}
885+
886+
logger.info("--> creating another snapshot, which should succeed since the shard gen file now exists again");
784887
CreateSnapshotResponse createSnapshotResponse = clusterAdmin().prepareCreateSnapshot(
785888
TEST_REQUEST_TIMEOUT,
786889
"test-repo",
787-
"test-snap-2"
890+
"test-snap-3"
788891
).setWaitForCompletion(true).setIndices("test-idx-1").get();
789-
assertEquals(
790-
createSnapshotResponse.getSnapshotInfo().successfulShards(),
791-
createSnapshotResponse.getSnapshotInfo().totalShards() - 1
792-
);
892+
assertEquals(createSnapshotResponse.getSnapshotInfo().totalShards(), createSnapshotResponse.getSnapshotInfo().successfulShards());
793893

794-
logger.info(
795-
"--> restoring the first snapshot, the repository should not have lost any shard data despite deleting index-N, "
796-
+ "because it uses snap-*.data files and not the index-N to determine what files to restore"
797-
);
798-
indicesAdmin().prepareDelete("test-idx-1", "test-idx-2").get();
799-
RestoreSnapshotResponse restoreSnapshotResponse = clusterAdmin().prepareRestoreSnapshot(
894+
indicesAdmin().prepareDelete("test-idx-1").get();
895+
RestoreSnapshotResponse restoreSnapshotResponse3 = clusterAdmin().prepareRestoreSnapshot(
800896
TEST_REQUEST_TIMEOUT,
801897
"test-repo",
802-
"test-snap-1"
803-
).setWaitForCompletion(true).get();
804-
assertEquals(0, restoreSnapshotResponse.getRestoreInfo().failedShards());
898+
repairWithDelete ? randomFrom("test-snap-1", "test-snap-3") : randomFrom("test-snap-1", "test-snap-2", "test-snap-3")
899+
).setIndices("test-idx-1").setWaitForCompletion(true).get();
900+
assertEquals(0, restoreSnapshotResponse3.getRestoreInfo().failedShards());
901+
ensureGreen("test-idx-1", "test-idx-2");
805902
}
806903

807904
public void testDeletesWithUnexpectedIndexBlob() throws Exception {

server/src/internalClusterTest/java/org/elasticsearch/snapshots/MultiClusterRepoAccessIT.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import org.elasticsearch.common.util.CollectionUtils;
1313
import org.elasticsearch.core.IOUtils;
1414
import org.elasticsearch.env.Environment;
15+
import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshotsIntegritySuppressor;
1516
import org.elasticsearch.plugins.Plugin;
1617
import org.elasticsearch.repositories.RepositoryException;
1718
import org.elasticsearch.test.ESIntegTestCase;
@@ -98,7 +99,7 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
9899
return CollectionUtils.appendToCopy(super.nodePlugins(), getTestTransportPlugin());
99100
}
100101

101-
public void testConcurrentDeleteFromOtherCluster() throws InterruptedException {
102+
public void testConcurrentDeleteFromOtherCluster() {
102103
internalCluster().startMasterOnlyNode();
103104
internalCluster().startDataOnlyNode();
104105
final String repoNameOnFirstCluster = "test-repo";
@@ -125,10 +126,13 @@ public void testConcurrentDeleteFromOtherCluster() throws InterruptedException {
125126
secondCluster.client().admin().cluster().prepareDeleteSnapshot(TEST_REQUEST_TIMEOUT, repoNameOnSecondCluster, "snap-1").get();
126127
secondCluster.client().admin().cluster().prepareDeleteSnapshot(TEST_REQUEST_TIMEOUT, repoNameOnSecondCluster, "snap-2").get();
127128

128-
final SnapshotException sne = expectThrows(
129-
SnapshotException.class,
130-
clusterAdmin().prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repoNameOnFirstCluster, "snap-4").setWaitForCompletion(true)
131-
);
129+
final SnapshotException sne;
130+
try (var ignored = new BlobStoreIndexShardSnapshotsIntegritySuppressor()) {
131+
sne = expectThrows(
132+
SnapshotException.class,
133+
clusterAdmin().prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repoNameOnFirstCluster, "snap-4").setWaitForCompletion(true)
134+
);
135+
}
132136
assertThat(sne.getMessage(), containsString("failed to update snapshot in repository"));
133137
final RepositoryException cause = (RepositoryException) sne.getCause();
134138
assertThat(
@@ -147,7 +151,7 @@ public void testConcurrentDeleteFromOtherCluster() throws InterruptedException {
147151
createFullSnapshot(repoNameOnFirstCluster, "snap-5");
148152
}
149153

150-
public void testConcurrentWipeAndRecreateFromOtherCluster() throws InterruptedException, IOException {
154+
public void testConcurrentWipeAndRecreateFromOtherCluster() throws IOException {
151155
internalCluster().startMasterOnlyNode();
152156
internalCluster().startDataOnlyNode();
153157
final String repoName = "test-repo";

server/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshots.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,10 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
266266

267267
static volatile boolean INTEGRITY_ASSERTIONS_ENABLED = true;
268268

269+
public static boolean areIntegrityAssertionsEnabled() {
270+
return INTEGRITY_ASSERTIONS_ENABLED;
271+
}
272+
269273
public static BlobStoreIndexShardSnapshots fromXContent(XContentParser parser) throws IOException {
270274
XContentParser.Token token = parser.currentToken();
271275
if (token == null) { // New parser

0 commit comments

Comments
 (0)