Skip to content

Commit 1fb3a1b

Browse files
authored
More detailed logging for snapshot-based recovery (#93469)
The logic for planning a snapshot-based recovery is a little intricate and lacks much logging, making it hard to determine the reason why a snapshot-based recovery might not occur in production. This commit adds some extra logging showing details of the branches that ES takes.
1 parent d3d1bbc commit 1fb3a1b

File tree

3 files changed

+48
-3
lines changed

3 files changed

+48
-3
lines changed

server/src/main/java/org/elasticsearch/indices/recovery/plan/ShardSnapshotsService.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ public ShardSnapshotsService(
7575
}
7676

7777
public void fetchLatestSnapshotsForShard(ShardId shardId, ActionListener<Optional<ShardSnapshot>> listener) {
78-
assert shardId != null : "SharId was null but a value was expected";
78+
assert shardId != null : "ShardId was null but a value was expected";
7979

8080
final RepositoriesMetadata currentReposMetadata = clusterService.state()
8181
.metadata()
@@ -104,24 +104,30 @@ public void fetchLatestSnapshotsForShard(ShardId shardId, ActionListener<Optiona
104104
client.execute(
105105
GetShardSnapshotAction.INSTANCE,
106106
request,
107-
new ThreadedActionListener<>(threadPool.generic(), listener.map(this::fetchSnapshotFiles))
107+
new ThreadedActionListener<>(
108+
threadPool.generic(),
109+
listener.map(shardSnapshotResponse -> fetchSnapshotFiles(shardId, shardSnapshotResponse))
110+
)
108111
);
109112
}
110113

111-
private Optional<ShardSnapshot> fetchSnapshotFiles(GetShardSnapshotResponse shardSnapshotResponse) {
114+
private Optional<ShardSnapshot> fetchSnapshotFiles(ShardId shardId, GetShardSnapshotResponse shardSnapshotResponse) {
112115
assert ThreadPool.assertCurrentThreadPool(ThreadPool.Names.GENERIC);
113116

114117
final Optional<ShardSnapshotInfo> latestShardSnapshotOpt = shardSnapshotResponse.getLatestShardSnapshot();
115118
if (latestShardSnapshotOpt.isEmpty()) {
119+
logger.debug("{} no latest shard snapshot found", shardId);
116120
return Optional.empty();
117121
}
118122

119123
final ShardSnapshotInfo latestShardSnapshot = latestShardSnapshotOpt.get();
120124
try {
121125
final Snapshot snapshot = latestShardSnapshot.getSnapshot();
126+
logger.debug("{} considering recovery from [{}][{}]", shardId, snapshot.getRepository(), snapshot.getSnapshotId());
122127

123128
final Repository repository = repositoriesService.repository(snapshot.getRepository());
124129
if (repository instanceof BlobStoreRepository == false) {
130+
logger.debug("{} not recovering from snapshot in non-blobstore repository [{}]", shardId, snapshot.getRepository());
125131
return Optional.empty();
126132
}
127133

server/src/main/java/org/elasticsearch/repositories/IndexSnapshotsService.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
package org.elasticsearch.repositories;
1010

11+
import org.apache.logging.log4j.LogManager;
12+
import org.apache.logging.log4j.Logger;
1113
import org.elasticsearch.action.ActionListener;
1214
import org.elasticsearch.action.StepListener;
1315
import org.elasticsearch.cluster.metadata.IndexMetadata;
@@ -28,6 +30,9 @@
2830
import java.util.Optional;
2931

3032
public class IndexSnapshotsService {
33+
34+
private static final Logger logger = LogManager.getLogger(IndexSnapshotsService.class);
35+
3136
private static final Comparator<Tuple<SnapshotId, RepositoryData.SnapshotDetails>> START_TIME_COMPARATOR = Comparator.<
3237
Tuple<SnapshotId, RepositoryData.SnapshotDetails>>comparingLong(pair -> pair.v2().getStartTimeMillis()).thenComparing(Tuple::v1);
3338

@@ -62,6 +67,7 @@ public void getLatestSuccessfulSnapshotForShard(
6267

6368
repositoryDataStepListener.whenComplete(repositoryData -> {
6469
if (repositoryData.hasIndex(indexName) == false) {
70+
logger.debug("{} repository [{}] has no snapshots of this index", shardId, repositoryName);
6571
listener.onResponse(Optional.empty());
6672
return;
6773
}
@@ -81,11 +87,13 @@ public void getLatestSuccessfulSnapshotForShard(
8187
// have the start/end date populated in RepositoryData. We could fetch all the backups and find out if there is
8288
// a valid candidate, but for simplicity we just consider that we couldn't find any valid snapshot. Existing
8389
// snapshots start/end timestamps should appear in the RepositoryData eventually.
90+
logger.debug("{} could not determine latest snapshot of this shard in repository [{}]", shardId, repositoryName);
8491
listener.onResponse(Optional.empty());
8592
return;
8693
}
8794

8895
final SnapshotId snapshotId = latestSnapshotId.get();
96+
logger.debug("{} fetching details of [{}][{}]", shardId, repositoryName, snapshotId);
8997
repository.getSnapshotInfo(
9098
snapshotId,
9199
snapshotInfoStepListener.map(
@@ -100,6 +108,7 @@ public void getLatestSuccessfulSnapshotForShard(
100108

101109
if (snapshotInfo == null || snapshotInfo.state() != SnapshotState.SUCCESS) {
102110
// We couldn't find a valid candidate
111+
logger.debug("{} failed to retrieve snapshot details from [{}]", shardId, repositoryName);
103112
listener.onResponse(Optional.empty());
104113
return;
105114
}

x-pack/plugin/snapshot-based-recoveries/src/main/java/org/elasticsearch/xpack/snapshotbasedrecoveries/recovery/plan/SnapshotsRecoveryPlannerService.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
import org.apache.logging.log4j.Logger;
1212
import org.elasticsearch.Version;
1313
import org.elasticsearch.action.ActionListener;
14+
import org.elasticsearch.common.Strings;
1415
import org.elasticsearch.core.Nullable;
1516
import org.elasticsearch.index.shard.ShardId;
17+
import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot;
1618
import org.elasticsearch.index.store.Store;
1719
import org.elasticsearch.index.store.StoreFileMetadata;
1820
import org.elasticsearch.indices.recovery.RecoverySettings;
@@ -68,6 +70,7 @@ public void computeRecoveryPlan(
6870
latestSnapshotOpt -> ActionListener.completeWith(
6971
listener,
7072
() -> computeRecoveryPlanWithSnapshots(
73+
shardId,
7174
shardStateIdentifier,
7275
sourceMetadata,
7376
targetMetadata,
@@ -80,6 +83,7 @@ public void computeRecoveryPlan(
8083
}
8184

8285
private ShardRecoveryPlan computeRecoveryPlanWithSnapshots(
86+
ShardId shardId,
8387
@Nullable String shardStateIdentifier,
8488
Store.MetadataSnapshot sourceMetadata,
8589
Store.MetadataSnapshot targetMetadata,
@@ -91,6 +95,7 @@ private ShardRecoveryPlan computeRecoveryPlanWithSnapshots(
9195
List<StoreFileMetadata> filesMissingInTarget = concatLists(sourceTargetDiff.missing, sourceTargetDiff.different);
9296

9397
if (latestSnapshotOpt.isEmpty()) {
98+
logger.debug("{} no snapshot suitable for recovery found, falling back to recovery from the primary", shardId);
9499
// If we couldn't find any valid snapshots, fallback to the source
95100
return getRecoveryPlanUsingSourceNode(sourceMetadata, sourceTargetDiff, filesMissingInTarget, startingSeqNo, translogOps);
96101
}
@@ -102,6 +107,18 @@ private ShardRecoveryPlan computeRecoveryPlanWithSnapshots(
102107
&& latestSnapshot.hasDifferentPhysicalFiles(sourceMetadata)
103108
&& isSnapshotVersionCompatible(latestSnapshot)
104109
&& sourceTargetDiff.identical.isEmpty()) {
110+
111+
logger.debug(
112+
() -> Strings.format(
113+
"%s primary has changed since snapshot completed, but snapshot still looks ok for recovery: %s",
114+
shardId,
115+
latestSnapshot.getSnapshotFiles()
116+
.stream()
117+
.map(BlobStoreIndexShardSnapshot.FileInfo::physicalName)
118+
.collect(Collectors.joining(", ", "[", "]"))
119+
)
120+
);
121+
105122
// Use the current primary as a fallback if the download fails half-way
106123
ShardRecoveryPlan fallbackPlan = getRecoveryPlanUsingSourceNode(
107124
sourceMetadata,
@@ -141,6 +158,18 @@ && isSnapshotVersionCompatible(latestSnapshot)
141158
);
142159
}
143160

161+
logger.debug(
162+
() -> Strings.format(
163+
"%s attempting snapshot-based recovery of %s based on %s",
164+
shardId,
165+
snapshotFilesToRecover.getSnapshotFiles()
166+
.stream()
167+
.map(BlobStoreIndexShardSnapshot.FileInfo::physicalName)
168+
.collect(Collectors.joining(", ", "[", "]")),
169+
snapshotDiff
170+
)
171+
);
172+
144173
return new ShardRecoveryPlan(
145174
snapshotFilesToRecover,
146175
concatLists(snapshotDiff.missing, snapshotDiff.different),
@@ -183,6 +212,7 @@ private ShardRecoveryPlan getRecoveryPlanUsingSourceNode(
183212

184213
private void fetchLatestSnapshotsIgnoringErrors(ShardId shardId, boolean useSnapshots, Consumer<Optional<ShardSnapshot>> listener) {
185214
if (useSnapshots == false) {
215+
logger.debug("{} recovery will not attempt to use snapshots", shardId);
186216
listener.accept(Optional.empty());
187217
return;
188218
}

0 commit comments

Comments
 (0)