Skip to content

Commit 7fef1cd

Browse files
Log a msg when shard snapshot sees interrupt (#116364)
Adds debug-level logging when a shard snapshot sees interrupt. The case we're interested in is shard snapshot pausing during shutdown (also the only time we pause snapshots). Snapshot abort will also be caught and logged if there's an async error during snapshotting, but this should be uncommon. Relates ES-8773
1 parent fc67f7c commit 7fef1cd

File tree

2 files changed

+39
-3
lines changed

2 files changed

+39
-3
lines changed

server/src/main/java/org/elasticsearch/index/snapshots/IndexShardSnapshotStatus.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,11 @@ public ShardSnapshotResult getShardSnapshotResult() {
245245
}
246246

247247
public void ensureNotAborted() {
248-
switch (stage.get()) {
248+
ensureNotAborted(stage.get());
249+
}
250+
251+
public static void ensureNotAborted(Stage shardSnapshotStage) {
252+
switch (shardSnapshotStage) {
249253
case ABORTED -> throw new AbortedSnapshotException();
250254
case PAUSING -> throw new PausedSnapshotException();
251255
}

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,11 @@
191191
public abstract class BlobStoreRepository extends AbstractLifecycleComponent implements Repository {
192192
private static final Logger logger = LogManager.getLogger(BlobStoreRepository.class);
193193

194+
private class ShutdownLogger {
195+
// Creating a separate logger so that the log-level can be manipulated separately from the parent class.
196+
private static final Logger shutdownLogger = LogManager.getLogger(ShutdownLogger.class);
197+
}
198+
194199
protected volatile RepositoryMetadata metadata;
195200

196201
protected final ThreadPool threadPool;
@@ -3467,10 +3472,37 @@ private void doSnapshotShard(SnapshotShardContext context) {
34673472
}
34683473

34693474
private static void ensureNotAborted(ShardId shardId, SnapshotId snapshotId, IndexShardSnapshotStatus snapshotStatus, String fileName) {
3475+
var shardSnapshotStage = snapshotStatus.getStage();
34703476
try {
3471-
snapshotStatus.ensureNotAborted();
3477+
IndexShardSnapshotStatus.ensureNotAborted(shardSnapshotStage);
3478+
3479+
if (shardSnapshotStage != IndexShardSnapshotStatus.Stage.INIT && shardSnapshotStage != IndexShardSnapshotStatus.Stage.STARTED) {
3480+
// A normally running shard snapshot should be in stage INIT or STARTED. And we know it's not in PAUSING or ABORTED because
3481+
// the ensureNotAborted() call above did not throw. The remaining options don't make sense, if they ever happen.
3482+
logger.error(
3483+
() -> Strings.format(
3484+
"Shard snapshot found an unexpected state. ShardId [{}], SnapshotID [{}], Stage [{}]",
3485+
shardId,
3486+
snapshotId,
3487+
shardSnapshotStage
3488+
)
3489+
);
3490+
assert false;
3491+
}
34723492
} catch (Exception e) {
3473-
logger.debug("[{}] [{}] {} on the file [{}], exiting", shardId, snapshotId, e.getMessage(), fileName);
3493+
// We want to see when a shard snapshot operation checks for and finds an interrupt signal during shutdown. A
3494+
// PausedSnapshotException indicates we're in shutdown because that's the only case when shard snapshots are signaled to pause.
3495+
// An AbortedSnapshotException may also occur during shutdown if an uncommon error occurs.
3496+
ShutdownLogger.shutdownLogger.debug(
3497+
() -> Strings.format(
3498+
"Shard snapshot operation is aborting. ShardId [%s], SnapshotID [%s], File [%s], Stage [%s]",
3499+
shardId,
3500+
snapshotId,
3501+
fileName,
3502+
shardSnapshotStage
3503+
),
3504+
e
3505+
);
34743506
assert e instanceof AbortedSnapshotException || e instanceof PausedSnapshotException : e;
34753507
throw e;
34763508
}

0 commit comments

Comments
 (0)