Skip to content

Commit e8c27a5

Browse files
Allow missing shard stats for restarted nodes for _snapshot/_status
Returns an empty shard stats for shard entries where stats were unavailable in the case where a node has been restarted or left the cluster. The change adds a 'description' field to the SnapshotIndexShardStatus class that is used to include a message indicating why the stats are empty. This change was motivated by a desire to reduce latency for getting the stats for currently running snapshots. The stats can still be loaded from the repository via a _snapshot/<repository>/snapshot/_status call. Closes ES-10982
1 parent 01a9620 commit e8c27a5

File tree

6 files changed

+206
-22
lines changed

6 files changed

+206
-22
lines changed

server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,8 @@ public void testGetSnapshotsWithoutIndices() throws Exception {
212212
* 1. Start snapshot of two shards (both located on separate data nodes).
213213
* 2. Have one of the shards snapshot completely and the other block
214214
* 3. Restart the data node that completed its shard snapshot
215-
* 4. Make sure that snapshot status APIs show correct file-counts and -sizes
215+
* 4. Make sure that snapshot status APIs show correct file-counts and -sizes for non-restarted nodes
216+
* 5. Make sure the description string is set for shard snapshots on restarted nodes.
216217
*
217218
* @throws Exception on failure
218219
*/
@@ -261,8 +262,9 @@ public void testCorrectCountsForDoneShards() throws Exception {
261262
indexTwo
262263
);
263264
assertThat(snapshotShardStateAfterNodeRestart.getStage(), is(SnapshotIndexShardStage.DONE));
264-
assertThat(snapshotShardStateAfterNodeRestart.getStats().getTotalFileCount(), equalTo(totalFiles));
265-
assertThat(snapshotShardStateAfterNodeRestart.getStats().getTotalSize(), equalTo(totalFileSize));
265+
assertNotNull("expected a description string for missing stats", snapshotShardStateAfterNodeRestart.getDescription());
266+
assertThat(snapshotShardStateAfterNodeRestart.getStats().getTotalFileCount(), equalTo(0));
267+
assertThat(snapshotShardStateAfterNodeRestart.getStats().getTotalSize(), equalTo(0L));
266268

267269
unblockAllDataNodes(repoName);
268270
assertThat(responseSnapshotOne.get().getSnapshotInfo().state(), is(SnapshotState.SUCCESS));

server/src/main/java/org/elasticsearch/TransportVersions.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ static TransportVersion def(int id) {
259259
public static final TransportVersion ESQL_TIME_SERIES_SOURCE_STATUS = def(9_076_0_00);
260260
public static final TransportVersion ESQL_HASH_OPERATOR_STATUS_OUTPUT_TIME = def(9_077_0_00);
261261
public static final TransportVersion ML_INFERENCE_HUGGING_FACE_CHAT_COMPLETION_ADDED = def(9_078_0_00);
262+
public static final TransportVersion SNAPSHOT_INDEX_SHARD_STATUS_DESCRIPTION_ADDED = def(9_079_0_00);
262263

263264
/*
264265
* STOP! READ THIS FIRST! No, really,

server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import java.io.IOException;
2222
import java.util.Objects;
2323

24+
import static org.elasticsearch.TransportVersions.SNAPSHOT_INDEX_SHARD_STATUS_DESCRIPTION_ADDED;
25+
2426
public class SnapshotIndexShardStatus extends BroadcastShardResponse implements ToXContentFragment {
2527

2628
private final SnapshotIndexShardStage stage;
@@ -31,12 +33,17 @@ public class SnapshotIndexShardStatus extends BroadcastShardResponse implements
3133

3234
private String failure;
3335

36+
private String description;
37+
3438
public SnapshotIndexShardStatus(StreamInput in) throws IOException {
3539
super(in);
3640
stage = SnapshotIndexShardStage.fromValue(in.readByte());
3741
stats = new SnapshotStats(in);
3842
nodeId = in.readOptionalString();
3943
failure = in.readOptionalString();
44+
if (in.getTransportVersion().onOrAfter(SNAPSHOT_INDEX_SHARD_STATUS_DESCRIPTION_ADDED)) {
45+
description = in.readOptionalString();
46+
}
4047
}
4148

4249
SnapshotIndexShardStatus(ShardId shardId, SnapshotIndexShardStage stage) {
@@ -74,11 +81,23 @@ public SnapshotIndexShardStatus(StreamInput in) throws IOException {
7481
}
7582

7683
SnapshotIndexShardStatus(ShardId shardId, SnapshotIndexShardStage stage, SnapshotStats stats, String nodeId, String failure) {
84+
this(shardId, stage, stats, nodeId, failure, null);
85+
}
86+
87+
SnapshotIndexShardStatus(
88+
ShardId shardId,
89+
SnapshotIndexShardStage stage,
90+
SnapshotStats stats,
91+
String nodeId,
92+
String failure,
93+
String description
94+
) {
7795
super(shardId);
7896
this.stage = stage;
7997
this.stats = stats;
8098
this.nodeId = nodeId;
8199
this.failure = failure;
100+
this.description = description;
82101
}
83102

84103
/**
@@ -109,19 +128,30 @@ public String getFailure() {
109128
return failure;
110129
}
111130

131+
/**
132+
* Returns the optional description of the data values contained in the {@code stats} field.
133+
*/
134+
public String getDescription() {
135+
return description;
136+
}
137+
112138
@Override
113139
public void writeTo(StreamOutput out) throws IOException {
114140
super.writeTo(out);
115141
out.writeByte(stage.value());
116142
stats.writeTo(out);
117143
out.writeOptionalString(nodeId);
118144
out.writeOptionalString(failure);
145+
if (out.getTransportVersion().onOrAfter(SNAPSHOT_INDEX_SHARD_STATUS_DESCRIPTION_ADDED)) {
146+
out.writeOptionalString(description);
147+
}
119148
}
120149

121150
static final class Fields {
122151
static final String STAGE = "stage";
123152
static final String REASON = "reason";
124153
static final String NODE = "node";
154+
static final String DESCRIPTION = "description";
125155
}
126156

127157
@Override
@@ -135,6 +165,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
135165
if (getFailure() != null) {
136166
builder.field(Fields.REASON, getFailure());
137167
}
168+
if (getDescription() != null) {
169+
builder.field(Fields.DESCRIPTION, getDescription());
170+
}
138171
builder.endObject();
139172
return builder;
140173
}
@@ -151,7 +184,8 @@ public boolean equals(Object o) {
151184
return stage == that.stage
152185
&& Objects.equals(stats, that.stats)
153186
&& Objects.equals(nodeId, that.nodeId)
154-
&& Objects.equals(failure, that.failure);
187+
&& Objects.equals(failure, that.failure)
188+
&& Objects.equals(description, that.description);
155189
}
156190

157191
@Override
@@ -160,6 +194,7 @@ public int hashCode() {
160194
result = 31 * result + (stats != null ? stats.hashCode() : 0);
161195
result = 31 * result + (nodeId != null ? nodeId.hashCode() : 0);
162196
result = 31 * result + (failure != null ? failure.hashCode() : 0);
197+
result = 31 * result + (description != null ? description.hashCode() : 0);
163198
return result;
164199
}
165200

server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -196,13 +196,13 @@ void buildResponse(
196196
}
197197
SnapshotsInProgress.ShardSnapshotStatus status = shardEntry.getValue();
198198
if (status.nodeId() != null) {
199-
// We should have information about this shard from the shard:
199+
// We should have information about this shard from the node:
200200
TransportNodesSnapshotsStatus.NodeSnapshotStatus nodeStatus = nodeSnapshotStatusMap.get(status.nodeId());
201201
if (nodeStatus != null) {
202-
Map<ShardId, SnapshotIndexShardStatus> shardStatues = nodeStatus.status().get(entry.snapshot());
203-
if (shardStatues != null) {
202+
Map<ShardId, SnapshotIndexShardStatus> shardStatuses = nodeStatus.status().get(entry.snapshot());
203+
if (shardStatuses != null) {
204204
final ShardId sid = entry.shardId(shardEntry.getKey());
205-
SnapshotIndexShardStatus shardStatus = shardStatues.get(sid);
205+
SnapshotIndexShardStatus shardStatus = shardStatuses.get(sid);
206206
if (shardStatus != null) {
207207
// We have full information about this shard
208208
if (shardStatus.getStage() == SnapshotIndexShardStage.DONE
@@ -228,26 +228,25 @@ void buildResponse(
228228
}
229229
// We failed to find the status of the shard from the responses we received from data nodes.
230230
// This can happen if nodes drop out of the cluster completely or restart during the snapshot.
231-
// We rebuild the information they would have provided from their in memory state from the cluster
232-
// state and the repository contents in the below logic
233231
final SnapshotIndexShardStage stage = switch (shardEntry.getValue().state()) {
234232
case FAILED, ABORTED, MISSING -> SnapshotIndexShardStage.FAILURE;
235233
case INIT, WAITING, PAUSED_FOR_NODE_REMOVAL, QUEUED -> SnapshotIndexShardStage.STARTED;
236234
case SUCCESS -> SnapshotIndexShardStage.DONE;
237235
};
238236
final SnapshotIndexShardStatus shardStatus;
239237
if (stage == SnapshotIndexShardStage.DONE) {
240-
// Shard snapshot completed successfully so we should be able to load the exact statistics for this
241-
// shard from the repository already.
242-
final ShardId shardId = entry.shardId(shardEntry.getKey());
238+
// When processing currently running snapshots, instead of reading the statistics from the repository, which can be
239+
// expensive, we choose instead to provide a message to the caller explaining why the stats are missing and the API
240+
// that can be used to load them.
243241
shardStatus = new SnapshotIndexShardStatus(
244-
shardId,
245-
repositoriesService.repository(entry.repository())
246-
.getShardSnapshotStatus(
247-
entry.snapshot().getSnapshotId(),
248-
entry.indices().get(shardId.getIndexName()),
249-
shardId
250-
)
242+
entry.shardId(shardEntry.getKey()),
243+
stage,
244+
new SnapshotStats(),
245+
null,
246+
null,
247+
"""
248+
Snapshot shard stats missing from a currently running snapshot due to a node leaving the cluster after \
249+
completing the snapshot; use /_snapshot/<repository>/<snapshot>/_status to load from the repository."""
251250
);
252251
} else {
253252
shardStatus = new SnapshotIndexShardStatus(entry.shardId(shardEntry.getKey()), stage);

server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatusTests.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,13 @@ protected SnapshotIndexShardStatus createForIndex(String indexName) {
3939
SnapshotStats stats = new SnapshotStatsTests().createTestInstance();
4040
String nodeId = randomAlphaOfLength(20);
4141
String failure = null;
42+
String description = null;
4243
if (rarely()) {
4344
failure = randomAlphaOfLength(200);
45+
} else if (rarely()) {
46+
description = randomAlphaOfLength(200);
4447
}
45-
return new SnapshotIndexShardStatus(shardId, stage, stats, nodeId, failure);
48+
return new SnapshotIndexShardStatus(shardId, stage, stats, nodeId, failure, description);
4649
}
4750

4851
@Override
@@ -76,6 +79,7 @@ protected boolean supportsUnknownFields() {
7679
String rawStage = (String) parsedObjects[i++];
7780
String nodeId = (String) parsedObjects[i++];
7881
String failure = (String) parsedObjects[i++];
82+
String description = (String) parsedObjects[i++];
7983
SnapshotStats stats = (SnapshotStats) parsedObjects[i];
8084

8185
SnapshotIndexShardStage stage;
@@ -89,12 +93,13 @@ protected boolean supportsUnknownFields() {
8993
rawStage
9094
);
9195
}
92-
return new SnapshotIndexShardStatus(shard, stage, stats, nodeId, failure);
96+
return new SnapshotIndexShardStatus(shard, stage, stats, nodeId, failure, description);
9397
}
9498
);
9599
innerParser.declareString(constructorArg(), new ParseField(SnapshotIndexShardStatus.Fields.STAGE));
96100
innerParser.declareString(optionalConstructorArg(), new ParseField(SnapshotIndexShardStatus.Fields.NODE));
97101
innerParser.declareString(optionalConstructorArg(), new ParseField(SnapshotIndexShardStatus.Fields.REASON));
102+
innerParser.declareString(optionalConstructorArg(), new ParseField(SnapshotIndexShardStatus.Fields.DESCRIPTION));
98103
innerParser.declareObject(constructorArg(), (p, c) -> SnapshotStats.fromXContent(p), new ParseField(SnapshotStats.Fields.STATS));
99104
PARSER = (p, indexId, shardName) -> {
100105
// Combine the index name in the context with the shard name passed in for the named object parser

server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusActionTests.java

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,16 @@
1414
import org.elasticsearch.client.internal.node.NodeClient;
1515
import org.elasticsearch.cluster.SnapshotsInProgress;
1616
import org.elasticsearch.cluster.metadata.ProjectId;
17+
import org.elasticsearch.cluster.node.DiscoveryNode;
1718
import org.elasticsearch.cluster.service.ClusterService;
19+
import org.elasticsearch.common.transport.TransportAddress;
20+
import org.elasticsearch.common.unit.ByteSizeValue;
1821
import org.elasticsearch.index.IndexVersion;
1922
import org.elasticsearch.index.shard.ShardId;
2023
import org.elasticsearch.repositories.IndexId;
2124
import org.elasticsearch.repositories.RepositoriesService;
2225
import org.elasticsearch.repositories.ShardGeneration;
26+
import org.elasticsearch.repositories.ShardSnapshotResult;
2327
import org.elasticsearch.snapshots.Snapshot;
2428
import org.elasticsearch.snapshots.SnapshotId;
2529
import org.elasticsearch.tasks.CancellableTask;
@@ -200,4 +204,142 @@ public void onFailure(Exception e) {
200204
);
201205
assertTrue("Expected listener to be invoked", listenerInvoked.get());
202206
}
207+
208+
public void testShardSnapshotMissingDataFromNodeWhenNodeHasBeenRestarted() {
209+
final var snapshot = new Snapshot(ProjectId.DEFAULT, "test-repo", new SnapshotId("snapshot", "uuid"));
210+
final var indexName = "test-index-name";
211+
final var indexUuid = "test-index-uuid";
212+
final var shardGeneration = new ShardGeneration("gen");
213+
final var shardId2 = new ShardId(indexName, indexUuid, 2);
214+
final var nowMsecs = System.currentTimeMillis();
215+
final var eightKb = ByteSizeValue.ofKb(8).getBytes();
216+
217+
final var currentSnapshotEntries = List.of(
218+
SnapshotsInProgress.Entry.snapshot(
219+
snapshot,
220+
randomBoolean(),
221+
randomBoolean(),
222+
SnapshotsInProgress.State.STARTED,
223+
Map.of(indexName, new IndexId(indexName, indexUuid)),
224+
List.of(),
225+
List.of(),
226+
randomNonNegativeLong(),
227+
randomNonNegativeLong(),
228+
Map.of(
229+
new ShardId(indexName, indexUuid, 0),
230+
SnapshotsInProgress.ShardSnapshotStatus.success(
231+
"nodeId0",
232+
new ShardSnapshotResult(shardGeneration, ByteSizeValue.ofKb(5), 1)
233+
),
234+
new ShardId(indexName, indexUuid, 1),
235+
new SnapshotsInProgress.ShardSnapshotStatus("nodeId1", shardGeneration),
236+
shardId2,
237+
SnapshotsInProgress.ShardSnapshotStatus.success(
238+
"nodeId2",
239+
new ShardSnapshotResult(shardGeneration, ByteSizeValue.ofKb(8), 1)
240+
)
241+
),
242+
null,
243+
Map.of(),
244+
IndexVersion.current()
245+
)
246+
);
247+
final var nodeSnapshotStatuses = new TransportNodesSnapshotsStatus.NodesSnapshotStatus(
248+
clusterService.getClusterName(),
249+
List.of(
250+
new TransportNodesSnapshotsStatus.NodeSnapshotStatus(
251+
new DiscoveryNode(
252+
"nodeName0",
253+
"nodeId0",
254+
new TransportAddress(TransportAddress.META_ADDRESS, 9000),
255+
Map.of(),
256+
Set.of(),
257+
null
258+
),
259+
// Here we are missing the snapshot data for the shard on this node.
260+
Map.of()
261+
),
262+
new TransportNodesSnapshotsStatus.NodeSnapshotStatus(
263+
new DiscoveryNode(
264+
"nodeName2",
265+
"nodeId2",
266+
new TransportAddress(TransportAddress.META_ADDRESS, 9002),
267+
Map.of(),
268+
Set.of(),
269+
null
270+
),
271+
Map.of(
272+
snapshot,
273+
Map.of(
274+
shardId2,
275+
new SnapshotIndexShardStatus(
276+
new ShardId(indexName, indexUuid, 2),
277+
SnapshotIndexShardStage.DONE,
278+
new SnapshotStats(nowMsecs, 0, 1, 1, 1, eightKb, eightKb, eightKb),
279+
"nodeId2",
280+
null
281+
)
282+
)
283+
)
284+
)
285+
),
286+
List.of()
287+
);
288+
289+
final Consumer<SnapshotsStatusResponse> verifyResponse = rsp -> {
290+
assertNotNull(rsp);
291+
final var snapshotStatuses = rsp.getSnapshots();
292+
assertNotNull(snapshotStatuses);
293+
assertEquals(
294+
"expected 1 snapshot status, got " + snapshotStatuses.size() + ": " + snapshotStatuses,
295+
1,
296+
snapshotStatuses.size()
297+
);
298+
final var snapshotStatus = snapshotStatuses.getFirst();
299+
assertEquals(SnapshotsInProgress.State.STARTED, snapshotStatus.getState());
300+
final var shardStats = snapshotStatus.getShardsStats();
301+
assertNotNull("expected non-null shard stats for SnapshotStatus: " + snapshotStatus, shardStats);
302+
assertEquals(new SnapshotShardsStats(0, 1 /* started */, 0, 2 /* done */, 0, 3 /* total */), shardStats);
303+
final var totalStats = snapshotStatus.getStats();
304+
assertNotNull("expected non-null total stats for SnapshotStatus: " + snapshotStatus, snapshotStatus);
305+
assertEquals("expected total file count to be 1 in the stats: " + totalStats, 1, totalStats.getTotalFileCount());
306+
assertEquals("expected total size to be " + eightKb + " in the stats: " + totalStats, eightKb, totalStats.getTotalSize());
307+
final var snapshotStatusIndices = snapshotStatus.getIndices();
308+
assertNotNull("expected a non-null map from getIndices() from SnapshotStatus: " + snapshotStatus, snapshotStatusIndices);
309+
final var snapshotIndexStatus = snapshotStatusIndices.get(indexName);
310+
assertNotNull(
311+
"no entry for indexName [" + indexName + "] found in snapshotStatusIndices: " + snapshotStatusIndices,
312+
snapshotIndexStatus
313+
);
314+
final var shardMap = snapshotIndexStatus.getShards();
315+
assertNotNull("expected a non-null shard map for SnapshotIndexStatus: " + snapshotIndexStatus, shardMap);
316+
final var shard0Entry = shardMap.get(0);
317+
assertNotNull("no entry for shard 0 found in indexName [" + indexName + "] shardMap: " + shardMap, shard0Entry);
318+
assertNotNull("expected a description string for shard 0 with missing stats from node0", shard0Entry.getDescription());
319+
};
320+
321+
final var listener = new ActionListener<SnapshotsStatusResponse>() {
322+
@Override
323+
public void onResponse(SnapshotsStatusResponse rsp) {
324+
verifyResponse.accept(rsp);
325+
}
326+
327+
@Override
328+
public void onFailure(Exception e) {
329+
fail("expected onResponse() instead of onFailure(" + e + ")");
330+
}
331+
};
332+
333+
final var listenerInvoked = new AtomicBoolean(false);
334+
335+
action.buildResponse(
336+
SnapshotsInProgress.EMPTY,
337+
new SnapshotsStatusRequest(TEST_REQUEST_TIMEOUT),
338+
currentSnapshotEntries,
339+
nodeSnapshotStatuses,
340+
new CancellableTask(randomLong(), "type", "action", "desc", null, Map.of()),
341+
ActionListener.runAfter(listener, () -> listenerInvoked.set(true))
342+
);
343+
assertTrue("Expected listener to be invoked", listenerInvoked.get());
344+
}
203345
}

0 commit comments

Comments
 (0)