Skip to content

Commit 7fb6ca4

Browse files
authored
Add ephemeral node id to shutdown metadata (#118722)
Shutdown metadata is keyed on node id. This makes sense since only one node with a given node id can exist within a cluster. However, it is possible that shutdown was initiated for one instance of a node, but that node is restarted. This commit adds the ephemeral node id to shutdown metadata so that nodes with the same id but different ephemeral id can be distinguished.
1 parent 12acc89 commit 7fb6ca4

File tree

28 files changed

+133
-36
lines changed

28 files changed

+133
-36
lines changed

docs/reference/shutdown/apis/shutdown-get.asciidoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ including the status of shard migration, task migration, and plugin cleanup:
7272
"nodes": [
7373
{
7474
"node_id": "USpTGYaBSIKbgSUJR2Z9lg",
75+
"node_ephemeral_id": null,
7576
"type": "RESTART",
7677
"reason": "Demonstrating how the node shutdown API works",
7778
"shutdown_startedmillis": 1624406108685,

server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -753,11 +753,16 @@ private static void putShutdownMetadata(
753753
clusterService.submitUnbatchedStateUpdateTask("mark node for removal", new ClusterStateUpdateTask() {
754754
@Override
755755
public ClusterState execute(ClusterState currentState) {
756-
final var nodeId = currentState.nodes().resolveNode(nodeName).getId();
756+
final var node = currentState.nodes().resolveNode(nodeName);
757757
return currentState.copyAndUpdateMetadata(
758758
mdb -> mdb.putCustom(
759759
NodesShutdownMetadata.TYPE,
760-
new NodesShutdownMetadata(Map.of(nodeId, shutdownMetadataBuilder.setNodeId(nodeId).build()))
760+
new NodesShutdownMetadata(
761+
Map.of(
762+
node.getId(),
763+
shutdownMetadataBuilder.setNodeId(node.getId()).setNodeEphemeralId(node.getEphemeralId()).build()
764+
)
765+
)
761766
)
762767
);
763768
}

server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStressTestsIT.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1235,15 +1235,16 @@ public ClusterState execute(ClusterState currentState) {
12351235
Strings.toString(currentState),
12361236
currentState.metadata().nodeShutdowns().getAll().isEmpty()
12371237
);
1238-
final var nodeId = currentState.nodes().resolveNode(node.nodeName).getId();
1238+
final var discoveryNode = currentState.nodes().resolveNode(node.nodeName);
12391239
return currentState.copyAndUpdateMetadata(
12401240
mdb -> mdb.putCustom(
12411241
NodesShutdownMetadata.TYPE,
12421242
new NodesShutdownMetadata(
12431243
Map.of(
1244-
nodeId,
1244+
discoveryNode.getId(),
12451245
SingleNodeShutdownMetadata.builder()
1246-
.setNodeId(nodeId)
1246+
.setNodeId(discoveryNode.getId())
1247+
.setNodeEphemeralId(discoveryNode.getEphemeralId())
12471248
.setType(SingleNodeShutdownMetadata.Type.REMOVE)
12481249
.setStartedAtMillis(clusterService.threadPool().absoluteTimeInMillis())
12491250
.setReason("test")

server/src/main/java/org/elasticsearch/TransportVersions.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ static TransportVersion def(int id) {
147147
public static final TransportVersion FAILURE_STORE_ENABLED_BY_CLUSTER_SETTING = def(8_812_00_0);
148148
public static final TransportVersion SIMULATE_IGNORED_FIELDS = def(8_813_00_0);
149149
public static final TransportVersion TRANSFORMS_UPGRADE_MODE = def(8_814_00_0);
150+
public static final TransportVersion NODE_SHUTDOWN_EPHEMERAL_ID_ADDED = def(8_815_00_0);
150151

151152
/*
152153
* STOP! READ THIS FIRST! No, really,

server/src/main/java/org/elasticsearch/cluster/metadata/SingleNodeShutdownMetadata.java

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import java.util.Locale;
2929
import java.util.Objects;
3030

31+
import static org.elasticsearch.TransportVersions.NODE_SHUTDOWN_EPHEMERAL_ID_ADDED;
3132
import static org.elasticsearch.core.Strings.format;
3233

3334
/**
@@ -40,6 +41,7 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
4041
public static final TransportVersion GRACE_PERIOD_ADDED_VERSION = TransportVersions.V_8_9_X;
4142

4243
public static final ParseField NODE_ID_FIELD = new ParseField("node_id");
44+
public static final ParseField NODE_EPHEMERAL_ID_FIELD = new ParseField("node_ephemeral_id");
4345
public static final ParseField TYPE_FIELD = new ParseField("type");
4446
public static final ParseField REASON_FIELD = new ParseField("reason");
4547
public static final String STARTED_AT_READABLE_FIELD = "shutdown_started";
@@ -53,18 +55,25 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
5355
"node_shutdown_info",
5456
a -> new SingleNodeShutdownMetadata(
5557
(String) a[0],
56-
Type.valueOf((String) a[1]),
57-
(String) a[2],
58-
(long) a[3],
59-
(boolean) a[4],
60-
(TimeValue) a[5],
61-
(String) a[6],
62-
(TimeValue) a[7]
58+
(String) a[1],
59+
Type.valueOf((String) a[2]),
60+
(String) a[3],
61+
(long) a[4],
62+
(boolean) a[5],
63+
(TimeValue) a[6],
64+
(String) a[7],
65+
(TimeValue) a[8]
6366
)
6467
);
6568

6669
static {
6770
PARSER.declareString(ConstructingObjectParser.constructorArg(), NODE_ID_FIELD);
71+
PARSER.declareField(
72+
ConstructingObjectParser.optionalConstructorArg(),
73+
(p, c) -> p.textOrNull(),
74+
NODE_EPHEMERAL_ID_FIELD,
75+
ObjectParser.ValueType.STRING_OR_NULL
76+
);
6877
PARSER.declareString(ConstructingObjectParser.constructorArg(), TYPE_FIELD);
6978
PARSER.declareString(ConstructingObjectParser.constructorArg(), REASON_FIELD);
7079
PARSER.declareLong(ConstructingObjectParser.constructorArg(), STARTED_AT_MILLIS_FIELD);
@@ -91,6 +100,8 @@ public static SingleNodeShutdownMetadata parse(XContentParser parser) {
91100
public static final TimeValue DEFAULT_RESTART_SHARD_ALLOCATION_DELAY = TimeValue.timeValueMinutes(5);
92101

93102
private final String nodeId;
103+
@Nullable
104+
private final String nodeEphemeralId;
94105
private final Type type;
95106
private final String reason;
96107
private final long startedAtMillis;
@@ -110,6 +121,7 @@ public static SingleNodeShutdownMetadata parse(XContentParser parser) {
110121
*/
111122
private SingleNodeShutdownMetadata(
112123
String nodeId,
124+
@Nullable String nodeEphemeralId,
113125
Type type,
114126
String reason,
115127
long startedAtMillis,
@@ -119,6 +131,7 @@ private SingleNodeShutdownMetadata(
119131
@Nullable TimeValue gracePeriod
120132
) {
121133
this.nodeId = Objects.requireNonNull(nodeId, "node ID must not be null");
134+
this.nodeEphemeralId = nodeEphemeralId;
122135
this.type = Objects.requireNonNull(type, "shutdown type must not be null");
123136
this.reason = Objects.requireNonNull(reason, "shutdown reason must not be null");
124137
this.startedAtMillis = startedAtMillis;
@@ -157,6 +170,11 @@ private SingleNodeShutdownMetadata(
157170

158171
public SingleNodeShutdownMetadata(StreamInput in) throws IOException {
159172
this.nodeId = in.readString();
173+
if (in.getTransportVersion().onOrAfter(NODE_SHUTDOWN_EPHEMERAL_ID_ADDED)) {
174+
this.nodeEphemeralId = in.readOptionalString();
175+
} else {
176+
this.nodeEphemeralId = null; // empty when talking to old nodes, meaning the persistent node id is the only differentiator
177+
}
160178
this.type = in.readEnum(Type.class);
161179
this.reason = in.readString();
162180
this.startedAtMillis = in.readVLong();
@@ -181,6 +199,15 @@ public String getNodeId() {
181199
return nodeId;
182200
}
183201

202+
/**
203+
* @return The ephemeral ID of the node this {@link SingleNodeShutdownMetadata} concerns, or
204+
* {@code null} if the ephemeral id is unknown.
205+
*/
206+
@Nullable
207+
public String getNodeEphemeralId() {
208+
return nodeEphemeralId;
209+
}
210+
184211
/**
185212
* @return The type of shutdown this is (shutdown vs. permanent).
186213
*/
@@ -241,6 +268,9 @@ public TimeValue getGracePeriod() {
241268
@Override
242269
public void writeTo(StreamOutput out) throws IOException {
243270
out.writeString(nodeId);
271+
if (out.getTransportVersion().onOrAfter(NODE_SHUTDOWN_EPHEMERAL_ID_ADDED)) {
272+
out.writeOptionalString(nodeEphemeralId);
273+
}
244274
if ((out.getTransportVersion().before(REPLACE_SHUTDOWN_TYPE_ADDED_VERSION) && this.type == SingleNodeShutdownMetadata.Type.REPLACE)
245275
|| (out.getTransportVersion().before(SIGTERM_ADDED_VERSION) && this.type == Type.SIGTERM)) {
246276
out.writeEnum(SingleNodeShutdownMetadata.Type.REMOVE);
@@ -264,6 +294,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
264294
builder.startObject();
265295
{
266296
builder.field(NODE_ID_FIELD.getPreferredName(), nodeId);
297+
builder.field(NODE_EPHEMERAL_ID_FIELD.getPreferredName(), nodeEphemeralId);
267298
builder.field(TYPE_FIELD.getPreferredName(), type);
268299
builder.field(REASON_FIELD.getPreferredName(), reason);
269300
builder.timestampFieldsFromUnixEpochMillis(
@@ -295,6 +326,7 @@ public boolean equals(Object o) {
295326
return getStartedAtMillis() == that.getStartedAtMillis()
296327
&& getNodeSeen() == that.getNodeSeen()
297328
&& getNodeId().equals(that.getNodeId())
329+
&& Objects.equals(getNodeEphemeralId(), that.getNodeEphemeralId())
298330
&& getType() == that.getType()
299331
&& getReason().equals(that.getReason())
300332
&& Objects.equals(getAllocationDelay(), that.getAllocationDelay())
@@ -306,6 +338,7 @@ && getReason().equals(that.getReason())
306338
public int hashCode() {
307339
return Objects.hash(
308340
getNodeId(),
341+
getNodeEphemeralId(),
309342
getType(),
310343
getReason(),
311344
getStartedAtMillis(),
@@ -322,6 +355,8 @@ public String toString() {
322355
stringBuilder.append("{")
323356
.append("nodeId=[")
324357
.append(nodeId)
358+
.append("], nodeEphemeralId=[")
359+
.append(nodeEphemeralId)
325360
.append(']')
326361
.append(", type=[")
327362
.append(type)
@@ -350,6 +385,7 @@ public static Builder builder(SingleNodeShutdownMetadata original) {
350385
return builder();
351386
}
352387
return new Builder().setNodeId(original.getNodeId())
388+
.setNodeEphemeralId(original.getNodeEphemeralId())
353389
.setType(original.getType())
354390
.setReason(original.getReason())
355391
.setStartedAtMillis(original.getStartedAtMillis())
@@ -359,6 +395,7 @@ public static Builder builder(SingleNodeShutdownMetadata original) {
359395

360396
public static class Builder {
361397
private String nodeId;
398+
private String nodeEphemeralId;
362399
private Type type;
363400
private String reason;
364401
private long startedAtMillis = -1;
@@ -378,6 +415,15 @@ public Builder setNodeId(String nodeId) {
378415
return this;
379416
}
380417

418+
/**
419+
* @param nodeEphemeralId The node ephemeral ID this metadata refers to.
420+
* @return This builder.
421+
*/
422+
public Builder setNodeEphemeralId(String nodeEphemeralId) {
423+
this.nodeEphemeralId = nodeEphemeralId;
424+
return this;
425+
}
426+
381427
/**
382428
* @param type The type of shutdown.
383429
* @return This builder.
@@ -444,6 +490,7 @@ public SingleNodeShutdownMetadata build() {
444490

445491
return new SingleNodeShutdownMetadata(
446492
nodeId,
493+
nodeEphemeralId,
447494
type,
448495
reason,
449496
startedAtMillis,

server/src/test/java/org/elasticsearch/cluster/metadata/NodesShutdownMetadataTests.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ public void testIsNodeShuttingDown() {
7878
"this_node",
7979
SingleNodeShutdownMetadata.builder()
8080
.setNodeId("this_node")
81+
.setNodeEphemeralId("this_node")
8182
.setReason("shutdown for a unit test")
8283
.setType(type)
8384
.setStartedAtMillis(randomNonNegativeLong())
@@ -106,6 +107,7 @@ public void testIsNodeShuttingDown() {
106107
public void testSigtermIsRemoveInOlderVersions() throws IOException {
107108
SingleNodeShutdownMetadata metadata = SingleNodeShutdownMetadata.builder()
108109
.setNodeId("myid")
110+
.setNodeEphemeralId("myid")
109111
.setType(SingleNodeShutdownMetadata.Type.SIGTERM)
110112
.setReason("myReason")
111113
.setStartedAtMillis(0L)
@@ -127,6 +129,7 @@ public void testIsNodeMarkedForRemoval() {
127129
SingleNodeShutdownMetadata.Type type;
128130
SingleNodeShutdownMetadata.Builder builder = SingleNodeShutdownMetadata.builder()
129131
.setNodeId("thenode")
132+
.setNodeEphemeralId("thenode")
130133
.setReason("myReason")
131134
.setStartedAtMillis(0L);
132135
switch (type = randomFrom(SingleNodeShutdownMetadata.Type.values())) {
@@ -182,6 +185,7 @@ private SingleNodeShutdownMetadata randomNodeShutdownInfo() {
182185
final SingleNodeShutdownMetadata.Type type = randomFrom(SingleNodeShutdownMetadata.Type.values());
183186
final SingleNodeShutdownMetadata.Builder builder = SingleNodeShutdownMetadata.builder()
184187
.setNodeId(randomAlphaOfLength(5))
188+
.setNodeEphemeralId(randomAlphaOfLength(5))
185189
.setType(type)
186190
.setReason(randomAlphaOfLength(5))
187191
.setStartedAtMillis(randomNonNegativeLong());

server/src/test/java/org/elasticsearch/cluster/routing/UnassignedInfoTests.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,7 @@ public void testRemainingDelayCalculationsWithUnrelatedShutdowns() {
638638
shutdowns = shutdowns.putSingleNodeMetadata(
639639
SingleNodeShutdownMetadata.builder()
640640
.setNodeId(randomValueOtherThan(lastNodeId, () -> randomAlphaOfLengthBetween(5, 10)))
641+
.setNodeEphemeralId(randomValueOtherThan(lastNodeId, () -> randomAlphaOfLengthBetween(5, 10)))
641642
.setReason(this.getTestName())
642643
.setStartedAtMillis(randomNonNegativeLong())
643644
.setType(type)
@@ -658,6 +659,7 @@ public void testRemainingDelayCalculationWhenNodeIsShuttingDownForRemoval() {
658659
NodesShutdownMetadata shutdowns = NodesShutdownMetadata.EMPTY.putSingleNodeMetadata(
659660
SingleNodeShutdownMetadata.builder()
660661
.setNodeId(lastNodeId)
662+
.setNodeEphemeralId(lastNodeId)
661663
.setReason(this.getTestName())
662664
.setStartedAtMillis(randomNonNegativeLong())
663665
.setType(type)
@@ -678,6 +680,7 @@ public void testRemainingDelayCalculationWhenNodeIsKnownToBeRestartingWithCustom
678680
NodesShutdownMetadata shutdowns = NodesShutdownMetadata.EMPTY.putSingleNodeMetadata(
679681
SingleNodeShutdownMetadata.builder()
680682
.setNodeId(lastNodeId)
683+
.setNodeEphemeralId(lastNodeId)
681684
.setReason(this.getTestName())
682685
.setStartedAtMillis(randomNonNegativeLong())
683686
.setType(SingleNodeShutdownMetadata.Type.RESTART)
@@ -700,6 +703,7 @@ public void testRemainingDelayCalculationWhenNodeIsKnownToBeRestartingWithDefaul
700703
NodesShutdownMetadata shutdowns = NodesShutdownMetadata.EMPTY.putSingleNodeMetadata(
701704
SingleNodeShutdownMetadata.builder()
702705
.setNodeId(lastNodeId)
706+
.setNodeEphemeralId(lastNodeId)
703707
.setReason(this.getTestName())
704708
.setStartedAtMillis(randomNonNegativeLong())
705709
.setType(SingleNodeShutdownMetadata.Type.RESTART)
@@ -728,6 +732,7 @@ public void testRemainingDelayUsesIndexLevelDelayIfNodeWasNotRestartingWhenShard
728732
NodesShutdownMetadata shutdowns = NodesShutdownMetadata.EMPTY.putSingleNodeMetadata(
729733
SingleNodeShutdownMetadata.builder()
730734
.setNodeId(lastNodeId)
735+
.setNodeEphemeralId(lastNodeId)
731736
.setReason(this.getTestName())
732737
.setStartedAtMillis(randomNonNegativeLong())
733738
.setType(SingleNodeShutdownMetadata.Type.RESTART)

server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorTests.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -909,6 +909,7 @@ protected void updateIndicesReadOnly(Set<String> indicesToUpdate, Releasable onC
909909
sourceNode,
910910
SingleNodeShutdownMetadata.builder()
911911
.setNodeId(sourceNode)
912+
.setNodeEphemeralId(sourceNode)
912913
.setReason("testing")
913914
.setType(SingleNodeShutdownMetadata.Type.REPLACE)
914915
.setTargetNodeName(targetNode)
@@ -1293,6 +1294,7 @@ public void testSkipDiskThresholdMonitorWhenStateNotRecovered() {
12931294
"node1",
12941295
SingleNodeShutdownMetadata.builder()
12951296
.setNodeId("node1")
1297+
.setNodeEphemeralId("node1")
12961298
.setReason("testing")
12971299
.setType(SingleNodeShutdownMetadata.Type.REPLACE)
12981300
.setTargetNodeName("node3")

server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -922,6 +922,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingAllocation allocat
922922
"node-0",
923923
SingleNodeShutdownMetadata.builder()
924924
.setNodeId("node-0")
925+
.setNodeEphemeralId("node-0")
925926
.setType(SingleNodeShutdownMetadata.Type.REPLACE)
926927
.setTargetNodeName("node-2")
927928
.setStartedAtMillis(System.currentTimeMillis())
@@ -1280,6 +1281,7 @@ public void testShouldLogOnTooManyUndesiredAllocations() {
12801281
var builder = SingleNodeShutdownMetadata.builder()
12811282
.setType(type)
12821283
.setNodeId("data-node-1")
1284+
.setNodeEphemeralId("data-node-1")
12831285
.setStartedAtMillis(randomNonNegativeLong())
12841286
.setReason("test");
12851287
if (type.equals(SingleNodeShutdownMetadata.Type.SIGTERM)) {

server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceShardsAllocatorTests.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -868,6 +868,7 @@ public void resetDesiredBalance() {
868868
final var shutdownType = randomFrom(Type.SIGTERM, Type.REMOVE, Type.REPLACE);
869869
final var singleShutdownMetadataBuilder = SingleNodeShutdownMetadata.builder()
870870
.setNodeId(node2.getId())
871+
.setNodeEphemeralId(node2.getEphemeralId())
871872
.setReason("test")
872873
.setType(shutdownType)
873874
.setStartedAtMillis(randomNonNegativeLong());

0 commit comments

Comments
 (0)