Skip to content

Commit 943f88f

Browse files
rjernstidegtiarenko
authored andcommitted
Add ephemeral node id to shutdown metadata (#118722)
Shutdown metadata is keyed on node id. This makes sense since only one node with a given node id can exist within a cluster. However, it is possible that shutdown was initiated for one instance of a node, but that node is restarted. This commit adds the ephemeral node id to shutdown metadata so that nodes with the same id but different ephemeral id can be distinguished. (cherry picked from commit 7fb6ca4)
1 parent bb66741 commit 943f88f

File tree

27 files changed

+149
-56
lines changed

27 files changed

+149
-56
lines changed

docs/reference/shutdown/apis/shutdown-get.asciidoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ including the status of shard migration, task migration, and plugin cleanup:
7878
"nodes": [
7979
{
8080
"node_id": "USpTGYaBSIKbgSUJR2Z9lg",
81+
"node_ephemeral_id": null,
8182
"type": "RESTART",
8283
"reason": "Demonstrating how the node shutdown API works",
8384
"shutdown_startedmillis": 1624406108685,

server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -502,11 +502,16 @@ private static void putShutdownMetadata(
502502
clusterService.submitUnbatchedStateUpdateTask("mark node for removal", new ClusterStateUpdateTask() {
503503
@Override
504504
public ClusterState execute(ClusterState currentState) {
505-
final var nodeId = currentState.nodes().resolveNode(nodeName).getId();
505+
final var node = currentState.nodes().resolveNode(nodeName);
506506
return currentState.copyAndUpdateMetadata(
507507
mdb -> mdb.putCustom(
508508
NodesShutdownMetadata.TYPE,
509-
new NodesShutdownMetadata(Map.of(nodeId, shutdownMetadataBuilder.setNodeId(nodeId).build()))
509+
new NodesShutdownMetadata(
510+
Map.of(
511+
node.getId(),
512+
shutdownMetadataBuilder.setNodeId(node.getId()).setNodeEphemeralId(node.getEphemeralId()).build()
513+
)
514+
)
510515
)
511516
);
512517
}

server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStressTestsIT.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1235,15 +1235,16 @@ public ClusterState execute(ClusterState currentState) {
12351235
Strings.toString(currentState),
12361236
currentState.metadata().nodeShutdowns().getAll().isEmpty()
12371237
);
1238-
final var nodeId = currentState.nodes().resolveNode(node.nodeName).getId();
1238+
final var discoveryNode = currentState.nodes().resolveNode(node.nodeName);
12391239
return currentState.copyAndUpdateMetadata(
12401240
mdb -> mdb.putCustom(
12411241
NodesShutdownMetadata.TYPE,
12421242
new NodesShutdownMetadata(
12431243
Map.of(
1244-
nodeId,
1244+
discoveryNode.getId(),
12451245
SingleNodeShutdownMetadata.builder()
1246-
.setNodeId(nodeId)
1246+
.setNodeId(discoveryNode.getId())
1247+
.setNodeEphemeralId(discoveryNode.getEphemeralId())
12471248
.setType(SingleNodeShutdownMetadata.Type.REMOVE)
12481249
.setStartedAtMillis(clusterService.threadPool().absoluteTimeInMillis())
12491250
.setReason("test")

server/src/main/java/org/elasticsearch/cluster/metadata/SingleNodeShutdownMetadata.java

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import java.util.Locale;
2929
import java.util.Objects;
3030

31+
import static org.elasticsearch.TransportVersions.NODE_SHUTDOWN_EPHEMERAL_ID_ADDED;
3132
import static org.elasticsearch.core.Strings.format;
3233

3334
/**
@@ -40,6 +41,7 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
4041
public static final TransportVersion GRACE_PERIOD_ADDED_VERSION = TransportVersions.V_8_9_X;
4142

4243
public static final ParseField NODE_ID_FIELD = new ParseField("node_id");
44+
public static final ParseField NODE_EPHEMERAL_ID_FIELD = new ParseField("node_ephemeral_id");
4345
public static final ParseField TYPE_FIELD = new ParseField("type");
4446
public static final ParseField REASON_FIELD = new ParseField("reason");
4547
public static final String STARTED_AT_READABLE_FIELD = "shutdown_started";
@@ -53,18 +55,25 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
5355
"node_shutdown_info",
5456
a -> new SingleNodeShutdownMetadata(
5557
(String) a[0],
56-
Type.valueOf((String) a[1]),
57-
(String) a[2],
58-
(long) a[3],
59-
(boolean) a[4],
60-
(TimeValue) a[5],
61-
(String) a[6],
62-
(TimeValue) a[7]
58+
(String) a[1],
59+
Type.valueOf((String) a[2]),
60+
(String) a[3],
61+
(long) a[4],
62+
(boolean) a[5],
63+
(TimeValue) a[6],
64+
(String) a[7],
65+
(TimeValue) a[8]
6366
)
6467
);
6568

6669
static {
6770
PARSER.declareString(ConstructingObjectParser.constructorArg(), NODE_ID_FIELD);
71+
PARSER.declareField(
72+
ConstructingObjectParser.optionalConstructorArg(),
73+
(p, c) -> p.textOrNull(),
74+
NODE_EPHEMERAL_ID_FIELD,
75+
ObjectParser.ValueType.STRING_OR_NULL
76+
);
6877
PARSER.declareString(ConstructingObjectParser.constructorArg(), TYPE_FIELD);
6978
PARSER.declareString(ConstructingObjectParser.constructorArg(), REASON_FIELD);
7079
PARSER.declareLong(ConstructingObjectParser.constructorArg(), STARTED_AT_MILLIS_FIELD);
@@ -91,6 +100,8 @@ public static SingleNodeShutdownMetadata parse(XContentParser parser) {
91100
public static final TimeValue DEFAULT_RESTART_SHARD_ALLOCATION_DELAY = TimeValue.timeValueMinutes(5);
92101

93102
private final String nodeId;
103+
@Nullable
104+
private final String nodeEphemeralId;
94105
private final Type type;
95106
private final String reason;
96107
private final long startedAtMillis;
@@ -110,6 +121,7 @@ public static SingleNodeShutdownMetadata parse(XContentParser parser) {
110121
*/
111122
private SingleNodeShutdownMetadata(
112123
String nodeId,
124+
@Nullable String nodeEphemeralId,
113125
Type type,
114126
String reason,
115127
long startedAtMillis,
@@ -119,6 +131,7 @@ private SingleNodeShutdownMetadata(
119131
@Nullable TimeValue gracePeriod
120132
) {
121133
this.nodeId = Objects.requireNonNull(nodeId, "node ID must not be null");
134+
this.nodeEphemeralId = nodeEphemeralId;
122135
this.type = Objects.requireNonNull(type, "shutdown type must not be null");
123136
this.reason = Objects.requireNonNull(reason, "shutdown reason must not be null");
124137
this.startedAtMillis = startedAtMillis;
@@ -157,6 +170,11 @@ private SingleNodeShutdownMetadata(
157170

158171
public SingleNodeShutdownMetadata(StreamInput in) throws IOException {
159172
this.nodeId = in.readString();
173+
if (in.getTransportVersion().onOrAfter(NODE_SHUTDOWN_EPHEMERAL_ID_ADDED)) {
174+
this.nodeEphemeralId = in.readOptionalString();
175+
} else {
176+
this.nodeEphemeralId = null; // empty when talking to old nodes, meaning the persistent node id is the only differentiator
177+
}
160178
this.type = in.readEnum(Type.class);
161179
this.reason = in.readString();
162180
this.startedAtMillis = in.readVLong();
@@ -181,6 +199,15 @@ public String getNodeId() {
181199
return nodeId;
182200
}
183201

202+
/**
203+
* @return The ephemeral ID of the node this {@link SingleNodeShutdownMetadata} concerns, or
204+
* {@code null} if the ephemeral id is unknown.
205+
*/
206+
@Nullable
207+
public String getNodeEphemeralId() {
208+
return nodeEphemeralId;
209+
}
210+
184211
/**
185212
* @return The type of shutdown this is (shutdown vs. permanent).
186213
*/
@@ -241,6 +268,9 @@ public TimeValue getGracePeriod() {
241268
@Override
242269
public void writeTo(StreamOutput out) throws IOException {
243270
out.writeString(nodeId);
271+
if (out.getTransportVersion().onOrAfter(NODE_SHUTDOWN_EPHEMERAL_ID_ADDED)) {
272+
out.writeOptionalString(nodeEphemeralId);
273+
}
244274
if ((out.getTransportVersion().before(REPLACE_SHUTDOWN_TYPE_ADDED_VERSION) && this.type == SingleNodeShutdownMetadata.Type.REPLACE)
245275
|| (out.getTransportVersion().before(SIGTERM_ADDED_VERSION) && this.type == Type.SIGTERM)) {
246276
out.writeEnum(SingleNodeShutdownMetadata.Type.REMOVE);
@@ -264,6 +294,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
264294
builder.startObject();
265295
{
266296
builder.field(NODE_ID_FIELD.getPreferredName(), nodeId);
297+
builder.field(NODE_EPHEMERAL_ID_FIELD.getPreferredName(), nodeEphemeralId);
267298
builder.field(TYPE_FIELD.getPreferredName(), type);
268299
builder.field(REASON_FIELD.getPreferredName(), reason);
269300
builder.timestampFieldsFromUnixEpochMillis(
@@ -295,6 +326,7 @@ public boolean equals(Object o) {
295326
return getStartedAtMillis() == that.getStartedAtMillis()
296327
&& getNodeSeen() == that.getNodeSeen()
297328
&& getNodeId().equals(that.getNodeId())
329+
&& Objects.equals(getNodeEphemeralId(), that.getNodeEphemeralId())
298330
&& getType() == that.getType()
299331
&& getReason().equals(that.getReason())
300332
&& Objects.equals(getAllocationDelay(), that.getAllocationDelay())
@@ -306,6 +338,7 @@ && getReason().equals(that.getReason())
306338
public int hashCode() {
307339
return Objects.hash(
308340
getNodeId(),
341+
getNodeEphemeralId(),
309342
getType(),
310343
getReason(),
311344
getStartedAtMillis(),
@@ -322,6 +355,8 @@ public String toString() {
322355
stringBuilder.append("{")
323356
.append("nodeId=[")
324357
.append(nodeId)
358+
.append("], nodeEphemeralId=[")
359+
.append(nodeEphemeralId)
325360
.append(']')
326361
.append(", type=[")
327362
.append(type)
@@ -350,6 +385,7 @@ public static Builder builder(SingleNodeShutdownMetadata original) {
350385
return builder();
351386
}
352387
return new Builder().setNodeId(original.getNodeId())
388+
.setNodeEphemeralId(original.getNodeEphemeralId())
353389
.setType(original.getType())
354390
.setReason(original.getReason())
355391
.setStartedAtMillis(original.getStartedAtMillis())
@@ -359,6 +395,7 @@ public static Builder builder(SingleNodeShutdownMetadata original) {
359395

360396
public static class Builder {
361397
private String nodeId;
398+
private String nodeEphemeralId;
362399
private Type type;
363400
private String reason;
364401
private long startedAtMillis = -1;
@@ -378,6 +415,15 @@ public Builder setNodeId(String nodeId) {
378415
return this;
379416
}
380417

418+
/**
419+
* @param nodeEphemeralId The node ephemeral ID this metadata refers to.
420+
* @return This builder.
421+
*/
422+
public Builder setNodeEphemeralId(String nodeEphemeralId) {
423+
this.nodeEphemeralId = nodeEphemeralId;
424+
return this;
425+
}
426+
381427
/**
382428
* @param type The type of shutdown.
383429
* @return This builder.
@@ -444,6 +490,7 @@ public SingleNodeShutdownMetadata build() {
444490

445491
return new SingleNodeShutdownMetadata(
446492
nodeId,
493+
nodeEphemeralId,
447494
type,
448495
reason,
449496
startedAtMillis,

server/src/test/java/org/elasticsearch/cluster/metadata/NodesShutdownMetadataTests.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ public void testIsNodeShuttingDown() {
7878
"this_node",
7979
SingleNodeShutdownMetadata.builder()
8080
.setNodeId("this_node")
81+
.setNodeEphemeralId("this_node")
8182
.setReason("shutdown for a unit test")
8283
.setType(type)
8384
.setStartedAtMillis(randomNonNegativeLong())
@@ -106,6 +107,7 @@ public void testIsNodeShuttingDown() {
106107
public void testSigtermIsRemoveInOlderVersions() throws IOException {
107108
SingleNodeShutdownMetadata metadata = SingleNodeShutdownMetadata.builder()
108109
.setNodeId("myid")
110+
.setNodeEphemeralId("myid")
109111
.setType(SingleNodeShutdownMetadata.Type.SIGTERM)
110112
.setReason("myReason")
111113
.setStartedAtMillis(0L)
@@ -127,6 +129,7 @@ public void testIsNodeMarkedForRemoval() {
127129
SingleNodeShutdownMetadata.Type type;
128130
SingleNodeShutdownMetadata.Builder builder = SingleNodeShutdownMetadata.builder()
129131
.setNodeId("thenode")
132+
.setNodeEphemeralId("thenode")
130133
.setReason("myReason")
131134
.setStartedAtMillis(0L);
132135
switch (type = randomFrom(SingleNodeShutdownMetadata.Type.values())) {
@@ -182,6 +185,7 @@ private SingleNodeShutdownMetadata randomNodeShutdownInfo() {
182185
final SingleNodeShutdownMetadata.Type type = randomFrom(SingleNodeShutdownMetadata.Type.values());
183186
final SingleNodeShutdownMetadata.Builder builder = SingleNodeShutdownMetadata.builder()
184187
.setNodeId(randomAlphaOfLength(5))
188+
.setNodeEphemeralId(randomAlphaOfLength(5))
185189
.setType(type)
186190
.setReason(randomAlphaOfLength(5))
187191
.setStartedAtMillis(randomNonNegativeLong());

server/src/test/java/org/elasticsearch/cluster/routing/UnassignedInfoTests.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,7 @@ public void testRemainingDelayCalculationsWithUnrelatedShutdowns() {
638638
shutdowns = shutdowns.putSingleNodeMetadata(
639639
SingleNodeShutdownMetadata.builder()
640640
.setNodeId(randomValueOtherThan(lastNodeId, () -> randomAlphaOfLengthBetween(5, 10)))
641+
.setNodeEphemeralId(randomValueOtherThan(lastNodeId, () -> randomAlphaOfLengthBetween(5, 10)))
641642
.setReason(this.getTestName())
642643
.setStartedAtMillis(randomNonNegativeLong())
643644
.setType(type)
@@ -658,6 +659,7 @@ public void testRemainingDelayCalculationWhenNodeIsShuttingDownForRemoval() {
658659
NodesShutdownMetadata shutdowns = NodesShutdownMetadata.EMPTY.putSingleNodeMetadata(
659660
SingleNodeShutdownMetadata.builder()
660661
.setNodeId(lastNodeId)
662+
.setNodeEphemeralId(lastNodeId)
661663
.setReason(this.getTestName())
662664
.setStartedAtMillis(randomNonNegativeLong())
663665
.setType(type)
@@ -678,6 +680,7 @@ public void testRemainingDelayCalculationWhenNodeIsKnownToBeRestartingWithCustom
678680
NodesShutdownMetadata shutdowns = NodesShutdownMetadata.EMPTY.putSingleNodeMetadata(
679681
SingleNodeShutdownMetadata.builder()
680682
.setNodeId(lastNodeId)
683+
.setNodeEphemeralId(lastNodeId)
681684
.setReason(this.getTestName())
682685
.setStartedAtMillis(randomNonNegativeLong())
683686
.setType(SingleNodeShutdownMetadata.Type.RESTART)
@@ -700,6 +703,7 @@ public void testRemainingDelayCalculationWhenNodeIsKnownToBeRestartingWithDefaul
700703
NodesShutdownMetadata shutdowns = NodesShutdownMetadata.EMPTY.putSingleNodeMetadata(
701704
SingleNodeShutdownMetadata.builder()
702705
.setNodeId(lastNodeId)
706+
.setNodeEphemeralId(lastNodeId)
703707
.setReason(this.getTestName())
704708
.setStartedAtMillis(randomNonNegativeLong())
705709
.setType(SingleNodeShutdownMetadata.Type.RESTART)
@@ -728,6 +732,7 @@ public void testRemainingDelayUsesIndexLevelDelayIfNodeWasNotRestartingWhenShard
728732
NodesShutdownMetadata shutdowns = NodesShutdownMetadata.EMPTY.putSingleNodeMetadata(
729733
SingleNodeShutdownMetadata.builder()
730734
.setNodeId(lastNodeId)
735+
.setNodeEphemeralId(lastNodeId)
731736
.setReason(this.getTestName())
732737
.setStartedAtMillis(randomNonNegativeLong())
733738
.setType(SingleNodeShutdownMetadata.Type.RESTART)

server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorTests.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -909,6 +909,7 @@ protected void updateIndicesReadOnly(Set<String> indicesToUpdate, Releasable onC
909909
sourceNode,
910910
SingleNodeShutdownMetadata.builder()
911911
.setNodeId(sourceNode)
912+
.setNodeEphemeralId(sourceNode)
912913
.setReason("testing")
913914
.setType(SingleNodeShutdownMetadata.Type.REPLACE)
914915
.setTargetNodeName(targetNode)
@@ -1293,6 +1294,7 @@ public void testSkipDiskThresholdMonitorWhenStateNotRecovered() {
12931294
"node1",
12941295
SingleNodeShutdownMetadata.builder()
12951296
.setNodeId("node1")
1297+
.setNodeEphemeralId("node1")
12961298
.setReason("testing")
12971299
.setType(SingleNodeShutdownMetadata.Type.REPLACE)
12981300
.setTargetNodeName("node3")

server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -923,6 +923,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingAllocation allocat
923923
"node-0",
924924
SingleNodeShutdownMetadata.builder()
925925
.setNodeId("node-0")
926+
.setNodeEphemeralId("node-0")
926927
.setType(SingleNodeShutdownMetadata.Type.REPLACE)
927928
.setTargetNodeName("node-2")
928929
.setStartedAtMillis(System.currentTimeMillis())

server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceShardsAllocatorTests.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,7 @@ public void resetDesiredBalance() {
857857
final var shutdownType = randomFrom(Type.SIGTERM, Type.REMOVE, Type.REPLACE);
858858
final var singleShutdownMetadataBuilder = SingleNodeShutdownMetadata.builder()
859859
.setNodeId(node2.getId())
860+
.setNodeEphemeralId(node2.getEphemeralId())
860861
.setReason("test")
861862
.setType(shutdownType)
862863
.setStartedAtMillis(randomNonNegativeLong());

server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/NodeReplacementAllocationDeciderTests.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,7 @@ private NodesShutdownMetadata createNodeShutdownReplacementMetadata(String sourc
420420
return new NodesShutdownMetadata(new HashMap<>()).putSingleNodeMetadata(
421421
SingleNodeShutdownMetadata.builder()
422422
.setNodeId(sourceNodeId)
423+
.setNodeEphemeralId(sourceNodeId)
423424
.setTargetNodeName(targetNodeName)
424425
.setType(SingleNodeShutdownMetadata.Type.REPLACE)
425426
.setReason(this.getTestName())

0 commit comments

Comments
 (0)