Skip to content

Commit 496959a

Browse files
committed
Don't update registration status if node state for decommissioned peer is found with the same address
Patch by Sam Tunnicliffe; reviewed by Marcus Eriksson for CASSANDRA-21005
1 parent ec7794f commit 496959a

File tree

4 files changed

+68
-1
lines changed

4 files changed

+68
-1
lines changed

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
5.1
2+
* Don't update registration status if node state for decommissioned peer is found with the same address (CASSANDRA-21005)
23
* Avoid NPE when meta keyspace placements are empty before CMS is initialized (CASSANDRA-21004)
34
* Gossip entries for hibernating non-members don't block truncate (CASSANDRA-21003)
45
* Retry without time limit calculates wait time incorrectly (CASSANDRA-21002)

src/java/org/apache/cassandra/service/CassandraDaemon.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@
9090
import org.apache.cassandra.streaming.StreamManager;
9191
import org.apache.cassandra.tcm.ClusterMetadata;
9292
import org.apache.cassandra.tcm.MultiStepOperation;
93+
import org.apache.cassandra.tcm.membership.NodeState;
9394
import org.apache.cassandra.utils.FBUtilities;
9495
import org.apache.cassandra.utils.JMXServerUtils;
9596
import org.apache.cassandra.utils.JVMStabilityInspector;
@@ -278,7 +279,12 @@ protected void setup()
278279
disableAutoCompaction(Schema.instance.distributedKeyspaces().names());
279280
CMSOperations.initJmx();
280281
AccordOperations.initJmx();
281-
if (ClusterMetadata.current().myNodeId() != null)
282+
NodeState nodeStateForLocalAddress = ClusterMetadata.current().myNodeState();
283+
// If another node with the same address was previously a member and was decommissioned, it can be
284+
// present in ClusterMetadata with a LEFT state. That should not trigger _this_ node to update
285+
// RegistrationStatus. During the startup process the old node will be expunged and this node
286+
// will register, prompting another call to onRegistration.
287+
if (nodeStateForLocalAddress != null && nodeStateForLocalAddress != NodeState.LEFT)
282288
RegistrationStatus.instance.onRegistration();
283289
}
284290
catch (InterruptedException | ExecutionException | IOException e)

test/distributed/org/apache/cassandra/distributed/impl/AbstractCluster.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -697,6 +697,17 @@ public I bootstrap(IInstanceConfig config, Versions.Version version)
697697
return instance;
698698
}
699699

700+
public synchronized void unsafeRemoveNode(I toRemove)
701+
{
702+
instances.remove(toRemove);
703+
instanceMap.remove(toRemove.broadcastAddress(), toRemove);
704+
}
705+
706+
public void unsafeUpdateNodeIdTopology(int num, NetworkTopology.DcAndRack location)
707+
{
708+
nodeIdTopology.put(num, location);
709+
}
710+
700711
/**
701712
* WARNING: we index from 1 here, for consistency with inet address!
702713
*/

test/distributed/org/apache/cassandra/distributed/test/ring/DecommissionTest.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,19 @@
3434
import net.bytebuddy.dynamic.loading.ClassLoadingStrategy;
3535
import net.bytebuddy.implementation.MethodDelegation;
3636
import net.bytebuddy.implementation.bind.annotation.SuperCall;
37+
import org.apache.cassandra.config.DatabaseDescriptor;
3738
import org.apache.cassandra.cql3.QueryProcessor;
3839
import org.apache.cassandra.distributed.Cluster;
40+
import org.apache.cassandra.distributed.Constants;
3941
import org.apache.cassandra.distributed.api.ConsistencyLevel;
42+
import org.apache.cassandra.distributed.api.IInstanceConfig;
43+
import org.apache.cassandra.distributed.api.IInvokableInstance;
4044
import org.apache.cassandra.distributed.api.NodeToolResult;
45+
import org.apache.cassandra.distributed.api.TokenSupplier;
4146
import org.apache.cassandra.distributed.shared.ClusterUtils;
4247
import org.apache.cassandra.distributed.test.TestBaseImpl;
4348
import org.apache.cassandra.gms.Gossiper;
49+
import org.apache.cassandra.io.util.File;
4450
import org.apache.cassandra.service.StorageService;
4551
import org.apache.cassandra.streaming.StreamSession;
4652
import org.apache.cassandra.tcm.ClusterMetadata;
@@ -55,6 +61,8 @@
5561
import static net.bytebuddy.matcher.ElementMatchers.named;
5662
import static org.apache.cassandra.distributed.api.Feature.GOSSIP;
5763
import static org.apache.cassandra.distributed.api.Feature.NETWORK;
64+
import static org.apache.cassandra.distributed.shared.NetworkTopology.dcAndRack;
65+
import static org.apache.cassandra.distributed.shared.NetworkTopology.networkTopology;
5866
import static org.apache.cassandra.distributed.test.ring.BootstrapTest.populate;
5967
import static org.junit.Assert.assertEquals;
6068
import static org.junit.Assert.assertTrue;
@@ -75,6 +83,47 @@ public void testResumableDecom() throws IOException
7583
}
7684
}
7785

86+
@Test
87+
public void testAddressReuseAfterDecommission() throws IOException, ExecutionException, InterruptedException
88+
{
89+
// Initially, all nodes should be in dc1/rack1. Node 3 will be decommissioned and a new node added re-using
90+
// node 3's address. When the new node registers, it should be in dc2/rack2.
91+
// For now, this requires the accord service to disabled. See CASSANDRA-21026
92+
try (Cluster cluster = builder().withNodes(3)
93+
.withTokenSupplier(TokenSupplier.evenlyDistributedTokens(4))
94+
.withConfig(config -> config.with(NETWORK, GOSSIP)
95+
.set("accord.enabled", false))
96+
.withNodeIdTopology(networkTopology(3, (id) -> dcAndRack("dc1", "rack1")))
97+
.start())
98+
{
99+
assertEquals("dc1/rack1", cluster.get(1).callOnInstance(() -> DatabaseDescriptor.getLocator().local().toString()));
100+
assertEquals("dc1/rack1", cluster.get(2).callOnInstance(() -> DatabaseDescriptor.getLocator().local().toString()));
101+
assertEquals("dc1/rack1", cluster.get(3).callOnInstance(() -> DatabaseDescriptor.getLocator().local().toString()));
102+
103+
IInvokableInstance toRemove = cluster.get(3);
104+
toRemove.nodetoolResult("decommission", "--force").asserts().success();
105+
toRemove.shutdown().get();
106+
ClusterUtils.getDirectories(toRemove).forEach(File::tryDeleteRecursive);
107+
cluster.unsafeRemoveNode(toRemove);
108+
109+
// Now add a new node, using the same address as the one we just removed. This new node should register
110+
// itself in dc2/rack2 and not inherit the location of its predecessor.
111+
// Note: because we have removed the original node3 from the cluster completely, which is necessary because
112+
// the cluster will complain about an id clash otherwise, this new node will also be "node3". However, it is
113+
// completely distinct from the original one.
114+
cluster.unsafeUpdateNodeIdTopology(toRemove.config().num(), dcAndRack("dc2", "rack2"));
115+
IInstanceConfig config = cluster.newInstanceConfig()
116+
.set("auto_bootstrap", true)
117+
.set(Constants.KEY_DTEST_FULL_STARTUP, true);
118+
IInvokableInstance newInstance = cluster.bootstrap(config);
119+
newInstance.startup();
120+
121+
assertEquals("dc1/rack1", cluster.get(1).callOnInstance(() -> DatabaseDescriptor.getLocator().local().toString()));
122+
assertEquals("dc1/rack1", cluster.get(2).callOnInstance(() -> DatabaseDescriptor.getLocator().local().toString()));
123+
assertEquals("dc2/rack2", newInstance.callOnInstance(() -> DatabaseDescriptor.getLocator().local().toString()));
124+
}
125+
}
126+
78127
public static class BB
79128
{
80129
static void install(ClassLoader cl, int nodeNumber)

0 commit comments

Comments
 (0)