CNDB-poc-reader: customize replica plan iterator for topk

jasonstack · jasonstack · commit a9f93dc3cc4e · 2025-05-07T13:26:23.000+08:00
diff --git a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java
@@ -321,6 +321,8 @@ public enum CassandraRelevantProperties
     /** Which class to use for token metadata provider */
     CUSTOM_TMD_PROVIDER_PROPERTY("cassandra.custom_token_metadata_provider_class"),
 
+    CUSTOM_REPLICA_PLAN_ITERATOR_PROVIDER_PROPERTY("cassandra.custom_replica_plan_iterator_provider_class"),
+
     /** Which class to use for failure detection */
     CUSTOM_FAILURE_DETECTOR_PROPERTY("cassandra.custom_failure_detector_class"),
 
diff --git a/src/java/org/apache/cassandra/locator/ReplicaLayout.java b/src/java/org/apache/cassandra/locator/ReplicaLayout.java
@@ -341,7 +341,7 @@ static ReplicaLayout.ForTokenRead forTokenReadLiveSorted(AbstractReplicationStra
      * @return the read layout for a range - this includes only live natural replicas, i.e. those that are not pending
      * and not marked down by the failure detector. these are reverse sorted by the badness score of the configured snitch
      */
-    static ReplicaLayout.ForRangeRead forRangeReadLiveSorted(AbstractReplicationStrategy replicationStrategy, AbstractBounds<PartitionPosition> range)
+    public static ReplicaLayout.ForRangeRead forRangeReadLiveSorted(AbstractReplicationStrategy replicationStrategy, AbstractBounds<PartitionPosition> range)
     {
         EndpointsForRange replicas = replicationStrategy.getNaturalReplicas(range.right);
         replicas = DatabaseDescriptor.getEndpointSnitch().sortedByProximity(FBUtilities.getBroadcastAddressAndPort(), replicas);
diff --git a/src/java/org/apache/cassandra/locator/ReplicaPlans.java b/src/java/org/apache/cassandra/locator/ReplicaPlans.java
@@ -138,7 +138,7 @@ public static boolean isSufficientLiveReplicasForRead(AbstractReplicationStrateg
         }
     }
 
-    static void assureSufficientLiveReplicasForRead(AbstractReplicationStrategy replicationStrategy, ConsistencyLevel consistencyLevel, Endpoints<?> liveReplicas) throws UnavailableException
+    public static void assureSufficientLiveReplicasForRead(AbstractReplicationStrategy replicationStrategy, ConsistencyLevel consistencyLevel, Endpoints<?> liveReplicas) throws UnavailableException
     {
         assureSufficientLiveReplicas(replicationStrategy, consistencyLevel, liveReplicas, consistencyLevel.blockFor(replicationStrategy), 1);
     }
@@ -659,7 +659,7 @@ public static ReplicaPlan.ForPaxosWrite forPaxos(Keyspace keyspace, DecoratedKey
     }
 
 
-    private static <E extends Endpoints<E>> E candidatesForRead(Keyspace keyspace, Index.QueryPlan indexQueryPlan, ConsistencyLevel consistencyLevel, E liveNaturalReplicas)
+    public static <E extends Endpoints<E>> E candidatesForRead(Keyspace keyspace, Index.QueryPlan indexQueryPlan, ConsistencyLevel consistencyLevel, E liveNaturalReplicas)
     {
         E replicas = consistencyLevel.isDatacenterLocal() ? liveNaturalReplicas.filter(InOurDcTester.replicas()) : liveNaturalReplicas;
 
@@ -677,7 +677,7 @@ private static <E extends Endpoints<E>> E contactForEachQuorumRead(NetworkTopolo
         });
     }
 
-    private static <E extends Endpoints<E>> E contactForRead(AbstractReplicationStrategy replicationStrategy, ConsistencyLevel consistencyLevel, boolean alwaysSpeculate, E candidates)
+    public static <E extends Endpoints<E>> E contactForRead(AbstractReplicationStrategy replicationStrategy, ConsistencyLevel consistencyLevel, boolean alwaysSpeculate, E candidates)
     {
         /*
          * If we are doing an each quorum query, we have to make sure that the endpoints we select
diff --git a/src/java/org/apache/cassandra/schema/KeyspaceMetadata.java b/src/java/org/apache/cassandra/schema/KeyspaceMetadata.java
@@ -35,6 +35,7 @@
 import org.apache.cassandra.db.marshal.UserType;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
+import org.apache.cassandra.locator.TokenMetadata;
 import org.apache.cassandra.schema.UserFunctions.FunctionsDiff;
 import org.apache.cassandra.schema.Tables.TablesDiff;
 import org.apache.cassandra.schema.Types.TypesDiff;
@@ -389,10 +390,15 @@ public void validate()
     }
 
     public AbstractReplicationStrategy createReplicationStrategy()
+    { // no-arg form: delegates with the TokenMetadata that StorageService reports for this keyspace's name
+        return createReplicationStrategy(StorageService.instance.getTokenMetadataForKeyspace(name));
+    }
+    /**
+     * Builds the replication strategy for this keyspace from its name, replication class and replication
+     * options, together with the configured endpoint snitch.
+     *
+     * @param tokenMetadata the token metadata to hand to the strategy instead of the one looked up through
+     *                      StorageService
+     * @return whatever AbstractReplicationStrategy.createReplicationStrategy produces for those arguments
+     */
+    public AbstractReplicationStrategy createReplicationStrategy(TokenMetadata tokenMetadata)
     {
         return AbstractReplicationStrategy.createReplicationStrategy(name,
                                                                      params.replication.klass,
-                                                                     StorageService.instance.getTokenMetadataForKeyspace(name),
+                                                                     tokenMetadata,
                                                                      DatabaseDescriptor.getEndpointSnitch(),
                                                                      params.replication.options);
     }
diff --git a/src/java/org/apache/cassandra/service/reads/range/AbstractReplicaPlanIterator.java b/src/java/org/apache/cassandra/service/reads/range/AbstractReplicaPlanIterator.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.service.reads.range;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.locator.ReplicaPlan;
+import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.utils.AbstractIterator;
+import org.apache.cassandra.utils.Pair;
+
+public abstract class AbstractReplicaPlanIterator extends AbstractIterator<ReplicaPlan.ForRangeRead>
+{ // Base type handed out by ReplicaPlanIteratorProvider; ReplicaPlanIterator is the default subclass.
+    public abstract int size(); // count of ReplicaPlan.ForRangeRead plans, as documented on ReplicaPlanIterator#size
+
+    /**
+     * Compute all ranges we're going to query, in sorted order. Nodes can be replica destinations for many ranges,
+     * so we need to restrict each scan to the specific range we want, or else we'd get duplicate results.
+     *
+     * @param tokenMetadata source of the sorted ring tokens that serve as split points
+     * @param queryRange the range to divide; handed back unchanged, as a singleton list, when it is a
+     *                   {@link Bounds} whose two ends are equal and not the minimum position
+     * @return the pieces of {@code queryRange} in the order the ring walk produced them; never empty, because
+     *         the unsplit remainder is always appended last
+     */
+    public static List<AbstractBounds<PartitionPosition>> getRestrictedRanges(TokenMetadata tokenMetadata, final AbstractBounds<PartitionPosition> queryRange)
+    {
+        // special case for bounds containing exactly 1 (non-minimum) token
+        if (queryRange instanceof Bounds && queryRange.left.equals(queryRange.right) && !queryRange.left.isMinimum())
+        {
+            return Collections.singletonList(queryRange);
+        }
+
+        List<AbstractBounds<PartitionPosition>> ranges = new ArrayList<>();
+        // divide the queryRange into pieces delimited by the ring and minimum tokens
+        // (the walk starts from the token of queryRange.left; NOTE(review): the trailing `true` presumably asks
+        // ringIterator to include the minimum token - confirm against TokenMetadata.ringIterator)
+        Iterator<Token> ringIter = TokenMetadata.ringIterator(tokenMetadata.sortedTokens(), queryRange.left.getToken(), true);
+        AbstractBounds<PartitionPosition> remainder = queryRange; // the part of queryRange not yet emitted
+        while (ringIter.hasNext())
+        {
+            /*
+             * remainder is a range/bounds of partition positions and we want to split it with a token. We want to split
+             * using the key returned by token.maxKeyBound. For instance, if remainder is [DK(10, 'foo'), DK(20, 'bar')],
+             * and we have 3 nodes with tokens 0, 15, 30, we want to split remainder to A=[DK(10, 'foo'), 15] and
+             * B=(15, DK(20, 'bar')]. But since we can't mix tokens and keys at the same time in a range, we use
+             * 15.maxKeyBound() to have A include all keys having 15 as token and B include none of those (since that is
+             * what our node owns).
+             *
+             * The walk ends at the first token whose max key bound is neither the remainder's left end nor
+             * contained in the remainder.
+             */
+            Token upperBoundToken = ringIter.next();
+            PartitionPosition upperBound = upperBoundToken.maxKeyBound();
+            if (!remainder.left.equals(upperBound) && !remainder.contains(upperBound))
+                // no more splits
+                break;
+            Pair<AbstractBounds<PartitionPosition>, AbstractBounds<PartitionPosition>> splits = remainder.split(upperBound);
+            if (splits == null)
+                continue; // split() produced nothing at this position: keep the remainder whole, try the next token
+
+            ranges.add(splits.left);
+            remainder = splits.right; // keep dividing only the right-hand piece
+        }
+        ranges.add(remainder); // whatever is left after the last split is the final piece
+
+        return ranges;
+    }
+}
diff --git a/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java b/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
@@ -73,10 +73,7 @@ static RangeCommandIterator rangeCommandIterator(PartitionRangeReadCommand comma
         Tracing.trace("Computing ranges to query");
 
         Keyspace keyspace = Keyspace.open(command.metadata().keyspace);
-        ReplicaPlanIterator replicaPlans = new ReplicaPlanIterator(command.dataRange().keyRange(),
-                                                                   command.indexQueryPlan(),
-                                                                   keyspace,
-                                                                   consistencyLevel);
+        AbstractReplicaPlanIterator replicaPlans = ReplicaPlanIteratorProvider.instance.getReplicaPlanIterator(command, keyspace, consistencyLevel);
         if (command.isTopK())
             return new ScanAllRangesCommandIterator(keyspace, replicaPlans, command, replicaPlans.size(), queryStartNanoTime, readTracker);
 
diff --git a/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanIterator.java b/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanIterator.java
@@ -36,11 +36,9 @@
 import org.apache.cassandra.locator.ReplicaPlan;
 import org.apache.cassandra.locator.ReplicaPlans;
 import org.apache.cassandra.locator.TokenMetadata;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.AbstractIterator;
 import org.apache.cassandra.utils.Pair;
 
-class ReplicaPlanIterator extends AbstractIterator<ReplicaPlan.ForRangeRead>
+class ReplicaPlanIterator extends AbstractReplicaPlanIterator
 {
     private final Keyspace keyspace;
     private final ConsistencyLevel consistency;
@@ -69,7 +67,8 @@ class ReplicaPlanIterator extends AbstractIterator<ReplicaPlan.ForRangeRead>
     /**
      * @return the number of {@link ReplicaPlan.ForRangeRead}s in this iterator
      */
-    int size()
+    @Override
+    public int size()
     {
         return rangeCount;
     }
@@ -82,47 +81,4 @@ protected ReplicaPlan.ForRangeRead computeNext()
 
         return ReplicaPlans.forRangeRead(keyspace, indexQueryPlan, consistency, ranges.next(), 1);
     }
-
-    /**
-     * Compute all ranges we're going to query, in sorted order. Nodes can be replica destinations for many ranges,
-     * so we need to restrict each scan to the specific range we want, or else we'd get duplicate results.
-     */
-    private static List<AbstractBounds<PartitionPosition>> getRestrictedRanges(TokenMetadata tokenMetadata, final AbstractBounds<PartitionPosition> queryRange)
-    {
-        // special case for bounds containing exactly 1 (non-minimum) token
-        if (queryRange instanceof Bounds && queryRange.left.equals(queryRange.right) && !queryRange.left.isMinimum())
-        {
-            return Collections.singletonList(queryRange);
-        }
-
-        List<AbstractBounds<PartitionPosition>> ranges = new ArrayList<>();
-        // divide the queryRange into pieces delimited by the ring and minimum tokens
-        Iterator<Token> ringIter = TokenMetadata.ringIterator(tokenMetadata.sortedTokens(), queryRange.left.getToken(), true);
-        AbstractBounds<PartitionPosition> remainder = queryRange;
-        while (ringIter.hasNext())
-        {
-            /*
-             * remainder is a range/bounds of partition positions and we want to split it with a token. We want to split
-             * using the key returned by token.maxKeyBound. For instance, if remainder is [DK(10, 'foo'), DK(20, 'bar')],
-             * and we have 3 nodes with tokens 0, 15, 30, we want to split remainder to A=[DK(10, 'foo'), 15] and
-             * B=(15, DK(20, 'bar')]. But since we can't mix tokens and keys at the same time in a range, we use
-             * 15.maxKeyBound() to have A include all keys having 15 as token and B include none of those (since that is
-             * what our node owns).
-             */
-            Token upperBoundToken = ringIter.next();
-            PartitionPosition upperBound = upperBoundToken.maxKeyBound();
-            if (!remainder.left.equals(upperBound) && !remainder.contains(upperBound))
-                // no more splits
-                break;
-            Pair<AbstractBounds<PartitionPosition>, AbstractBounds<PartitionPosition>> splits = remainder.split(upperBound);
-            if (splits == null)
-                continue;
-
-            ranges.add(splits.left);
-            remainder = splits.right;
-        }
-        ranges.add(remainder);
-
-        return ranges;
-    }
 }
diff --git a/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanIteratorProvider.java b/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanIteratorProvider.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.service.reads.range;
+
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.PartitionRangeReadCommand;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.config.CassandraRelevantProperties.CUSTOM_REPLICA_PLAN_ITERATOR_PROVIDER_PROPERTY;
+
+public interface ReplicaPlanIteratorProvider
+{ // Factory for the AbstractReplicaPlanIterator that RangeCommands.rangeCommandIterator obtains via `instance`.
+    ReplicaPlanIteratorProvider instance = CUSTOM_REPLICA_PLAN_ITERATOR_PROVIDER_PROPERTY.isPresent()
+                                     ? FBUtilities.construct(CUSTOM_REPLICA_PLAN_ITERATOR_PROVIDER_PROPERTY.getString(),
+                                                             "Replica Plan Iterator Provider")
+                                     : new DefaultReplicaPlanIteratorProvider(); // used when the property is not present
+    /**
+     * Creates the replica plan iterator for a partition range read.
+     *
+     * NOTE(review): {@code instance} above is built from the class named by the
+     * cassandra.custom_replica_plan_iterator_provider_class property when that property is present;
+     * FBUtilities.construct presumably instantiates it reflectively - confirm.
+     *
+     * @param command the range read command the plans are for
+     * @param keyspace the keyspace being read
+     * @param consistencyLevel the consistency level of the read
+     * @return an iterator of {@code ReplicaPlan.ForRangeRead} plans for the command
+     */
+    AbstractReplicaPlanIterator getReplicaPlanIterator(PartitionRangeReadCommand command, Keyspace keyspace, ConsistencyLevel consistencyLevel);
+    /* Default provider: wraps the command's key range and index query plan in a plain ReplicaPlanIterator. */
+    class DefaultReplicaPlanIteratorProvider implements ReplicaPlanIteratorProvider
+    {
+        @Override
+        public AbstractReplicaPlanIterator getReplicaPlanIterator(PartitionRangeReadCommand command, Keyspace keyspace, ConsistencyLevel consistencyLevel)
+        {
+            return new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, consistencyLevel);
+        }
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -341,7 +341,7 @@ static ReplicaLayout.ForTokenRead forTokenReadLiveSorted(AbstractReplicationStra`
`341`	`341`	`* @return the read layout for a range - this includes only live natural replicas, i.e. those that are not pending`
`342`	`342`	`* and not marked down by the failure detector. these are reverse sorted by the badness score of the configured snitch`
`343`	`343`	`*/`
`344`		`- static ReplicaLayout.ForRangeRead forRangeReadLiveSorted(AbstractReplicationStrategy replicationStrategy, AbstractBounds<PartitionPosition> range)`
	`344`	`+ public static ReplicaLayout.ForRangeRead forRangeReadLiveSorted(AbstractReplicationStrategy replicationStrategy, AbstractBounds<PartitionPosition> range)`
`345`	`345`	`{`
`346`	`346`	`EndpointsForRange replicas = replicationStrategy.getNaturalReplicas(range.right);`
`347`	`347`	`replicas = DatabaseDescriptor.getEndpointSnitch().sortedByProximity(FBUtilities.getBroadcastAddressAndPort(), replicas);`
Original file line number	Diff line number	Diff line change
`@@ -138,7 +138,7 @@ public static boolean isSufficientLiveReplicasForRead(AbstractReplicationStrateg`
`138`	`138`	`}`
`139`	`139`	`}`
`140`	`140`
`141`		`- static void assureSufficientLiveReplicasForRead(AbstractReplicationStrategy replicationStrategy, ConsistencyLevel consistencyLevel, Endpoints<?> liveReplicas) throws UnavailableException`
	`141`	`+ public static void assureSufficientLiveReplicasForRead(AbstractReplicationStrategy replicationStrategy, ConsistencyLevel consistencyLevel, Endpoints<?> liveReplicas) throws UnavailableException`
`142`	`142`	`{`
`143`	`143`	`assureSufficientLiveReplicas(replicationStrategy, consistencyLevel, liveReplicas, consistencyLevel.blockFor(replicationStrategy), 1);`
`144`	`144`	`}`
`@@ -659,7 +659,7 @@ public static ReplicaPlan.ForPaxosWrite forPaxos(Keyspace keyspace, DecoratedKey`
`659`	`659`	`}`
`660`	`660`
`661`	`661`
`662`		`- private static <E extends Endpoints<E>> E candidatesForRead(Keyspace keyspace, Index.QueryPlan indexQueryPlan, ConsistencyLevel consistencyLevel, E liveNaturalReplicas)`
	`662`	`+ public static <E extends Endpoints<E>> E candidatesForRead(Keyspace keyspace, Index.QueryPlan indexQueryPlan, ConsistencyLevel consistencyLevel, E liveNaturalReplicas)`
`663`	`663`	`{`
`664`	`664`	`E replicas = consistencyLevel.isDatacenterLocal() ? liveNaturalReplicas.filter(InOurDcTester.replicas()) : liveNaturalReplicas;`
`665`	`665`
`@@ -677,7 +677,7 @@ private static <E extends Endpoints<E>> E contactForEachQuorumRead(NetworkTopolo`
`677`	`677`	`});`
`678`	`678`	`}`
`679`	`679`
`680`		`- private static <E extends Endpoints<E>> E contactForRead(AbstractReplicationStrategy replicationStrategy, ConsistencyLevel consistencyLevel, boolean alwaysSpeculate, E candidates)`
	`680`	`+ public static <E extends Endpoints<E>> E contactForRead(AbstractReplicationStrategy replicationStrategy, ConsistencyLevel consistencyLevel, boolean alwaysSpeculate, E candidates)`
`681`	`681`	`{`
`682`	`682`	`/*`
`683`	`683`	`* If we are doing an each quorum query, we have to make sure that the endpoints we select`