Skip to content

Commit c343bb8

Browse files
committed
mutable remote vtables
1 parent 22a2503 commit c343bb8

File tree

9 files changed

+370
-18
lines changed

9 files changed

+370
-18
lines changed

src/java/org/apache/cassandra/db/virtual/RemoteToLocalVirtualTable.java

Lines changed: 166 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import java.util.ArrayList;
2424
import java.util.Arrays;
2525
import java.util.Collections;
26+
import java.util.Iterator;
2627
import java.util.List;
2728
import java.util.NavigableSet;
2829
import java.util.function.Function;
@@ -32,15 +33,19 @@
3233
import org.apache.cassandra.db.BufferClusteringBound;
3334
import org.apache.cassandra.db.Clustering;
3435
import org.apache.cassandra.db.ClusteringBound;
36+
import org.apache.cassandra.db.ClusteringPrefix;
3537
import org.apache.cassandra.db.DataRange;
3638
import org.apache.cassandra.db.DecoratedKey;
39+
import org.apache.cassandra.db.DeletionInfo;
3740
import org.apache.cassandra.db.PartitionPosition;
3841
import org.apache.cassandra.db.PartitionRangeReadCommand;
42+
import org.apache.cassandra.db.RangeTombstone;
3943
import org.apache.cassandra.db.ReadCommand;
4044
import org.apache.cassandra.db.ReadResponse;
4145
import org.apache.cassandra.db.SinglePartitionReadCommand;
4246
import org.apache.cassandra.db.Slice;
4347
import org.apache.cassandra.db.Slices;
48+
import org.apache.cassandra.db.TruncateRequest;
4449
import org.apache.cassandra.db.filter.ClusteringIndexFilter;
4550
import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
4651
import org.apache.cassandra.db.filter.ColumnFilter;
@@ -50,8 +55,11 @@
5055
import org.apache.cassandra.db.marshal.ByteBufferAccessor;
5156
import org.apache.cassandra.db.marshal.CompositeType;
5257
import org.apache.cassandra.db.marshal.Int32Type;
58+
import org.apache.cassandra.db.partitions.PartitionUpdate;
5359
import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
60+
import org.apache.cassandra.db.rows.BTreeRow;
5461
import org.apache.cassandra.db.rows.Cell;
62+
import org.apache.cassandra.db.rows.ColumnData;
5563
import org.apache.cassandra.db.rows.Row;
5664
import org.apache.cassandra.db.rows.Unfiltered;
5765
import org.apache.cassandra.db.rows.UnfilteredRowIterator;
@@ -68,6 +76,9 @@
6876
import org.apache.cassandra.schema.TableMetadata;
6977
import org.apache.cassandra.tcm.ClusterMetadata;
7078
import org.apache.cassandra.tcm.membership.NodeId;
79+
import org.apache.cassandra.utils.btree.BTree;
80+
import org.apache.cassandra.utils.concurrent.AsyncPromise;
81+
import org.apache.cassandra.utils.concurrent.Promise;
7182
import org.apache.cassandra.utils.concurrent.SyncPromise;
7283

7384
import static org.apache.cassandra.db.ClusteringBound.BOTTOM;
@@ -228,16 +239,32 @@ private Request(DataRange dataRange, RowFilter rowFilter, ColumnFilter columnFil
228239

229240
private void send(RequestAndResponse rr, InetAddressAndPort endpoint)
230241
{
231-
MessagingService.instance().sendWithCallback(Message.out(Verb.READ_REQ, rr.readCommand), endpoint, new RequestCallback<ReadResponse>()
242+
send(Verb.READ_REQ, rr.readCommand, rr, endpoint);
243+
}
244+
245+
private <Reply> Promise<Reply> send(Verb verb, Object payload, InetAddressAndPort endpoint)
246+
{
247+
Promise<Reply> promise = new AsyncPromise<>();
248+
send(verb, payload, promise, endpoint);
249+
return promise;
250+
}
251+
252+
private <Reply> void send(Verb verb, Object payload, Promise<Reply> promise, InetAddressAndPort endpoint)
253+
{
254+
// we have to send inline some of the MessagingService logic to circumvent the requirement to use AbstractWriteResponseHandler
255+
Message<?> message = Message.out(verb, payload);
256+
RequestCallback<?> callback = new RequestCallback<Reply>()
232257
{
233-
@Override public void onResponse(Message<ReadResponse> msg) { rr.trySuccess(msg.payload); }
258+
@Override public void onResponse(Message<Reply> msg) { promise.trySuccess(msg.payload); }
234259
@Override public boolean invokeOnFailure() { return true; }
235260
@Override public void onFailure(InetAddressAndPort from, RequestFailure failure)
236261
{
237-
if (failure.failure == null) rr.tryFailure(new RuntimeException(failure.reason.toString()));
238-
else rr.tryFailure(failure.failure);
262+
if (failure.failure == null) promise.tryFailure(new RuntimeException(failure.reason.toString()));
263+
else promise.tryFailure(failure.failure);
239264
}
240-
});
265+
};
266+
267+
MessagingService.instance().sendWithCallback(message, endpoint, callback);
241268
}
242269

243270
private void collect(PartitionsCollector collector, RequestAndResponse rr, Function<DecoratedKey, ByteBuffer[]> pksToCks)
@@ -280,7 +307,6 @@ private void collect(PartitionsCollector collector, RequestAndResponse rr, Funct
280307
}
281308
}
282309
}
283-
284310
}
285311

286312
private static boolean selectsOneRow(TableMetadata metadata, DataRange dataRange, DecoratedKey key)
@@ -299,9 +325,9 @@ private static boolean selectsOneRow(TableMetadata metadata, DataRange dataRange
299325
return slice.start().equals(slice.end());
300326
}
301327

302-
private static Function<DecoratedKey, ByteBuffer[]> partitionKeyToClusterings(TableMetadata metadata, TableMetadata local)
328+
private static Function<DecoratedKey, ByteBuffer[]> partitionKeyToClusterings(TableMetadata distributed, TableMetadata local)
303329
{
304-
ByteBuffer[] cks = new ByteBuffer[metadata.clusteringColumns().size()];
330+
ByteBuffer[] cks = new ByteBuffer[distributed.clusteringColumns().size()];
305331
if (local.partitionKeyColumns().size() == 1)
306332
{
307333
return pk -> {
@@ -448,4 +474,136 @@ private static ClusteringIndexSliceFilter filter(TableMetadata metadata, Cluster
448474
{
449475
return new ClusteringIndexSliceFilter(Slices.with(metadata.comparator, Slice.make(start, end)), reversed);
450476
}
477+
478+
@Override
479+
public void apply(PartitionUpdate update)
480+
{
481+
int nodeId = Int32Type.instance.compose(update.partitionKey().getKey());
482+
InetAddressAndPort endpoint = ClusterMetadata.current().directory.endpoint(new NodeId(nodeId));
483+
if (endpoint == null)
484+
throw new InvalidRequestException("Unknown node " + nodeId);
485+
486+
DeletionInfo deletionInfo = update.deletionInfo();
487+
if (!deletionInfo.getPartitionDeletion().isLive())
488+
{
489+
truncate(endpoint).syncThrowUncheckedOnInterrupt();
490+
return;
491+
}
492+
493+
int pkCount = local.partitionKeyColumns().size();
494+
ByteBuffer[] pkBuffer, ckBuffer;
495+
{
496+
int ckCount = local.clusteringColumns().size();
497+
pkBuffer = pkCount == 1 ? null : new ByteBuffer[pkCount];
498+
ckBuffer = new ByteBuffer[ckCount];
499+
}
500+
501+
PartitionUpdate.Builder builder = null;
502+
ArrayDeque<Promise<Void>> results = new ArrayDeque<>();
503+
504+
if (deletionInfo.hasRanges())
505+
{
506+
Iterator<RangeTombstone> iterator = deletionInfo.rangeIterator(false);
507+
while (iterator.hasNext())
508+
{
509+
RangeTombstone rt = iterator.next();
510+
ClusteringBound start = rt.deletedSlice().start();
511+
ClusteringBound end = rt.deletedSlice().end();
512+
if (start.size() < pkCount || end.size() < pkCount)
513+
throw new InvalidRequestException("Range deletions must specify a complete partition key in the underlying table " + metadata);
514+
515+
for (int i = 0 ; i < pkCount ; ++i)
516+
{
517+
if (0 != start.accessor().compare(start.get(i), end.get(i), end.accessor()))
518+
throw new InvalidRequestException("Range deletions must specify a single partition key in the underlying table " + metadata);
519+
}
520+
521+
DecoratedKey key = remoteClusteringToLocalPartitionKey(local, start, pkCount, pkBuffer);
522+
builder = maybeRolloverAndWait(key, builder, results, endpoint);
523+
if (start.size() == pkCount && end.size() == pkCount)
524+
{
525+
builder.addPartitionDeletion(rt.deletionTime());
526+
}
527+
else
528+
{
529+
start = ClusteringBound.create(start.kind(), Clustering.make(remoteClusteringToLocalClustering(start.clustering(), pkCount, ckBuffer)));
530+
end = ClusteringBound.create(end.kind(), Clustering.make(remoteClusteringToLocalClustering(end.clustering(), pkCount, ckBuffer)));
531+
builder.add(new RangeTombstone(Slice.make(start, end), rt.deletionTime()));
532+
}
533+
}
534+
}
535+
536+
if (!update.staticRow().isEmpty())
537+
throw new InvalidRequestException("Static rows are not supported for remote table " + metadata);
538+
539+
try (BTree.FastBuilder<ColumnData> columns = BTree.fastBuilder())
540+
{
541+
for (Row row : update)
542+
{
543+
Clustering<?> clustering = row.clustering();
544+
DecoratedKey key = remoteClusteringToLocalPartitionKey(local, clustering, pkCount, pkBuffer);
545+
builder = maybeRolloverAndWait(key, builder, results, endpoint);
546+
Clustering newClustering = Clustering.make(remoteClusteringToLocalClustering(clustering, pkCount, ckBuffer));
547+
columns.reset();
548+
for (ColumnData cd : row)
549+
columns.add(rebind(local, cd));
550+
builder.add(BTreeRow.create(newClustering, row.primaryKeyLivenessInfo(), row.deletion(), columns.build()));
551+
}
552+
}
553+
554+
if (builder != null)
555+
results.add(send(Verb.VIRTUAL_MUTATION_REQ, new VirtualMutation(builder.build()), endpoint));
556+
557+
while (!results.isEmpty())
558+
results.pollFirst().syncThrowUncheckedOnInterrupt();
559+
}
560+
561+
private PartitionUpdate.Builder maybeRolloverAndWait(DecoratedKey key, PartitionUpdate.Builder builder, ArrayDeque<Promise<Void>> waiting, InetAddressAndPort endpoint)
562+
{
563+
if (builder == null || !builder.partitionKey().equals(key))
564+
{
565+
if (builder != null)
566+
waiting.add(send(Verb.VIRTUAL_MUTATION_REQ, new VirtualMutation(builder.build()), endpoint));
567+
builder = new PartitionUpdate.Builder(local, key, local.regularAndStaticColumns(), 8);
568+
while (waiting.size() >= MAX_CONCURRENCY)
569+
waiting.pollFirst().syncThrowUncheckedOnInterrupt();
570+
}
571+
return builder;
572+
}
573+
574+
private Promise<Void> truncate(InetAddressAndPort endpoint)
575+
{
576+
return send(Verb.TRUNCATE_REQ, new TruncateRequest(local.keyspace, local.name), endpoint);
577+
}
578+
579+
private static ColumnData rebind(TableMetadata local, ColumnData cd)
580+
{
581+
ColumnMetadata column = local.getColumn(cd.column().name);
582+
583+
Invariants.require(column != null, cd.column() + " not found in " + local);
584+
Invariants.require(!column.isComplex(), "Complex column " + column + " not supported; should have been removed from metadata");
585+
586+
return ((Cell<?>) cd).withUpdatedColumn(column);
587+
}
588+
589+
private static DecoratedKey remoteClusteringToLocalPartitionKey(TableMetadata local, ClusteringPrefix clustering, int pkCount, ByteBuffer[] pkBuffer)
590+
{
591+
ByteBuffer bytes;
592+
if (pkCount == 1) bytes = clustering.bufferAt(0);
593+
else
594+
{
595+
for (int i = 0 ; i < pkBuffer.length ; ++i)
596+
pkBuffer[i] = clustering.bufferAt(i);
597+
bytes = CompositeType.build(ByteBufferAccessor.instance, pkBuffer);
598+
}
599+
return local.partitioner.decorateKey(bytes);
600+
}
601+
602+
private static ByteBuffer[] remoteClusteringToLocalClustering(ClusteringPrefix clustering, int pkCount, ByteBuffer[] ckBuffer)
603+
{
604+
for (int i = pkCount ; i < clustering.size(); ++i)
605+
ckBuffer[i - pkCount] = clustering.bufferAt(i);
606+
607+
return Arrays.copyOf(ckBuffer, clustering.size() - pkCount);
608+
}
451609
}

src/java/org/apache/cassandra/db/virtual/VirtualKeyspaceRegistry.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,11 @@
2424
import com.google.common.collect.Iterables;
2525

2626
import org.apache.cassandra.schema.KeyspaceMetadata;
27+
import org.apache.cassandra.schema.Schema;
2728
import org.apache.cassandra.schema.TableId;
2829
import org.apache.cassandra.schema.TableMetadata;
30+
import org.apache.cassandra.tcm.ClusterMetadata;
31+
import org.apache.cassandra.tcm.ClusterMetadataService;
2932

3033
public final class VirtualKeyspaceRegistry
3134
{

src/java/org/apache/cassandra/db/virtual/VirtualMutation.java

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
*/
1818
package org.apache.cassandra.db.virtual;
1919

20+
import java.io.IOException;
2021
import java.util.Collection;
22+
import java.util.Map;
2123
import java.util.concurrent.TimeUnit;
2224
import java.util.function.Predicate;
2325
import java.util.function.Supplier;
@@ -26,14 +28,27 @@
2628
import com.google.common.base.MoreObjects;
2729
import com.google.common.collect.ImmutableMap;
2830

31+
import accord.utils.Invariants;
2932
import org.apache.cassandra.config.DatabaseDescriptor;
3033
import org.apache.cassandra.db.DecoratedKey;
3134
import org.apache.cassandra.db.IMutation;
3235
import org.apache.cassandra.db.Mutation;
3336
import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts;
37+
import org.apache.cassandra.db.TypeSizes;
3438
import org.apache.cassandra.db.partitions.PartitionUpdate;
39+
import org.apache.cassandra.db.rows.DeserializationHelper;
40+
import org.apache.cassandra.io.IVersionedSerializer;
41+
import org.apache.cassandra.io.util.DataInputPlus;
42+
import org.apache.cassandra.io.util.DataOutputPlus;
43+
import org.apache.cassandra.net.IVerbHandler;
44+
import org.apache.cassandra.net.Message;
45+
import org.apache.cassandra.net.MessagingService;
46+
import org.apache.cassandra.net.NoPayload;
3547
import org.apache.cassandra.schema.TableId;
48+
import org.apache.cassandra.schema.TableMetadata;
49+
import org.apache.cassandra.serializers.CollectionSerializer;
3650
import org.apache.cassandra.service.ClientState;
51+
import org.apache.cassandra.utils.CollectionSerializers;
3752

3853
/**
3954
* A specialised IMutation implementation for virtual keyspaces.
@@ -42,6 +57,35 @@
4257
*/
4358
public final class VirtualMutation implements IMutation
4459
{
60+
public static final IVersionedSerializer<VirtualMutation> serializer = new IVersionedSerializer<VirtualMutation>()
61+
{
62+
@Override
63+
public void serialize(VirtualMutation t, DataOutputPlus out, int version) throws IOException
64+
{
65+
Invariants.require(t.modifications.size() == 1);
66+
PartitionUpdate.serializer.serialize(t.modifications.values().iterator().next(), out, version);
67+
}
68+
69+
@Override
70+
public VirtualMutation deserialize(DataInputPlus in, int version) throws IOException
71+
{
72+
PartitionUpdate update = PartitionUpdate.serializer.deserialize(in, version, DeserializationHelper.Flag.FROM_REMOTE);
73+
return new VirtualMutation(update);
74+
}
75+
76+
@Override
77+
public long serializedSize(VirtualMutation t, int version)
78+
{
79+
Invariants.require(t.modifications.size() == 1);
80+
return PartitionUpdate.serializer.serializedSize(t.modifications.values().iterator().next(), version);
81+
}
82+
};
83+
84+
public static final IVerbHandler<VirtualMutation> handler = message -> {
85+
message.payload.apply();
86+
MessagingService.instance().respond(NoPayload.noPayload, message);
87+
};
88+
4589
private final String keyspaceName;
4690
private final DecoratedKey partitionKey;
4791
private final ImmutableMap<TableId, PartitionUpdate> modifications;

src/java/org/apache/cassandra/db/virtual/VirtualTable.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,4 +100,6 @@ default boolean allowFilteringPrimaryKeysImplicitly()
100100
{
101101
return allowFilteringImplicitly();
102102
}
103+
104+
/**
 * Reports the ordering this table declares; the default is {@code Sorted.UNSORTED}.
 * NOTE(review): the {@code Sorted} type is declared outside the visible diff — confirm its semantics.
 */
default Sorted sorted()
{
    return Sorted.UNSORTED;
}
103105
}

src/java/org/apache/cassandra/exceptions/ExceptionSerializer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ public class ExceptionSerializer
4646
{
4747
public static class RemoteException extends RuntimeException
4848
{
49-
private final String originalClass;
49+
public final String originalClass;
5050

5151
public RemoteException(String originalClass, String originalMessage, StackTraceElement[] stackTrace)
5252
{

src/java/org/apache/cassandra/exceptions/RequestFailure.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131

3232
import static com.google.common.base.Preconditions.checkNotNull;
3333
import static org.apache.cassandra.exceptions.ExceptionSerializer.nullableRemoteExceptionSerializer;
34+
import static org.apache.cassandra.exceptions.RequestFailureReason.UNKNOWN;
3435

3536
/**
3637
* Allow inclusion of a serialized exception in failure response messages
@@ -125,7 +126,7 @@ public static RequestFailure forException(Throwable t)
125126
if (t instanceof CoordinatorBehindException)
126127
return COORDINATOR_BEHIND;
127128

128-
return UNKNOWN;
129+
return new RequestFailure(t);
129130
}
130131

131132
public static RequestFailure forReason(RequestFailureReason reason)
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.cassandra.locator;
20+
21+
import java.util.Map;
22+
23+
public class RemoteStrategy extends LocalStrategy
24+
{
25+
public RemoteStrategy(String keyspaceName, Map<String, String> configOptions)
26+
{
27+
super(keyspaceName, configOptions);
28+
}
29+
}

0 commit comments

Comments
 (0)