Skip to content

Commit 4c6f976

Browse files
committed
Add support for RandomPartitioner
1 parent 6de84fb commit 4c6f976

File tree

6 files changed

+81
-27
lines changed

6 files changed

+81
-27
lines changed

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ public abstract class AbstractJobSession {
5252
protected String sourceKeyspaceTable;
5353
protected String astraKeyspaceTable;
5454

55+
56+
protected Boolean hasRandomPartitioner;
57+
5558
protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
5659

5760
this.sourceSession = sourceSession;
@@ -87,6 +90,8 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
8790
logger.info(" DEFAULT -- WriteRateLimit: " + writeLimiter.getRate());
8891
logger.info(" DEFAULT -- WriteTimestampFilter: " + writeTimeStampFilter);
8992

93+
hasRandomPartitioner = Boolean.parseBoolean(sparkConf.get("spark.migrate.source.hasRandomPartitioner", "false"));
94+
9095
isCounterTable = Boolean.parseBoolean(sparkConf.get("spark.migrate.source.counterTable", "false"));
9196

9297
counterDeltaMaxIndex = Integer

src/main/java/datastax/astra/migrate/CopyJobSession.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import com.datastax.oss.driver.api.core.CqlSession;
44
import com.datastax.oss.driver.api.core.cql.*;
55
import com.datastax.oss.driver.internal.core.metadata.token.Murmur3Token;
6+
import com.datastax.oss.driver.internal.core.metadata.token.RandomToken;
67
import org.apache.log4j.Logger;
78
import org.apache.spark.SparkConf;
89

@@ -74,14 +75,15 @@ protected CopyJobSession(CqlSession sourceSession, CqlSession astraSession, Spar
7475

7576
}
7677

77-
public void getDataAndInsert(Long min, Long max) {
78+
public void getDataAndInsert(BigInteger min, BigInteger max) {
7879
logger.info("ThreadID: " + Thread.currentThread().getId() + " Processing min: " + min + " max:" + max);
7980
int maxAttempts = maxRetries;
8081
for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) {
8182

8283
try {
8384

84-
ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(new Murmur3Token(min), new Murmur3Token(max)));
85+
ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner? min : min.longValueExact(), hasRandomPartitioner? max : max.longValueExact()));
86+
8587
Collection<CompletionStage<AsyncResultSet>> writeResults = new ArrayList<CompletionStage<AsyncResultSet>>();
8688

8789
// cannot do batching if the writeFilter is greater than 0 or

src/main/java/datastax/astra/migrate/DiffJobSession.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import org.apache.log4j.Logger;
88
import org.apache.spark.SparkConf;
99

10+
import java.math.BigInteger;
1011
import java.util.ArrayList;
1112
import java.util.List;
1213
import java.util.concurrent.ForkJoinPool;
@@ -45,6 +46,7 @@ public static DiffJobSession getInstance(CqlSession sourceSession, CqlSession as
4546
}
4647
}
4748
}
49+
4850
return diffJobSession;
4951
}
5052

@@ -54,7 +56,7 @@ private DiffJobSession(CqlSession sourceSession, CqlSession astraSession, SparkC
5456
selectColTypes = getTypes(sparkConf.get("spark.migrate.diff.select.types"));
5557
}
5658

57-
public void getDataAndDiff(Long min, Long max) {
59+
public void getDataAndDiff(BigInteger min, BigInteger max) {
5860
ForkJoinPool customThreadPool = new ForkJoinPool();
5961
logger.info("ThreadID: " + Thread.currentThread().getId() + " Processing min: " + min + " max:" + max);
6062
int maxAttempts = maxRetries;
@@ -63,7 +65,7 @@ public void getDataAndDiff(Long min, Long max) {
6365
try {
6466
// cannot do batching if the writeFilter is greater than 0
6567
ResultSet resultSet = sourceSession.execute(
66-
sourceSelectStatement.bind(min, max).setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM));
68+
sourceSelectStatement.bind(hasRandomPartitioner? min : min.longValueExact(), hasRandomPartitioner? max : max.longValueExact()).setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM));
6769

6870
customThreadPool.submit(() -> {
6971
StreamSupport.stream(resultSet.spliterator(), true).forEach(sRow -> {

src/main/java/datastax/astra/migrate/DiffMetaJobSession.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import org.apache.spark.SparkConf;
88
import org.spark_project.jetty.util.ConcurrentHashSet;
99

10+
import java.math.BigInteger;
1011
import java.util.*;
1112
import java.util.concurrent.CompletionStage;
1213
import java.util.concurrent.atomic.AtomicLong;
@@ -93,7 +94,7 @@ private DiffMetaJobSession(CqlSession sourceSession, CqlSession astraSession, Sp
9394
}
9495

9596

96-
public void getDataDiffAndCorrect(Long min, Long max) {
97+
public void getDataDiffAndCorrect(BigInteger min, BigInteger max) {
9798
try {
9899
correctData(getDataAndDiff(min, max));
99100
logger.info("ThreadID: " + Thread.currentThread().getId() + " CorrectFinal Read Record Count: " + readCounter.get());
@@ -108,14 +109,14 @@ public void getDataDiffAndCorrect(Long min, Long max) {
108109
}
109110
}
110111

111-
private Set<SrcDestKey> getDataAndDiff(Long min, Long max) {
112+
private Set<SrcDestKey> getDataAndDiff(BigInteger min, BigInteger max) {
112113
Set<SrcDestKey> srcDestKeys = new HashSet<SrcDestKey>();
113114
logger.info("ThreadID: " + Thread.currentThread().getId() + " Processing min: " + min + " max:" + max);
114115
int maxAttempts = maxRetries;
115116
for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) {
116117

117118
try {
118-
ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(min, max).setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM));
119+
ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner? min : min.longValueExact(), hasRandomPartitioner? max : max.longValueExact()).setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM));
119120
for (Row sourceRow : resultSet) {
120121
readLimiter.acquire(1);
121122
// do not process rows less than minWriteTimeStampFilter or more than

src/main/java/datastax/astra/migrate/SplitPartitions.java

Lines changed: 60 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
package datastax.astra.migrate;
22

3+
import com.datastax.oss.driver.internal.core.metadata.token.RandomToken;
4+
import com.datastax.spark.connector.rdd.partitioner.dht.TokenFactory;
35
import org.apache.log4j.Logger;
6+
import scala.math.BigInt;
47

58
import java.io.Serializable;
69
import java.math.BigInteger;
@@ -15,9 +18,12 @@ public class SplitPartitions {
1518
public final static Long MIN_PARTITION = Long.MIN_VALUE;
1619
public final static Long MAX_PARTITION = Long.MAX_VALUE;
1720

21+
public static final BigInteger MIN_RANDOM = new BigInteger("-1");
22+
public static final BigInteger MAX_RANDOM = (new BigInteger("2")).pow(127);
23+
1824

1925
public static void main(String[] args){
20-
Collection<Partition> partitions = getSubPartitions(new BigInteger("10"), BigInteger.valueOf(MIN_PARTITION), BigInteger.valueOf(MAX_PARTITION));
26+
Collection<Partition> partitions = getSubPartitions(new BigInteger("20"), MIN_RANDOM, MAX_RANDOM);
2127
for(Partition partition: partitions){
2228
System.out.println(partition);
2329
}
@@ -32,23 +38,53 @@ public static Collection<Partition> getRandomSubPartitions(BigInteger splitSize,
3238
Collections.shuffle(partitions);
3339
return partitions;
3440
}
41+
// private static List<Partition> getSubPartitions(BigInteger splitSize, BigInteger min, BigInteger max){
42+
// long curMax = min.longValueExact();
43+
// long partitionSize = max.subtract(min).divide(splitSize).longValueExact();
44+
// List<Partition> partitions = new ArrayList<Partition>();
45+
// if(partitionSize==0){
46+
// partitionSize=100000;
47+
// }
48+
// boolean exausted = false;
49+
// while(curMax<=max.longValueExact()){
50+
// long curMin = curMax;
51+
// long newCurMax = curMin + partitionSize;
52+
// if (newCurMax < curMax) {
53+
// newCurMax = max.longValueExact();
54+
// exausted = true;
55+
// }
56+
// if(newCurMax > max.longValueExact()){
57+
// newCurMax=max.longValueExact();
58+
// exausted=true;
59+
// }
60+
// curMax = newCurMax;
61+
// partitions.add(new Partition(curMin,curMax));
62+
// if(exausted){
63+
// break;
64+
// }
65+
// }
66+
//
67+
// return partitions;
68+
// }
69+
70+
3571
private static List<Partition> getSubPartitions(BigInteger splitSize, BigInteger min, BigInteger max){
36-
long curMax = min.longValueExact();
37-
long partitionSize = max.subtract(min).divide(splitSize).longValueExact();
72+
BigInteger curMax = new BigInteger(min.toString());
73+
BigInteger partitionSize = max.subtract(min).divide(splitSize);
3874
List<Partition> partitions = new ArrayList<Partition>();
39-
if(partitionSize==0){
40-
partitionSize=100000;
75+
if(partitionSize.compareTo(new BigInteger("0"))==0){
76+
partitionSize=new BigInteger("100000");
4177
}
4278
boolean exausted = false;
43-
while(curMax<=max.longValueExact()){
44-
long curMin = curMax;
45-
long newCurMax = curMin + partitionSize;
46-
if (newCurMax < curMax) {
47-
newCurMax = max.longValueExact();
79+
while(curMax.compareTo(max) <=0){
80+
BigInteger curMin = new BigInteger(curMax.toString());
81+
BigInteger newCurMax = curMin.add(partitionSize);
82+
if (newCurMax.compareTo(curMax) == -1) {
83+
newCurMax = new BigInteger(max.toString());
4884
exausted = true;
4985
}
50-
if(newCurMax > max.longValueExact()){
51-
newCurMax=max.longValueExact();
86+
if (newCurMax.compareTo(max)==1){
87+
newCurMax = new BigInteger(max.toString());
5288
exausted=true;
5389
}
5490
curMax = newCurMax;
@@ -63,27 +99,32 @@ private static List<Partition> getSubPartitions(BigInteger splitSize, BigInteger
6399

64100

65101

66-
67102
public static class Partition implements Serializable{
68103
private static final long serialVersionUID = 1L;
69104

70-
private Long min;
71-
private Long max;
72-
public Partition(Long min, Long max){
105+
private BigInteger min;
106+
private BigInteger max;
107+
108+
109+
public Partition(BigInteger min, BigInteger max){
73110
this.min = min;
74111
this.max = max;
75112
}
76113

77-
public Long getMin() {
114+
115+
116+
public BigInteger getMin() {
78117
return min;
79118
}
80119

81-
public Long getMax() {
120+
public BigInteger getMax() {
82121
return max;
83122
}
84123

85124
public String toString(){
86-
return "--conf spark.migrate.source.minPartition="+ min + " --conf spark.migrate.source.maxPartition=" + max;
125+
// return "--conf spark.migrate.source.minPartition="+ min + " --conf spark.migrate.source.maxPartition=" + max;
126+
127+
return "select * from field_api.field_users where token(account_id,field_id)>="+ min + " and token(account_id,field_id)<=" + max + " and account_id=ee8556f4-9a1a-4c89-ae05-e8105d42ed6f allow filtering; ";
87128
}
88129
}
89130
}

src/resources/runCommands.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
wget https://archive.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.6.tgz
22

3-
~/Documents/Astra/spark-2.4.8-bin-hadoop2.7/bin//spark-submit --properties-file /Users/ankitpatel/Documents/spark-astra-migrator-ranges/src/resources/sparkConf.properties --verbose --master "local[8]" --conf spark.migrate.source.minPartition=-9223372036854775808 --conf spark.migrate.source.maxPartition=9223372036854775807 --class datastax.astra.migrate.Migrate /Users/ankitpatel/Documents/spark-astra-migrator-ranges/target/migrate-0.1.jar
3+
--driver-memory 8G
44

5+
~/Documents/Astra/spark-2.4.8-bin-hadoop2.7/bin/spark-submit --properties-file /Users/ankitpatel/Documents/spark-astra-migrator-ranges/src/resources/sparkConf.properties --verbose --master "local[8]" --conf spark.migrate.source.minPartition=-9223372036854775808 --conf spark.migrate.source.maxPartition=9223372036854775807 --class datastax.astra.migrate.Migrate /Users/ankitpatel/Documents/spark-astra-migrator-ranges/target/migrate-0.1.jar
56

7+
Random Partitioner Run Command
8+
~/Documents/Astra/spark-2.4.8-bin-hadoop2.7/bin/spark-submit --properties-file /Users/ankitpatel/Documents/spark-astra-migrator-ranges/src/resources/sparkConf.properties --verbose --master "local[8]" --conf spark.migrate.source.minPartition=-1 --conf spark.migrate.source.maxPartition=170141183460469231731687303715884105728 --class datastax.astra.migrate.Migrate /Users/ankitpatel/Documents/spark-astra-migrator-ranges/target/migrate-0.9.jar
69

710

811
//Diff Data

0 commit comments

Comments
 (0)