
Commit 7a524a7

Merge pull request #56 from datastax/feature/consistency-bug
Use applicable read/write consistency from properties.
2 parents f0a6360 + d0dc34b commit 7a524a7
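
In effect, this PR replaces the previously hard-coded ConsistencyLevel.LOCAL_QUORUM with levels resolved from two new properties, spark.consistency.read and spark.consistency.write. A minimal sketch of that flow, assuming the Util helpers added in this commit are on the classpath (the class and method names below are illustrative only, not part of the commit):

    package datastax.astra.migrate; // same package only so the Util helper resolves

    import com.datastax.oss.driver.api.core.ConsistencyLevel;
    import com.datastax.oss.driver.api.core.cql.BoundStatement;
    import com.datastax.oss.driver.api.core.cql.PreparedStatement;
    import org.apache.spark.SparkConf;

    // Hypothetical, stand-alone illustration of the consistency flow introduced by this PR.
    public class ConsistencyFlowSketch {

        public static BoundStatement bindRead(SparkConf sc, PreparedStatement selectStatement) {
            // Resolve the read consistency from the new property; empty/unknown values
            // fall back to LOCAL_QUORUM inside Util.mapToConsistencyLevel.
            ConsistencyLevel readCL =
                    Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.read"));
            // BoundStatement is immutable in driver 4.x; setConsistencyLevel returns a new statement.
            return selectStatement.bind().setConsistencyLevel(readCL);
        }

        public static BoundStatement bindWrite(SparkConf sc, PreparedStatement insertStatement) {
            ConsistencyLevel writeCL =
                    Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.write"));
            return insertStatement.bind().setConsistencyLevel(writeCL);
        }
    }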

File tree

9 files changed, +115 −44 lines


pom.xml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 
     <groupId>datastax.astra.migrate</groupId>
     <artifactId>cassandra-data-migrator</artifactId>
-    <version>2.10.1</version>
+    <version>2.11.0</version>
     <packaging>jar</packaging>
 
     <properties>

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 8 additions & 2 deletions
@@ -24,6 +24,7 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
     }
 
     protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc, boolean isJobMigrateRowsFromFile) {
+        super(sc);
         this.sourceSession = sourceSession;
         this.astraSession = astraSession;
 
@@ -79,6 +80,8 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
             customWritetime = Long.parseLong(customWriteTimeStr);
         }
 
+        logger.info("PARAM -- Read Consistency: {}", readConsistencyLevel);
+        logger.info("PARAM -- Write Consistency: {}", writeConsistencyLevel);
         logger.info("PARAM -- Write Batch Size: {}", batchSize);
         logger.info("PARAM -- Read Fetch Size: {}", fetchSizeInRows);
         logger.info("PARAM -- Source Keyspace Table: {}", sourceKeyspaceTable);
@@ -98,6 +101,9 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
         String selectCols = Util.getSparkProp(sc, "spark.query.origin");
         String partionKey = Util.getSparkProp(sc, "spark.query.origin.partitionKey");
         String sourceSelectCondition = Util.getSparkPropOrEmpty(sc, "spark.query.condition");
+        if (!sourceSelectCondition.isEmpty() && !sourceSelectCondition.trim().toUpperCase().startsWith("AND")) {
+            sourceSelectCondition = " AND " + sourceSelectCondition;
+        }
 
         final StringBuilder selectTTLWriteTimeCols = new StringBuilder();
         String[] allCols = selectCols.split(",");
@@ -172,7 +178,7 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
     }
 
     public BoundStatement bindInsert(PreparedStatement insertStatement, Row sourceRow, Row astraRow) {
-        BoundStatement boundInsertStatement = insertStatement.bind();
+        BoundStatement boundInsertStatement = insertStatement.bind().setConsistencyLevel(writeConsistencyLevel);
 
         if (isCounterTable) {
             for (int index = 0; index < selectColTypes.size(); index++) {
@@ -232,7 +238,7 @@ public long getLargestWriteTimeStamp(Row sourceRow) {
     }
 
     public BoundStatement selectFromAstra(PreparedStatement selectStatement, Row sourceRow) {
-        BoundStatement boundSelectStatement = selectStatement.bind();
+        BoundStatement boundSelectStatement = selectStatement.bind().setConsistencyLevel(readConsistencyLevel);
         for (int index = 0; index < idColTypes.size(); index++) {
             MigrateDataType dataType = idColTypes.get(index);
             boundSelectStatement = boundSelectStatement.set(index, getData(dataType, index, sourceRow),

src/main/java/datastax/astra/migrate/BaseJobSession.java

Lines changed: 9 additions & 0 deletions
@@ -1,9 +1,11 @@
 package datastax.astra.migrate;
 
+import com.datastax.oss.driver.api.core.ConsistencyLevel;
 import com.datastax.oss.driver.api.core.CqlSession;
 import com.datastax.oss.driver.api.core.cql.PreparedStatement;
 import com.datastax.oss.driver.api.core.cql.Row;
 import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter;
+import org.apache.spark.SparkConf;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -15,6 +17,8 @@ public abstract class BaseJobSession {
     protected PreparedStatement sourceSelectStatement;
     protected PreparedStatement astraSelectStatement;
     protected PreparedStatement astraInsertStatement;
+    protected ConsistencyLevel readConsistencyLevel;
+    protected ConsistencyLevel writeConsistencyLevel;
 
     // Read/Write Rate limiter
     // Determine the total throughput for the entire cluster in terms of wries/sec,
@@ -55,6 +59,11 @@ public abstract class BaseJobSession {
     protected Integer filterColIndex;
     protected String filterColValue;
 
+    protected BaseJobSession(SparkConf sc) {
+        readConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.read"));
+        writeConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.write"));
+    }
+
     public String getKey(Row sourceRow) {
         StringBuffer key = new StringBuffer();
         for (int index = 0; index < idColTypes.size(); index++) {

src/main/java/datastax/astra/migrate/CopyJobSession.java

Lines changed: 4 additions & 2 deletions
@@ -47,8 +47,10 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
         for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) {
 
             try {
-                ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ? min : min.longValueExact(),
-                        hasRandomPartitioner ? max : max.longValueExact()).setPageSize(fetchSizeInRows));
+                ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ?
+                        min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact())
+                        .setConsistencyLevel(readConsistencyLevel).setPageSize(fetchSizeInRows));
+
                 Collection<CompletionStage<AsyncResultSet>> writeResults = new ArrayList<CompletionStage<AsyncResultSet>>();
 
                 // cannot do batching if the writeFilter is greater than 0 or

src/main/java/datastax/astra/migrate/CopyPKJobSession.java

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ public void getRowAndInsert(List<SplitPartitions.PKRows> rowsList) {
             readCounter.incrementAndGet();
             String[] pkFields = row.split(" %% ");
             int idx = 0;
-            BoundStatement bspk = sourceSelectStatement.bind();
+            BoundStatement bspk = sourceSelectStatement.bind().setConsistencyLevel(readConsistencyLevel);
             for (MigrateDataType tp : idColTypes) {
                 bspk = bspk.set(idx, convert(tp.typeClass, pkFields[idx]), tp.typeClass);
                 idx++;

src/main/java/datastax/astra/migrate/DiffJobSession.java

Lines changed: 3 additions & 4 deletions
@@ -1,6 +1,5 @@
 package datastax.astra.migrate;
 
-import com.datastax.oss.driver.api.core.ConsistencyLevel;
 import com.datastax.oss.driver.api.core.CqlSession;
 import com.datastax.oss.driver.api.core.cql.AsyncResultSet;
 import com.datastax.oss.driver.api.core.cql.ResultSet;
@@ -61,9 +60,9 @@ public void getDataAndDiff(BigInteger min, BigInteger max) {
 
             try {
                 // cannot do batching if the writeFilter is greater than 0
-                ResultSet resultSet = sourceSession.execute(
-                        sourceSelectStatement.bind(hasRandomPartitioner ? min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact())
-                                .setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM).setPageSize(fetchSizeInRows));
+                ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ?
+                        min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact())
+                        .setConsistencyLevel(readConsistencyLevel).setPageSize(fetchSizeInRows));
 
                 Map<Row, CompletionStage<AsyncResultSet>> srcToTargetRowMap = new HashMap<Row, CompletionStage<AsyncResultSet>>();
                 StreamSupport.stream(resultSet.spliterator(), false).forEach(srcRow -> {

src/main/java/datastax/astra/migrate/OriginCountJobSession.java

Lines changed: 24 additions & 23 deletions
@@ -26,41 +26,41 @@ public class OriginCountJobSession extends BaseJobSession {
     protected Integer fieldGuardraillimitMB;
     protected List<MigrateDataType> checkTableforColSizeTypes = new ArrayList<MigrateDataType>();
 
-    protected OriginCountJobSession(CqlSession sourceSession, SparkConf sparkConf) {
+    protected OriginCountJobSession(CqlSession sourceSession, SparkConf sc) {
+        super(sc);
         this.sourceSession = sourceSession;
-        batchSize = new Integer(sparkConf.get("spark.batchSize", "1"));
-        printStatsAfter = new Integer(sparkConf.get("spark.printStatsAfter", "100000"));
+        batchSize = new Integer(sc.get("spark.batchSize", "1"));
+        printStatsAfter = new Integer(sc.get("spark.printStatsAfter", "100000"));
         if (printStatsAfter < 1) {
             printStatsAfter = 100000;
         }
 
-        readLimiter = RateLimiter.create(new Integer(sparkConf.get("spark.readRateLimit", "20000")));
-        sourceKeyspaceTable = sparkConf.get("spark.origin.keyspaceTable");
+        readLimiter = RateLimiter.create(new Integer(sc.get("spark.readRateLimit", "20000")));
+        sourceKeyspaceTable = sc.get("spark.origin.keyspaceTable");
 
-        hasRandomPartitioner = Boolean.parseBoolean(sparkConf.get("spark.origin.hasRandomPartitioner", "false"));
-        isCounterTable = Boolean.parseBoolean(sparkConf.get("spark.counterTable", "false"));
+        hasRandomPartitioner = Boolean.parseBoolean(sc.get("spark.origin.hasRandomPartitioner", "false"));
+        isCounterTable = Boolean.parseBoolean(sc.get("spark.counterTable", "false"));
 
-        checkTableforColSize = Boolean.parseBoolean(sparkConf.get("spark.origin.checkTableforColSize", "false"));
-        checkTableforselectCols = sparkConf.get("spark.origin.checkTableforColSize.cols");
-        checkTableforColSizeTypes = getTypes(sparkConf.get("spark.origin.checkTableforColSize.cols.types"));
-        filterColName = Util.getSparkPropOrEmpty(sparkConf, "spark.origin.FilterColumn");
-        filterColType = Util.getSparkPropOrEmpty(sparkConf, "spark.origin.FilterColumnType");
-        filterColIndex = Integer.parseInt(sparkConf.get("spark.origin.FilterColumnIndex", "0"));
-        fieldGuardraillimitMB = Integer.parseInt(sparkConf.get("spark.fieldGuardraillimitMB", "0"));
+        checkTableforColSize = Boolean.parseBoolean(sc.get("spark.origin.checkTableforColSize", "false"));
+        checkTableforselectCols = sc.get("spark.origin.checkTableforColSize.cols");
+        checkTableforColSizeTypes = getTypes(sc.get("spark.origin.checkTableforColSize.cols.types"));
+        filterColName = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumn");
+        filterColType = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumnType");
+        filterColIndex = Integer.parseInt(sc.get("spark.origin.FilterColumnIndex", "0"));
+        fieldGuardraillimitMB = Integer.parseInt(sc.get("spark.fieldGuardraillimitMB", "0"));
 
-        String partionKey = sparkConf.get("spark.query.cols.partitionKey");
-        idColTypes = getTypes(sparkConf.get("spark.query.cols.id.types"));
+        String partionKey = sc.get("spark.query.cols.partitionKey");
+        idColTypes = getTypes(sc.get("spark.query.cols.id.types"));
 
-        String selectCols = sparkConf.get("spark.query.cols.select");
-        String updateSelectMappingStr = sparkConf.get("spark.counterTable.cql.index", "0");
+        String selectCols = sc.get("spark.query.cols.select");
+        String updateSelectMappingStr = sc.get("spark.counterTable.cql.index", "0");
         for (String updateSelectIndex : updateSelectMappingStr.split(",")) {
             updateSelectMapping.add(Integer.parseInt(updateSelectIndex));
         }
-        String sourceSelectCondition = sparkConf.get("spark.query.cols.select.condition", "");
+        String sourceSelectCondition = sc.get("spark.query.cols.select.condition", "");
         sourceSelectStatement = sourceSession.prepare(
                 "select " + selectCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
                         + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING");
-
     }
 
     public static OriginCountJobSession getInstance(CqlSession sourceSession, SparkConf sparkConf) {
@@ -81,7 +81,10 @@ public void getData(BigInteger min, BigInteger max) {
         for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) {
 
             try {
-                ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ? min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact()));
+                ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ?
+                        min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact())
+                        .setConsistencyLevel(readConsistencyLevel).setPageSize(fetchSizeInRows));
+
                 Collection<CompletionStage<AsyncResultSet>> writeResults = new ArrayList<CompletionStage<AsyncResultSet>>();
 
                 // cannot do batching if the writeFilter is greater than 0 or
@@ -106,7 +109,6 @@ public void getData(BigInteger min, BigInteger max) {
                             }
                         }
                     }
-
                 } else {
                     BatchStatement batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED);
                     for (Row sourceRow : resultSet) {
@@ -143,7 +145,6 @@ public void getData(BigInteger min, BigInteger max) {
                         Thread.currentThread().getId(), min, max, retryCount);
             }
         }
-
     }
 
     private int GetRowColumnLength(Row sourceRow, String filterColType, Integer filterColIndex) {

src/main/java/datastax/astra/migrate/Util.java

Lines changed: 42 additions & 0 deletions
@@ -1,5 +1,7 @@
 package datastax.astra.migrate;
 
+import com.datastax.oss.driver.api.core.ConsistencyLevel;
+import org.apache.commons.lang.StringUtils;
 import org.apache.spark.SparkConf;
 
 import java.io.BufferedReader;
@@ -39,4 +41,44 @@ public static BufferedReader getfileReader(String fileName) {
         }
     }
 
+    public static ConsistencyLevel mapToConsistencyLevel(String level) {
+        ConsistencyLevel retVal = ConsistencyLevel.LOCAL_QUORUM;
+        if (StringUtils.isNotEmpty(level)) {
+            switch (level.toUpperCase()) {
+                case "ANY":
+                    retVal = ConsistencyLevel.ANY;
+                    break;
+                case "ONE":
+                    retVal = ConsistencyLevel.ONE;
+                    break;
+                case "TWO":
+                    retVal = ConsistencyLevel.TWO;
+                    break;
+                case "THREE":
+                    retVal = ConsistencyLevel.THREE;
+                    break;
+                case "QUORUM":
+                    retVal = ConsistencyLevel.QUORUM;
+                    break;
+                case "LOCAL_ONE":
+                    retVal = ConsistencyLevel.LOCAL_ONE;
+                    break;
+                case "EACH_QUORUM":
+                    retVal = ConsistencyLevel.EACH_QUORUM;
+                    break;
+                case "SERIAL":
+                    retVal = ConsistencyLevel.SERIAL;
+                    break;
+                case "LOCAL_SERIAL":
+                    retVal = ConsistencyLevel.LOCAL_SERIAL;
+                    break;
+                case "ALL":
+                    retVal = ConsistencyLevel.ALL;
+                    break;
+            }
+        }
+
+        return retVal;
+    }
+
 }
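
The new helper is defensive: matching is case-insensitive, and an empty or unrecognized value falls back to LOCAL_QUORUM. A quick illustrative check of that behavior (not part of the commit; class name is hypothetical and placed in the same package only so Util resolves):

    package datastax.astra.migrate;

    import com.datastax.oss.driver.api.core.ConsistencyLevel;

    // Illustrative check of the mapping and its fallback behavior.
    public class ConsistencyMappingDemo {
        public static void main(String[] args) {
            ConsistencyLevel fromProp = Util.mapToConsistencyLevel("local_one"); // LOCAL_ONE (case-insensitive)
            ConsistencyLevel empty = Util.mapToConsistencyLevel("");             // LOCAL_QUORUM (default)
            ConsistencyLevel unknown = Util.mapToConsistencyLevel("BOGUS");      // LOCAL_QUORUM (unrecognized)
            System.out.println(fromProp + " " + empty + " " + unknown);
        }
    }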

src/resources/sparkConf.properties

Lines changed: 23 additions & 11 deletions
@@ -18,13 +18,24 @@ spark.batchSize 5
 
 spark.query.origin partition-key,clustering-key,order-date,amount
 spark.query.origin.partitionKey partition-key
-spark.query.target partition-key,clustering-key,order-date,amount
 spark.query.target.id partition-key,clustering-key
 spark.query.types 9,1,4,3
 spark.query.ttl.cols 2,3
 spark.query.writetime.cols 2,3
 
-################################## ENABLE ONLY IF IT IS A COUNTER TABLE #####################################
+##### ENABLE ONLY IF COLUMN NAMES ON TARGET IS DIFFERENT FROM ORIGIN (SCHEMA & DATA-TYPES MUST BE SAME) #####
+#spark.query.target partition-key,clustering-key,order-date,amount
+
+################# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE SOME DATA BASED ON CQL FILTER #################
+#spark.query.condition
+
+################# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE SOME % (NOT 100%) DATA ######################
+#spark.coveragePercent 10
+
+#################### ENABLE ONLY IF WANT LOG STATS MORE OR LESS FREQUENTLY THAN DEFAULT #####################
+#spark.printStatsAfter 100000
+
+################################# ENABLE ONLY IF IT IS A COUNTER TABLE ######################################
 #spark.counterTable false
 #spark.counterTable.cql
 #spark.counterTable.cql.index 0
@@ -34,12 +45,17 @@ spark.query.writetime.cols 2,3
 #spark.origin.minWriteTimeStampFilter 0
 #spark.origin.maxWriteTimeStampFilter 4102444800000000
 
-############################### ONLY CHANGE IF YOU KNOW WHAT YOU ARE DOING ##################################
-#spark.coveragePercent 100
-#spark.printStatsAfter 100000
-#spark.read.consistency.level LOCAL_QUORUM
+######## ENABLE ONLY IF YOU WANT TO USE READ AND/OR WRITE CONSISTENCY OTHER THAN LOCAL_QUORUM ##############
+#spark.consistency.read LOCAL_QUORUM
+#spark.consistency.write LOCAL_QUORUM
+
+############# ENABLE ONLY IF YOU WANT TO REDUCE FETCH-SIZE TO AVOID FrameTooLongException ##################
 #spark.read.fetch.sizeInRows 1000
+
+############### ENABLE ONLY IF YOU WANT TO USE CUSTOM FIXED WRITETIME VALUE ON TARGET ######################
 #spark.target.custom.writeTime 0
+
+#################### ONLY USE if SKIPPING recs greater than 10MB from Origin needed #########################
 #spark.fieldGuardraillimitMB 10
 
 #################### ONLY USE if count of recs greater than 10MB from Origin needed #########################
@@ -98,8 +114,4 @@ spark.query.writetime.cols 2,3
 # Note: The tool migrates TTL & Writetimes at row-level and not field-level.
 # Migration will use the largest TTL & Writetimes value per row.
 #
-# "spark.target.custom.writeTime" - User specified writetime. When set, this static value will be used as target writetime.
-#
-# Default value for "spark.origin.maxWriteTimeStampFilter" is "9223372036854775807" (max long value)
-#
-#############################################################################################################
+#############################################################################################################
