
Commit a6f83cf

Merge branch 'main' into feature/logging-fix
* main:
  Update README removing spark properties not in use
  initial commit
2 parents d069490 + 1c9ed15 commit a6f83cf

File tree

- README.md
- src/main/java/datastax/astra/migrate/AbstractJobSession.java
- src/main/java/datastax/astra/migrate/BaseJobSession.java
- src/main/scala/datastax/astra/migrate/AbstractJob.scala
- src/main/scala/datastax/astra/migrate/BaseJob.scala

5 files changed: +117 -91 lines changed

README.md

Lines changed: 6 additions & 6 deletions
@@ -2,7 +2,7 @@
 
 Spark jobs in this repo can be used for data migration and data validation.
 
-> Please note: This job has been tested with spark version [2.4.8](https://downloads.apache.org/spark/spark-2.4.8/)
+> :warning: Please note this job has been tested with spark version [2.4.8](https://downloads.apache.org/spark/spark-2.4.8/)
 
 ## Prerequisite
 
@@ -16,7 +16,7 @@ tar -xvzf <spark downloaded file name>
 
 # Steps:
 
-1. sparkConf.properties file needs to be configured as applicable for the environment
+1. `sparkConf.properties` file needs to be configured as applicable for the environment
 > A sample Spark conf file configuration can be [found here](./src/resources/sparkConf.properties)
 2. Place the conf file where it can be accessed while running the job via spark-submit.
 3. Generate a fat jar (`cassandra-data-migrator-1.x.jar`) using command `mvn clean package`
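Step 1 above refers to the sparkConf.properties file. As a rough sketch only (all values are placeholders; the keys are taken from the properties read in BaseJob.scala and AbstractJobSession.java below, and the linked sample file remains the authoritative reference), a minimal configuration might look like:

```properties
# Minimal sketch -- every value below is a placeholder for your environment.
spark.source.isAstra                    false
spark.source.host                       10.0.0.1
spark.source.username                   source_user
spark.source.password                   source_pass
spark.source.read.consistency.level    LOCAL_QUORUM

spark.destination.isAstra               true
spark.destination.scb                   /path/to/secure-connect-target.zip
spark.destination.username              astra_client_id
spark.destination.password              astra_client_secret

spark.splitSize                         10000
spark.coveragePercent                   100
```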
@@ -63,10 +63,10 @@ spark.destination.autocorrect.mismatch true|false
 ```
 
 # Additional features
-- Counter tables
-- Preserve writetimes and TTL
-- Advanced DataTypes (Sets, Lists, Maps, UDTs)
+- [Counter tables](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_using/useCountersConcept.html)
+- Preserve [writetimes](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__retrieving-the-datetime-a-write-occurred-p) and [TTL](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__ref-select-ttl-p)
+- Advanced DataTypes ([Sets](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__set), [Lists](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__list), [Maps](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__map), [UDTs](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__udt))
 - Filter records from source using writetime
 - SSL Support (including custom cipher algorithms)
-- Migrate from any Cassandra source (Cassandra/DSE/Astra) to any Cassandra target (Cassandra/DSE/Astra)
+- Migrate from any Cassandra source ([Apache Cassandra](https://cassandra.apache.org)/[DataStax Enterprise (DSE)](https://www.datastax.com/products/datastax-enterprise)/[DataStax Astra DB](https://www.datastax.com/products/datastax-astra)) to any Cassandra target ([Apache Cassandra](https://cassandra.apache.org)/[DataStax Enterprise (DSE)](https://www.datastax.com/products/datastax-enterprise)/[DataStax Astra DB](https://www.datastax.com/products/datastax-astra))
 - Validate migration accuracy and performance using a smaller randomized data-set
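The "Preserve writetimes and TTL" feature listed above comes down to standard CQL: read writetime()/ttl() for a non-key column on the source and replay the row on the target with USING TIMESTAMP ... AND TTL .... A minimal Java-driver sketch of that idea (the demo_ks.demo_tbl schema and its columns are made-up names, not taken from this repo):

```java
import com.datastax.oss.driver.api.core.CqlSession;
import com.datastax.oss.driver.api.core.cql.PreparedStatement;
import com.datastax.oss.driver.api.core.cql.Row;
import com.datastax.oss.driver.api.core.cql.SimpleStatement;

public class PreserveWritetimeTtlSketch {

    // Copy one row from source to target, keeping its original write timestamp and TTL.
    public static void copyRow(CqlSession source, CqlSession target, String id) {
        Row row = source.execute(SimpleStatement.newInstance(
                "SELECT id, val, writetime(val) AS wt, ttl(val) AS t FROM demo_ks.demo_tbl WHERE id = ?",
                id)).one();
        if (row == null) {
            return;
        }

        PreparedStatement insert = target.prepare(
                "INSERT INTO demo_ks.demo_tbl (id, val) VALUES (?, ?) USING TIMESTAMP ? AND TTL ?");

        long writetimeMicros = row.getLong("wt");               // microseconds since epoch
        int ttlSeconds = row.isNull("t") ? 0 : row.getInt("t"); // TTL 0 means "never expire"
        target.execute(insert.bind(id, row.getString("val"), writetimeMicros, ttlSeconds));
    }
}
```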

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 4 additions & 40 deletions
@@ -14,46 +14,10 @@
 import java.util.Map;
 import java.util.Set;
 
-public abstract class AbstractJobSession {
-
-    // Read/Write Rate limiter
-    // Determine the total throughput for the entire cluster in terms of wries/sec,
-    // reads/sec
-    // then do the following to set the values as they are only applicable per JVM
-    // (hence spark Executor)...
-    // Rate = Total Throughput (write/read per sec) / Total Executors
-    protected final RateLimiter readLimiter;
-    protected final RateLimiter writeLimiter;
-    public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
-    protected PreparedStatement sourceSelectStatement;
-    protected String sourceSelectCondition;
-    protected PreparedStatement astraSelectStatement;
-    protected PreparedStatement astraInsertStatement;
-    protected Integer maxRetries = 10;
-
-    protected CqlSession sourceSession;
-    protected CqlSession astraSession;
-    protected List<MigrateDataType> selectColTypes = new ArrayList<MigrateDataType>();
-    protected List<MigrateDataType> idColTypes = new ArrayList<MigrateDataType>();
-    protected List<Integer> updateSelectMapping = new ArrayList<Integer>();
-
-    protected Integer batchSize = 1;
-    protected Integer printStatsAfter = 100000;
-
-    protected Boolean isPreserveTTLWritetime = Boolean.FALSE;
-    protected Boolean writeTimeStampFilter = Boolean.FALSE;
-    protected Long minWriteTimeStampFilter = 0l;
-    protected Long maxWriteTimeStampFilter = Long.MAX_VALUE;
-
-    protected List<Integer> writeTimeStampCols = new ArrayList<Integer>();
-    protected List<Integer> ttlCols = new ArrayList<Integer>();
-    protected Boolean isCounterTable;
-
-    protected String sourceKeyspaceTable;
-    protected String astraKeyspaceTable;
-
-    protected Boolean hasRandomPartitioner;
+public class AbstractJobSession extends BaseJobSession {
 
+    public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
+
     protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
         this.sourceSession = sourceSession;
         this.astraSession = astraSession;
@@ -121,7 +85,7 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
         selectColTypes = getTypes(sparkConf.get("spark.query.types"));
         String idCols = sparkConf.get("spark.query.destination.id", "");
         idColTypes = selectColTypes.subList(0, idCols.split(",").length);
-        sourceSelectCondition = sparkConf.get("spark.query.condition", "");
+        String sourceSelectCondition = sparkConf.get("spark.query.condition", "");
         sourceSelectStatement = sourceSession.prepare(
                 "select " + selectCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
                         + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING");

src/main/java/datastax/astra/migrate/BaseJobSession.java

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+package datastax.astra.migrate;
+
+import com.datastax.oss.driver.api.core.CqlSession;
+import com.datastax.oss.driver.api.core.cql.PreparedStatement;
+import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public abstract class BaseJobSession {
+
+    protected PreparedStatement sourceSelectStatement;
+    protected PreparedStatement astraSelectStatement;
+    protected PreparedStatement astraInsertStatement;
+
+    // Read/Write Rate limiter
+    // Determine the total throughput for the entire cluster in terms of wries/sec,
+    // reads/sec
+    // then do the following to set the values as they are only applicable per JVM
+    // (hence spark Executor)...
+    // Rate = Total Throughput (write/read per sec) / Total Executors
+    protected RateLimiter readLimiter;
+    protected RateLimiter writeLimiter;
+    protected Integer maxRetries = 10;
+
+    protected CqlSession sourceSession;
+    protected CqlSession astraSession;
+    protected List<MigrateDataType> selectColTypes = new ArrayList<MigrateDataType>();
+    protected List<MigrateDataType> idColTypes = new ArrayList<MigrateDataType>();
+    protected List<Integer> updateSelectMapping = new ArrayList<Integer>();
+
+    protected Integer batchSize = 1;
+    protected Integer printStatsAfter = 100000;
+
+    protected Boolean isPreserveTTLWritetime = Boolean.FALSE;
+    protected Boolean writeTimeStampFilter = Boolean.FALSE;
+    protected Long minWriteTimeStampFilter = 0l;
+    protected Long maxWriteTimeStampFilter = Long.MAX_VALUE;
+
+    protected List<Integer> writeTimeStampCols = new ArrayList<Integer>();
+    protected List<Integer> ttlCols = new ArrayList<Integer>();
+    protected Boolean isCounterTable;
+
+    protected String sourceKeyspaceTable;
+    protected String astraKeyspaceTable;
+
+    protected Boolean hasRandomPartitioner;
+
+}
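The comment block in BaseJobSession spells out the sizing rule: Rate = total cluster throughput / total executors, because each RateLimiter only throttles the JVM it lives in (one per Spark executor). A small sketch of that arithmetic with made-up numbers (the real limits come from the job's Spark properties, which are not part of this diff):

```java
import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter;

public class RateLimiterSketch {

    public static void main(String[] args) {
        // Hypothetical numbers: the cluster can absorb 40,000 writes/sec and the
        // job runs on 8 Spark executors (one JVM each).
        double clusterWritesPerSec = 40_000;
        int numExecutors = 8;

        // Rate = Total Throughput (write/read per sec) / Total Executors,
        // since each RateLimiter only limits its own JVM.
        double perExecutorRate = clusterWritesPerSec / numExecutors; // 5,000 permits/sec

        RateLimiter writeLimiter = RateLimiter.create(perExecutorRate);
        writeLimiter.acquire(); // blocks as needed to stay under ~5,000 writes/sec in this JVM
        System.out.println("Per-executor write rate: " + writeLimiter.getRate());
    }
}
```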

src/main/scala/datastax/astra/migrate/AbstractJob.scala

Lines changed: 1 addition & 45 deletions
@@ -7,46 +7,7 @@ import org.slf4j.LoggerFactory
 import java.math.BigInteger
 import java.lang.Long
 
-class AbstractJob extends App {
-
-  val abstractLogger = LoggerFactory.getLogger(this.getClass.getName)
-  val spark = SparkSession.builder
-    .appName("Datastax Data Validation")
-    .getOrCreate()
-
-  val sc = spark.sparkContext
-
-  val sourceIsAstra = sc.getConf.get("spark.source.isAstra", "false")
-  val sourceScbPath = sc.getConf.get("spark.source.scb", "")
-  val sourceHost = sc.getConf.get("spark.source.host", "")
-  val sourceUsername = sc.getConf.get("spark.source.username", "")
-  val sourcePassword = sc.getConf.get("spark.source.password", "")
-  val sourceReadConsistencyLevel = sc.getConf.get("spark.source.read.consistency.level", "LOCAL_QUORUM")
-  val sourceTrustStorePath = sc.getConf.get("spark.source.trustStore.path", "")
-  val sourceTrustStorePassword = sc.getConf.get("spark.source.trustStore.password", "")
-  val sourceTrustStoreType = sc.getConf.get("spark.source.trustStore.type", "JKS")
-  val sourceKeyStorePath = sc.getConf.get("spark.source.keyStore.path", "")
-  val sourceKeyStorePassword = sc.getConf.get("spark.source.keyStore.password", "")
-  val sourceEnabledAlgorithms = sc.getConf.get("spark.source.enabledAlgorithms", "")
-
-  val destinationIsAstra = sc.getConf.get("spark.destination.isAstra", "true")
-  val destinationScbPath = sc.getConf.get("spark.destination.scb", "")
-  val destinationHost = sc.getConf.get("spark.destination.host", "")
-  val destinationUsername = sc.getConf.get("spark.destination.username")
-  val destinationPassword = sc.getConf.get("spark.destination.password")
-  val destinationReadConsistencyLevel = sc.getConf.get("spark.destination.read.consistency.level", "LOCAL_QUORUM")
-  val destinationTrustStorePath = sc.getConf.get("spark.destination.trustStore.path", "")
-  val destinationTrustStorePassword = sc.getConf.get("spark.destination.trustStore.password", "")
-  val destinationTrustStoreType = sc.getConf.get("spark.destination.trustStore.type", "JKS")
-  val destinationKeyStorePath = sc.getConf.get("spark.destination.keyStore.path", "")
-  val destinationKeyStorePassword = sc.getConf.get("spark.destination.keyStore.password", "")
-  val destinationEnabledAlgorithms = sc.getConf.get("spark.destination.enabledAlgorithms", "")
-
-  val minPartition = new BigInteger(sc.getConf.get("spark.source.minPartition","-9223372036854775808"))
-  val maxPartition = new BigInteger(sc.getConf.get("spark.source.maxPartition","9223372036854775807"))
-  val coveragePercent = sc.getConf.get("spark.coveragePercent", "100")
-  val splitSize = sc.getConf.get("spark.splitSize", "10000")
-  val partitions = SplitPartitions.getRandomSubPartitions(BigInteger.valueOf(Long.parseLong(splitSize)), minPartition, maxPartition,Integer.parseInt(coveragePercent))
+class AbstractJob extends BaseJob {
 
   abstractLogger.info("PARAM -- Min Partition: " + minPartition)
   abstractLogger.info("PARAM -- Max Partition: " + maxPartition)
@@ -60,11 +21,6 @@ class AbstractJob extends App {
   var destinationConnection = getConnection(false, destinationIsAstra, destinationScbPath, destinationHost, destinationUsername, destinationPassword, destinationReadConsistencyLevel,
     destinationTrustStorePath, destinationTrustStorePassword, destinationTrustStoreType, destinationKeyStorePath, destinationKeyStorePassword, destinationEnabledAlgorithms);
 
-  protected def exitSpark() = {
-    spark.stop()
-    sys.exit(0)
-  }
-
   private def getConnection(isSource: Boolean, isAstra: String, scbPath: String, host: String, username: String, password: String, readConsistencyLevel: String,
                             trustStorePath: String, trustStorePassword: String, trustStoreType: String,
                             keyStorePath: String, keyStorePassword: String, enabledAlgorithms: String): CassandraConnector = {

src/main/scala/datastax/astra/migrate/BaseJob.scala

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+package datastax.astra.migrate
+
+import org.apache.spark.sql.SparkSession
+import org.slf4j.LoggerFactory
+
+import java.math.BigInteger
+import java.lang.Long
+
+class BaseJob extends App {
+
+  val abstractLogger = LoggerFactory.getLogger(this.getClass.getName)
+  val spark = SparkSession.builder
+    .appName("Datastax Data Validation")
+    .getOrCreate()
+
+  val sc = spark.sparkContext
+
+  val sourceIsAstra = sc.getConf.get("spark.source.isAstra", "false")
+  val sourceScbPath = sc.getConf.get("spark.source.scb", "")
+  val sourceHost = sc.getConf.get("spark.source.host", "")
+  val sourceUsername = sc.getConf.get("spark.source.username", "")
+  val sourcePassword = sc.getConf.get("spark.source.password", "")
+  val sourceReadConsistencyLevel = sc.getConf.get("spark.source.read.consistency.level", "LOCAL_QUORUM")
+  val sourceTrustStorePath = sc.getConf.get("spark.source.trustStore.path", "")
+  val sourceTrustStorePassword = sc.getConf.get("spark.source.trustStore.password", "")
+  val sourceTrustStoreType = sc.getConf.get("spark.source.trustStore.type", "JKS")
+  val sourceKeyStorePath = sc.getConf.get("spark.source.keyStore.path", "")
+  val sourceKeyStorePassword = sc.getConf.get("spark.source.keyStore.password", "")
+  val sourceEnabledAlgorithms = sc.getConf.get("spark.source.enabledAlgorithms", "")
+
+  val destinationIsAstra = sc.getConf.get("spark.destination.isAstra", "true")
+  val destinationScbPath = sc.getConf.get("spark.destination.scb", "")
+  val destinationHost = sc.getConf.get("spark.destination.host", "")
+  val destinationUsername = sc.getConf.get("spark.destination.username")
+  val destinationPassword = sc.getConf.get("spark.destination.password")
+  val destinationReadConsistencyLevel = sc.getConf.get("spark.destination.read.consistency.level", "LOCAL_QUORUM")
+  val destinationTrustStorePath = sc.getConf.get("spark.destination.trustStore.path", "")
+  val destinationTrustStorePassword = sc.getConf.get("spark.destination.trustStore.password", "")
+  val destinationTrustStoreType = sc.getConf.get("spark.destination.trustStore.type", "JKS")
+  val destinationKeyStorePath = sc.getConf.get("spark.destination.keyStore.path", "")
+  val destinationKeyStorePassword = sc.getConf.get("spark.destination.keyStore.password", "")
+  val destinationEnabledAlgorithms = sc.getConf.get("spark.destination.enabledAlgorithms", "")
+
+  val minPartition = new BigInteger(sc.getConf.get("spark.source.minPartition","-9223372036854775808"))
+  val maxPartition = new BigInteger(sc.getConf.get("spark.source.maxPartition","9223372036854775807"))
+  val coveragePercent = sc.getConf.get("spark.coveragePercent", "100")
+  val splitSize = sc.getConf.get("spark.splitSize", "10000")
+  val partitions = SplitPartitions.getRandomSubPartitions(BigInteger.valueOf(Long.parseLong(splitSize)), minPartition, maxPartition,Integer.parseInt(coveragePercent))
+
+  protected def exitSpark() = {
+    spark.stop()
+    sys.exit(0)
+  }
+
+}
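BaseJob hands spark.splitSize, the min/max partition bounds, and spark.coveragePercent to SplitPartitions.getRandomSubPartitions, which produces the randomized subset of token sub-ranges behind the "smaller randomized data-set" validation mentioned in the README. The actual SplitPartitions code is not part of this diff; the sketch below only illustrates the general idea under that assumption and is not the project's implementation:

```java
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class SplitPartitionsSketch {

    // Cut [min, max] into roughly `splitSize` contiguous slices, then keep a random coveragePercent% of them.
    static List<BigInteger[]> randomSubPartitions(long splitSize, BigInteger min,
                                                  BigInteger max, int coveragePercent) {
        BigInteger step = max.subtract(min).divide(BigInteger.valueOf(splitSize)).add(BigInteger.ONE);
        List<BigInteger[]> slices = new ArrayList<>();
        for (BigInteger start = min; start.compareTo(max) <= 0; start = start.add(step)) {
            BigInteger end = start.add(step).subtract(BigInteger.ONE).min(max);
            slices.add(new BigInteger[]{start, end});
        }
        Collections.shuffle(slices);
        int keep = Math.max(1, slices.size() * coveragePercent / 100);
        return slices.subList(0, keep);
    }

    public static void main(String[] args) {
        List<BigInteger[]> parts = randomSubPartitions(
                10000L,
                new BigInteger("-9223372036854775808"),  // default spark.source.minPartition
                new BigInteger("9223372036854775807"),   // default spark.source.maxPartition
                10);                                     // 10% coverage => sampled validation run
        System.out.println(parts.size() + " token sub-ranges selected");
    }
}
```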
