
Commit 68c1587

Merge branch 'main' into feature/spark313_upgrade
2 parents 4372025 + 7a524a7 commit 68c1587

14 files changed: +147 -69 lines changed

.github/workflows/docker-publish.yml

Lines changed: 5 additions & 5 deletions
@@ -11,8 +11,8 @@ on:
       - 'Dockerfile'
       - 'LICENSE.md'
     tags:
-      - 'v*.*.*'
-      - 'v*.*.*-*'
+      - '*.*.*'
+      - '*.*.*-*'
 
 jobs:
   build_and_publish:
@@ -36,15 +36,15 @@ jobs:
           type=semver,pattern={{major}}.{{minor}}.x
           type=semver,pattern={{major}}.x
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
+        uses: docker/setup-buildx-action@v2
       - name: Login to DockerHub
-        uses: docker/login-action@v1
+        uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKER_HUB_USERNAME }}
           password: ${{ secrets.DOCKER_HUB_PASSWORD }}
       - name: Build and push
         id: docker_build
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v3
         with:
           file: Dockerfile
           context: .

.github/workflows/docker-push-sha-commit.yml

Lines changed: 3 additions & 3 deletions
@@ -22,15 +22,15 @@ jobs:
         tags: |
           type=sha
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
+        uses: docker/setup-buildx-action@v2
       - name: Login to DockerHub
-        uses: docker/login-action@v1
+        uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKER_HUB_USERNAME }}
           password: ${{ secrets.DOCKER_HUB_PASSWORD }}
       - name: Build and push
         id: docker_build
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v3
         with:
           file: Dockerfile
           context: .

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ RUN mkdir -p /assets/ && cd /assets && \
     tar -xzf ./spark-2.4.8-bin-hadoop2.7.tgz && \
     rm ./spark-2.4.8-bin-hadoop2.7.tgz
 
-RUN apt-get update && apt-get install -y openssh-server vim --no-install-recommends && \
+RUN apt-get update && apt-get install -y openssh-server vim python3 --no-install-recommends && \
     rm -rf /var/lib/apt/lists/* && \
     service ssh start
 

README.md

Lines changed: 19 additions & 12 deletions
@@ -4,22 +4,28 @@ Migrate and Validate Tables between Origin and Target Cassandra Clusters.
 
 > :warning: Please note this job has been tested with spark version [2.4.8](https://archive.apache.org/dist/spark/spark-2.4.8/)
 
-## Build
-1. Clone this repo
-2. Move to the repo folder `cd cassandra-data-migrator`
-3. Run the build `mvn clean package`
-4. The fat jar (`cassandra-data-migrator-2.x.jar`) file should now be present in the `target` folder
+## Container Image
+- Get the latest image that includes all dependencies from [DockerHub](https://hub.docker.com/r/datastax/cassandra-data-migrator)
+- If you use this route, all migration tools (`cassandra-data-migrator` + `dsbulk` + `cqlsh`) would be available in the `/assets/` folder of the container
+- OR follow the below build steps (and Prerequisite) to build the jar locally
 
-## Prerequisite
+### Prerequisite
 
-Install Java8 as spark binaries are compiled with it.
-Install single instance of spark on a node where you want to run this job. Spark can be installed by running the following: -
+- Install Java8 as spark binaries are compiled with it.
+- Install Maven 3.8.x
+- Install single instance of spark on a node where you want to run this job. Spark can be installed by running the following: -
 
 ```
 wget https://downloads.apache.org/spark/spark-2.4.8/
 tar -xvzf <spark downloaded file name>
 ```
 
+### Build
+1. Clone this repo
+2. Move to the repo folder `cd cassandra-data-migrator`
+3. Run the build `mvn clean package`
+4. The fat jar (`cassandra-data-migrator-2.x.x.jar`) file should now be present in the `target` folder
+
 # Steps for Data-Migration:
 
 1. `sparkConf.properties` file needs to be configured as applicable for the environment
@@ -30,7 +36,7 @@ tar -xvzf <spark downloaded file name>
 ```
 ./spark-submit --properties-file sparkConf.properties /
 --master "local[*]" /
---class datastax.astra.migrate.Migrate cassandra-data-migrator-2.x.jar &> logfile_name.txt
+--class datastax.astra.migrate.Migrate cassandra-data-migrator-2.x.x.jar &> logfile_name.txt
 ```
 
 Note: Above command also generates a log file `logfile_name.txt` to avoid log output on the console.
@@ -43,7 +49,7 @@ Note: Above command also generates a log file `logfile_name.txt` to avoid log ou
 ```
 ./spark-submit --properties-file sparkConf.properties /
 --master "local[*]" /
---class datastax.astra.migrate.DiffData cassandra-data-migrator-2.x.jar &> logfile_name.txt
+--class datastax.astra.migrate.DiffData cassandra-data-migrator-2.x.x.jar &> logfile_name.txt
 ```
 
 - Validation job will report differences as “ERRORS” in the log file as shown below
@@ -72,7 +78,7 @@ spark.target.autocorrect.mismatch true|false
 ```
 ./spark-submit --properties-file sparkConf.properties /
 --master "local[*]" /
---class datastax.astra.migrate.MigratePartitionsFromFile cassandra-data-migrator-2.x.jar &> logfile_name.txt
+--class datastax.astra.migrate.MigratePartitionsFromFile cassandra-data-migrator-2.x.x.jar &> logfile_name.txt
 ```
 
 When running in above mode the tool assumes a `partitions.csv` file to be present in the current folder in the below format, where each line (`min,max`) represents a partition-range
@@ -88,7 +94,8 @@ This mode is specifically useful to processes a subset of partition-ranges that
 - [Counter tables](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_using/useCountersConcept.html)
 - Preserve [writetimes](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__retrieving-the-datetime-a-write-occurred-p) and [TTL](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__ref-select-ttl-p)
 - Advanced DataTypes ([Sets](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__set), [Lists](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__list), [Maps](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__map), [UDTs](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__udt))
-- Filter records from origin using writetime
+- Filter records from origin using writetimes, CQL conditions, token-ranges
+- Fully containerized (Docker and K8s friendly)
 - SSL Support (including custom cipher algorithms)
 - Migrate from any Cassandra origin ([Apache Cassandra](https://cassandra.apache.org) / [DataStax Enterprise](https://www.datastax.com/products/datastax-enterprise) / [DataStax Astra DB](https://www.datastax.com/products/datastax-astra)) to any Cassandra target ([Apache Cassandra](https://cassandra.apache.org) / [DataStax Enterprise](https://www.datastax.com/products/datastax-enterprise) / [DataStax Astra DB](https://www.datastax.com/products/datastax-astra))
 - Validate migration accuracy and performance using a smaller randomized data-set
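The README hunk above references the `partitions.csv` input consumed by `MigratePartitionsFromFile`. A minimal sketch of a reader for that file, assuming only the `min,max`-per-line format the README states (class name and printing are illustrative, not part of this commit):

```java
import java.io.IOException;
import java.math.BigInteger;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

public class PartitionsFileDemo {
    public static void main(String[] args) throws IOException {
        // Each line holds "min,max" token bounds, per the README; the file
        // name matches the one the tool expects in the current folder.
        List<String> lines = Files.readAllLines(Paths.get("partitions.csv"));
        for (String line : lines) {
            String[] parts = line.split(",");
            BigInteger min = new BigInteger(parts[0].trim());
            BigInteger max = new BigInteger(parts[1].trim());
            System.out.println("partition-range [" + min + ", " + max + "]");
        }
    }
}
```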

pom.xml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 
     <groupId>datastax.astra.migrate</groupId>
     <artifactId>cassandra-data-migrator</artifactId>
-    <version>2.10</version>
+    <version>2.11.0</version>
     <packaging>jar</packaging>
 
     <properties>

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 8 additions & 2 deletions
@@ -24,6 +24,7 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
     }
 
     protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc, boolean isJobMigrateRowsFromFile) {
+        super(sc);
         this.sourceSession = sourceSession;
         this.astraSession = astraSession;
 
@@ -79,6 +80,8 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
             customWritetime = Long.parseLong(customWriteTimeStr);
         }
 
+        logger.info("PARAM -- Read Consistency: {}", readConsistencyLevel);
+        logger.info("PARAM -- Write Consistency: {}", writeConsistencyLevel);
         logger.info("PARAM -- Write Batch Size: {}", batchSize);
         logger.info("PARAM -- Read Fetch Size: {}", fetchSizeInRows);
         logger.info("PARAM -- Source Keyspace Table: {}", sourceKeyspaceTable);
@@ -98,6 +101,9 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
         String selectCols = Util.getSparkProp(sc, "spark.query.origin");
         String partionKey = Util.getSparkProp(sc, "spark.query.origin.partitionKey");
         String sourceSelectCondition = Util.getSparkPropOrEmpty(sc, "spark.query.condition");
+        if (!sourceSelectCondition.isEmpty() && !sourceSelectCondition.trim().toUpperCase().startsWith("AND")) {
+            sourceSelectCondition = " AND " + sourceSelectCondition;
+        }
 
         final StringBuilder selectTTLWriteTimeCols = new StringBuilder();
         String[] allCols = selectCols.split(",");
@@ -172,7 +178,7 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
     }
 
     public BoundStatement bindInsert(PreparedStatement insertStatement, Row sourceRow, Row astraRow) {
-        BoundStatement boundInsertStatement = insertStatement.bind();
+        BoundStatement boundInsertStatement = insertStatement.bind().setConsistencyLevel(writeConsistencyLevel);
 
         if (isCounterTable) {
             for (int index = 0; index < selectColTypes.size(); index++) {
@@ -232,7 +238,7 @@ public long getLargestWriteTimeStamp(Row sourceRow) {
     }
 
     public BoundStatement selectFromAstra(PreparedStatement selectStatement, Row sourceRow) {
-        BoundStatement boundSelectStatement = selectStatement.bind();
+        BoundStatement boundSelectStatement = selectStatement.bind().setConsistencyLevel(readConsistencyLevel);
         for (int index = 0; index < idColTypes.size(); index++) {
             MigrateDataType dataType = idColTypes.get(index);
             boundSelectStatement = boundSelectStatement.set(index, getData(dataType, index, sourceRow),
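The new `spark.query.condition` handling above quietly normalizes user input before it is appended to the generated SELECT. A standalone sketch of that normalization, with hypothetical sample conditions:

```java
public class ConditionPrefixDemo {
    // Mirrors the check added in AbstractJobSession: a non-empty condition
    // that does not already start with "AND" gets one prepended, so it can
    // be appended verbatim to the WHERE clause of the generated query.
    static String normalize(String sourceSelectCondition) {
        if (!sourceSelectCondition.isEmpty()
                && !sourceSelectCondition.trim().toUpperCase().startsWith("AND")) {
            sourceSelectCondition = " AND " + sourceSelectCondition;
        }
        return sourceSelectCondition;
    }

    public static void main(String[] args) {
        // Sample inputs are illustrative only.
        System.out.println(normalize("writetime(col1) > 0")); // " AND writetime(col1) > 0"
        System.out.println(normalize("AND col1 = 'x'"));      // unchanged
        System.out.println(normalize(""));                    // unchanged (empty)
    }
}
```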

src/main/java/datastax/astra/migrate/BaseJobSession.java

Lines changed: 9 additions & 0 deletions
@@ -1,9 +1,11 @@
 package datastax.astra.migrate;
 
+import com.datastax.oss.driver.api.core.ConsistencyLevel;
 import com.datastax.oss.driver.api.core.CqlSession;
 import com.datastax.oss.driver.api.core.cql.PreparedStatement;
 import com.datastax.oss.driver.api.core.cql.Row;
 import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter;
+import org.apache.spark.SparkConf;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -15,6 +17,8 @@ public abstract class BaseJobSession {
     protected PreparedStatement sourceSelectStatement;
     protected PreparedStatement astraSelectStatement;
     protected PreparedStatement astraInsertStatement;
+    protected ConsistencyLevel readConsistencyLevel;
+    protected ConsistencyLevel writeConsistencyLevel;
 
     // Read/Write Rate limiter
     // Determine the total throughput for the entire cluster in terms of wries/sec,
@@ -55,6 +59,11 @@ public abstract class BaseJobSession {
     protected Integer filterColIndex;
     protected String filterColValue;
 
+    protected BaseJobSession(SparkConf sc) {
+        readConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.read"));
+        writeConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.write"));
+    }
+
     public String getKey(Row sourceRow) {
         StringBuffer key = new StringBuffer();
         for (int index = 0; index < idColTypes.size(); index++) {
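The new constructor relies on `Util.mapToConsistencyLevel`, which this commit does not show. A plausible sketch, assuming the mapper accepts a driver level name such as `LOCAL_ONE` and falls back to `LOCAL_QUORUM` (the level previously hard-coded in DiffJobSession) when the property is empty; both the default and the parsing strategy are assumptions, not confirmed by the diff:

```java
import com.datastax.oss.driver.api.core.ConsistencyLevel;
import com.datastax.oss.driver.api.core.DefaultConsistencyLevel;

public class UtilSketch {
    // Hypothetical mapper: not part of this commit's diff. Maps a property
    // value like "LOCAL_ONE" to a driver ConsistencyLevel, defaulting to
    // LOCAL_QUORUM when the property is blank.
    public static ConsistencyLevel mapToConsistencyLevel(String level) {
        if (level == null || level.trim().isEmpty()) {
            return ConsistencyLevel.LOCAL_QUORUM;
        }
        return DefaultConsistencyLevel.valueOf(level.trim().toUpperCase());
    }
}
```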

src/main/java/datastax/astra/migrate/CopyJobSession.java

Lines changed: 4 additions & 2 deletions
@@ -47,8 +47,10 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
         for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) {
 
             try {
-                ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ? min : min.longValueExact(),
-                        hasRandomPartitioner ? max : max.longValueExact()).setPageSize(fetchSizeInRows));
+                ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ?
+                        min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact())
+                        .setConsistencyLevel(readConsistencyLevel).setPageSize(fetchSizeInRows));
+
                 Collection<CompletionStage<AsyncResultSet>> writeResults = new ArrayList<CompletionStage<AsyncResultSet>>();
 
                 // cannot do batching if the writeFilter is greater than 0 or
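In driver 4.x, statements are immutable: `setConsistencyLevel` and `setPageSize` each return a new `BoundStatement` rather than mutating in place, which is why the diff chains them onto `bind(...)` before `execute`. A minimal sketch of the pattern (method and parameter names here are placeholders, not names from this repository):

```java
import com.datastax.oss.driver.api.core.ConsistencyLevel;
import com.datastax.oss.driver.api.core.CqlSession;
import com.datastax.oss.driver.api.core.cql.BoundStatement;
import com.datastax.oss.driver.api.core.cql.PreparedStatement;
import com.datastax.oss.driver.api.core.cql.ResultSet;

public class ChainingDemo {
    // Illustrative range read; any prepared two-argument query would do.
    static ResultSet rangeRead(CqlSession session, PreparedStatement ps,
                               long min, long max, int fetchSize,
                               ConsistencyLevel cl) {
        // Each setter returns a NEW BoundStatement; dropping the return
        // value would silently keep the driver defaults.
        BoundStatement bs = ps.bind(min, max)
                .setConsistencyLevel(cl)
                .setPageSize(fetchSize);
        return session.execute(bs);
    }
}
```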

src/main/java/datastax/astra/migrate/CopyPKJobSession.java

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ public void getRowAndInsert(List<SplitPartitions.PKRows> rowsList) {
                 readCounter.incrementAndGet();
                 String[] pkFields = row.split(" %% ");
                 int idx = 0;
-                BoundStatement bspk = sourceSelectStatement.bind();
+                BoundStatement bspk = sourceSelectStatement.bind().setConsistencyLevel(readConsistencyLevel);
                 for (MigrateDataType tp : idColTypes) {
                     bspk = bspk.set(idx, convert(tp.typeClass, pkFields[idx]), tp.typeClass);
                     idx++;

src/main/java/datastax/astra/migrate/DiffJobSession.java

Lines changed: 3 additions & 4 deletions
@@ -1,6 +1,5 @@
 package datastax.astra.migrate;
 
-import com.datastax.oss.driver.api.core.ConsistencyLevel;
 import com.datastax.oss.driver.api.core.CqlSession;
 import com.datastax.oss.driver.api.core.cql.AsyncResultSet;
 import com.datastax.oss.driver.api.core.cql.ResultSet;
@@ -61,9 +60,9 @@ public void getDataAndDiff(BigInteger min, BigInteger max) {
 
             try {
                 // cannot do batching if the writeFilter is greater than 0
-                ResultSet resultSet = sourceSession.execute(
-                        sourceSelectStatement.bind(hasRandomPartitioner ? min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact())
-                                .setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM).setPageSize(fetchSizeInRows));
+                ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ?
+                        min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact())
+                        .setConsistencyLevel(readConsistencyLevel).setPageSize(fetchSizeInRows));
 
                 Map<Row, CompletionStage<AsyncResultSet>> srcToTargetRowMap = new HashMap<Row, CompletionStage<AsyncResultSet>>();
                 StreamSupport.stream(resultSet.spliterator(), false).forEach(srcRow -> {
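Together with the BaseJobSession change, this drops the hard-coded `LOCAL_QUORUM` in favor of the two new properties. A minimal sketch of supplying them programmatically, assuming they can be set like any other entry in `sparkConf.properties` (the values shown are illustrative; the property names come from the BaseJobSession diff above):

```java
import org.apache.spark.SparkConf;

public class ConsistencyConfigDemo {
    public static void main(String[] args) {
        // Property names added in this commit; values are examples only.
        SparkConf sc = new SparkConf()
                .set("spark.consistency.read", "LOCAL_ONE")
                .set("spark.consistency.write", "LOCAL_QUORUM");
        System.out.println(sc.get("spark.consistency.read"));
    }
}
```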
