
Commit d3b19ae

Implemented flex-schema: destination schema column names can now differ from the source.
Also simplified the config: there is no need to pass a destination/insert schema if it is the same as the source, and no need to pass id-types, as they are auto-computed.

1 parent 56ba843 · commit d3b19ae
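
The flex-schema change means the destination/insert column list may use different names than the source select list. As a hypothetical illustration (the renamed column names are invented, not from this commit), written in the same format as the sparkConf.properties file changed below, a destination table that renames order-date and amount would be configured as:

# hypothetical flex-schema config: destination renames two source columns
spark.query.source              partition-key,clustering-key,order-date,amount
spark.query.source.partitionKey partition-key
spark.query.destination         partition-key,clustering-key,purchase-date,total
spark.query.destination.id      partition-key,clustering-key
spark.query.types               9,1,4,3

When the destination schema matches the source, spark.query.destination can be omitted entirely, and the id-column types are no longer passed: they are taken as the leading entries of spark.query.types, one per column named in spark.query.destination.id.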

File tree

4 files changed: +75 −90 lines


pom.xml

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
 
     <groupId>datastax.astra.migrate</groupId>
    <artifactId>cassandra-data-migrator</artifactId>
-    <version>1.3</version>
+    <version>1.4</version>
    <packaging>jar</packaging>
 
    <properties>

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 61 additions & 38 deletions

@@ -16,13 +16,6 @@
 
 public abstract class AbstractJobSession {
 
-    public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
-
-    protected PreparedStatement sourceSelectStatement;
-    protected String sourceSelectCondition;
-
-    protected PreparedStatement astraSelectStatement;
-
     // Read/Write Rate limiter
     // Determine the total throughput for the entire cluster in terms of wries/sec,
     // reads/sec
@@ -31,12 +24,18 @@ public abstract class AbstractJobSession {
     // Rate = Total Throughput (write/read per sec) / Total Executors
     protected final RateLimiter readLimiter;
     protected final RateLimiter writeLimiter;
+    public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
+    protected PreparedStatement sourceSelectStatement;
+    protected String sourceSelectCondition;
+    protected PreparedStatement astraSelectStatement;
+    protected PreparedStatement astraInsertStatement;
     protected Integer maxRetries = 10;
 
     protected CqlSession sourceSession;
     protected CqlSession astraSession;
     protected List<MigrateDataType> selectColTypes = new ArrayList<MigrateDataType>();
     protected List<MigrateDataType> idColTypes = new ArrayList<MigrateDataType>();
+    protected List<Integer> updateSelectMapping = new ArrayList<Integer>();
 
     protected Integer batchSize = 1;
     protected Integer printStatsAfter = 100000;
@@ -106,45 +105,69 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
             maxWriteTimeStampFilter = Long.parseLong(maxWriteTimeStampFilterStr);
         }
 
-        logger.info(" DEFAULT -- Write Batch Size: " + batchSize);
-        logger.info(" DEFAULT -- Source Keyspace Table: " + sourceKeyspaceTable);
-        logger.info(" DEFAULT -- Destination Keyspace Table: " + astraKeyspaceTable);
-        logger.info(" DEFAULT -- ReadRateLimit: " + readLimiter.getRate());
-        logger.info(" DEFAULT -- WriteRateLimit: " + writeLimiter.getRate());
-        logger.info(" DEFAULT -- WriteTimestampFilter: " + writeTimeStampFilter);
-        logger.info(" DEFAULT -- WriteTimestampFilterCols: " + writeTimeStampCols);
-        logger.info(" DEFAULT -- isPreserveTTLWritetime: " + isPreserveTTLWritetime);
-        logger.info(" DEFAULT -- TTLCols: " + ttlCols);
+        logger.info("PARAM -- Write Batch Size: " + batchSize);
+        logger.info("PARAM -- Source Keyspace Table: " + sourceKeyspaceTable);
+        logger.info("PARAM -- Destination Keyspace Table: " + astraKeyspaceTable);
+        logger.info("PARAM -- ReadRateLimit: " + readLimiter.getRate());
+        logger.info("PARAM -- WriteRateLimit: " + writeLimiter.getRate());
+        logger.info("PARAM -- WriteTimestampFilter: " + writeTimeStampFilter);
+        logger.info("PARAM -- WriteTimestampFilterCols: " + writeTimeStampCols);
+        logger.info("PARAM -- isPreserveTTLWritetime: " + isPreserveTTLWritetime);
+        logger.info("PARAM -- isPreserveTTLWritetime: " + isPreserveTTLWritetime);
+        logger.info("PARAM -- TTLCols: " + ttlCols);
+
+        String selectCols = sparkConf.get("spark.query.source");
+        String partionKey = sparkConf.get("spark.query.source.partitionKey");
+        selectColTypes = getTypes(sparkConf.get("spark.query.types"));
+        String idCols = sparkConf.get("spark.query.destination.id", "");
+        idColTypes = selectColTypes.subList(0, idCols.split(",").length);
+        sourceSelectCondition = sparkConf.get("spark.query.condition", "");
+        sourceSelectStatement = sourceSession.prepare(
+                "select " + selectCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
+                        + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING");
 
-        hasRandomPartitioner = Boolean.parseBoolean(sparkConf.get("spark.source.hasRandomPartitioner", "false"));
+        String insertCols = sparkConf.get("spark.query.destination", "");
+        if (null == insertCols || insertCols.trim().isEmpty()) {
+            insertCols = selectCols;
+        }
+        String insertBinds = "";
+        for (String str : idCols.split(",")) {
+            if (insertBinds.isEmpty()) {
+                insertBinds = str + "= ?";
+            } else {
+                insertBinds += " and " + str + "= ?";
+            }
+        }
+        astraSelectStatement = astraSession.prepare(
+                "select " + insertCols + " from " + astraKeyspaceTable
+                        + " where " + insertBinds);
 
+        hasRandomPartitioner = Boolean.parseBoolean(sparkConf.get("spark.source.hasRandomPartitioner", "false"));
         isCounterTable = Boolean.parseBoolean(sparkConf.get("spark.counterTable", "false"));
-        selectColTypes = getTypes(sparkConf.get("spark.diff.select.types"));
-        String partionKey = sparkConf.get("spark.query.cols.partitionKey");
-        String idCols = sparkConf.get("spark.query.cols.id");
-        idColTypes = getTypes(sparkConf.get("spark.query.cols.id.types"));
+        if (isCounterTable) {
+            String updateSelectMappingStr = sparkConf.get("spark.counterTable.cql.index", "0");
+            for (String updateSelectIndex : updateSelectMappingStr.split(",")) {
+                updateSelectMapping.add(Integer.parseInt(updateSelectIndex));
+            }
 
-        String selectCols = sparkConf.get("spark.query.cols.select");
+            String counterTableUpdate = sparkConf.get("spark.counterTable.cql");
+            astraInsertStatement = astraSession.prepare(counterTableUpdate);
+        } else {
+            insertBinds = "";
+            for (String str : insertCols.split(",")) {
+                if (insertBinds.isEmpty()) {
+                    insertBinds += "?";
+                } else {
+                    insertBinds += ", ?";
+                }
+            }
 
-        String idBinds = "";
-        int count = 1;
-        for (String str : idCols.split(",")) {
-            if (count > 1) {
-                idBinds = idBinds + " and " + str + "= ?";
+            if (isPreserveTTLWritetime) {
+                astraInsertStatement = astraSession.prepare("insert into " + astraKeyspaceTable + " (" + insertCols + ") VALUES (" + insertBinds + ") using TTL ? and TIMESTAMP ?");
             } else {
-                idBinds = str + "= ?";
+                astraInsertStatement = astraSession.prepare("insert into " + astraKeyspaceTable + " (" + insertCols + ") VALUES (" + insertBinds + ")");
             }
-            count++;
         }
-
-        sourceSelectCondition = sparkConf.get("spark.query.cols.select.condition", "");
-        sourceSelectStatement = sourceSession.prepare(
-                "select " + selectCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
-                        + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING");
-
-        astraSelectStatement = astraSession.prepare(
-                "select " + selectCols + " from " + astraKeyspaceTable
-                        + " where " + idBinds);
     }
 
     public List<MigrateDataType> getTypes(String types) {
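
The constructor now builds both bind strings inline: key-equality binds for the destination select and positional placeholders for the insert. A minimal standalone sketch of that string assembly (class and method names are mine for illustration, not part of the commit):

import java.util.Arrays;
import java.util.stream.Collectors;

public class BindStringSketch {

    // WHERE binds for the destination select, e.g. "pk= ? and ck= ?"
    static String whereBinds(String idCols) {
        return Arrays.stream(idCols.split(","))
                .map(col -> col + "= ?")
                .collect(Collectors.joining(" and "));
    }

    // VALUES placeholders for the insert, e.g. "?, ?, ?, ?"
    static String valueBinds(String insertCols) {
        return Arrays.stream(insertCols.split(","))
                .map(col -> "?")
                .collect(Collectors.joining(", "));
    }

    public static void main(String[] args) {
        // prints: partition-key= ? and clustering-key= ?
        System.out.println(whereBinds("partition-key,clustering-key"));
        // prints: ?, ?, ?, ?
        System.out.println(valueBinds("partition-key,clustering-key,order-date,amount"));
    }
}

The stream form is equivalent to the diff's accumulate-in-a-loop style: one "col= ?" term per id column joined with " and ", and one "?" per insert column joined with ", ".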

src/main/java/datastax/astra/migrate/CopyJobSession.java

Lines changed: 8 additions & 43 deletions

@@ -9,22 +9,20 @@
 import java.math.BigInteger;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.List;
 import java.util.Map;
 import java.util.concurrent.CompletionStage;
 import java.util.concurrent.atomic.AtomicLong;
 
 public class CopyJobSession extends AbstractJobSession {
 
-    public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
     private static CopyJobSession copyJobSession;
-
-    protected PreparedStatement astraInsertStatement;
+    public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
     protected AtomicLong readCounter = new AtomicLong(0);
    protected AtomicLong writeCounter = new AtomicLong(0);
 
-    protected List<MigrateDataType> insertColTypes = new ArrayList<MigrateDataType>();
-    protected List<Integer> updateSelectMapping = new ArrayList<Integer>();
+    protected CopyJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
+        super(sourceSession, astraSession, sparkConf);
+    }
 
     public static CopyJobSession getInstance(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
         if (copyJobSession == null) {
@@ -38,39 +36,6 @@ public static CopyJobSession getInstance(CqlSession sourceSession, CqlSession as
         return copyJobSession;
     }
 
-    protected CopyJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
-        super(sourceSession, astraSession, sparkConf);
-
-        String insertCols = sparkConf.get("spark.query.cols.insert");
-        insertColTypes = getTypes(sparkConf.get("spark.query.cols.insert.types"));
-        String insertBinds = "";
-        int count = 1;
-        for (String str : insertCols.split(",")) {
-            if (count > 1) {
-                insertBinds = insertBinds + ",?";
-            } else {
-                insertBinds = insertBinds + "?";
-            }
-            count++;
-        }
-
-        if (isCounterTable) {
-            String updateSelectMappingStr = sparkConf.get("spark.counterTable.cql.index", "0");
-            for (String updateSelectIndex : updateSelectMappingStr.split(",")) {
-                updateSelectMapping.add(Integer.parseInt(updateSelectIndex));
-            }
-
-            String counterTableUpdate = sparkConf.get("spark.counterTable.cql");
-            astraInsertStatement = astraSession.prepare(counterTableUpdate);
-        } else {
-            if (isPreserveTTLWritetime) {
-                astraInsertStatement = astraSession.prepare("insert into " + astraKeyspaceTable + " (" + insertCols + ") VALUES (" + insertBinds + ") using TTL ? and TIMESTAMP ?");
-            } else {
-                astraInsertStatement = astraSession.prepare("insert into " + astraKeyspaceTable + " (" + insertCols + ") VALUES (" + insertBinds + ")");
-            }
-        }
-    }
-
     public void getDataAndInsert(BigInteger min, BigInteger max) {
         logger.info("TreadID: " + Thread.currentThread().getId() + " Processing min: " + min + " max:" + max);
         int maxAttempts = maxRetries;
@@ -179,8 +144,8 @@ public BoundStatement bindInsert(PreparedStatement insertStatement, Row sourceRo
         BoundStatement boundInsertStatement = insertStatement.bind();
 
         if (isCounterTable) {
-            for (int index = 0; index < insertColTypes.size(); index++) {
-                MigrateDataType dataType = insertColTypes.get(index);
+            for (int index = 0; index < selectColTypes.size(); index++) {
+                MigrateDataType dataType = selectColTypes.get(updateSelectMapping.get(index));
                 // compute the counter delta if reading from astra for the difference
                 if (astraRow != null && index < (selectColTypes.size() - idColTypes.size())) {
                     boundInsertStatement = boundInsertStatement.set(index, (sourceRow.getLong(updateSelectMapping.get(index)) - astraRow.getLong(updateSelectMapping.get(index))), Long.class);
@@ -190,8 +155,8 @@ public BoundStatement bindInsert(PreparedStatement insertStatement, Row sourceRo
             }
         } else {
             int index = 0;
-            for (index = 0; index < insertColTypes.size(); index++) {
-                MigrateDataType dataTypeObj = insertColTypes.get(index);
+            for (index = 0; index < selectColTypes.size(); index++) {
+                MigrateDataType dataTypeObj = selectColTypes.get(index);
                 Class dataType = dataTypeObj.typeClass;
 
                 try {
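
For counter tables, bindInsert now indexes selectColTypes through updateSelectMapping and, when a destination row already exists, binds the difference between source and destination counts, because Cassandra counter columns are incremented rather than overwritten. A toy sketch of the delta arithmetic with hardcoded values (the real code reads both longs from driver Row objects at the mapped positions):

public class CounterDeltaSketch {
    public static void main(String[] args) {
        long sourceCount = 120L; // counter value read from the source row
        long astraCount = 100L;  // counter value already in the destination row
        // binding this delta to an UPDATE ... SET c = c + ? statement
        // makes the destination converge on the source value
        long delta = sourceCount - astraCount;
        System.out.println("bind delta = " + delta); // prints: bind delta = 20
    }
}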

src/resources/sparkConf.properties

Lines changed: 5 additions & 8 deletions

@@ -22,14 +22,11 @@ spark.batchSize 5
 spark.coveragePercent 100
 spark.printStatsAfter 100000
 
-spark.query.cols.select partition-key,clustering-key,order-date,amount,writetime(order-date),writetime(amount),ttl(order-date),ttl(amount)
-spark.diff.select.types 9,1,4,3
-spark.query.cols.id partition-key,clustering-key
-spark.query.cols.id.types 9,1
-spark.query.cols.partitionKey partition-key
-
-spark.query.cols.insert partition-key,clustering-key,order-date,amount
-spark.query.cols.insert.types 9,1,4,3
+spark.query.source partition-key,clustering-key,order-date,amount,writetime(order-date),writetime(amount),ttl(order-date),ttl(amount)
+spark.query.source.partitionKey partition-key
+spark.query.destination partition-key,clustering-key,order-date,amount
+spark.query.destination.id partition-key,clustering-key
+spark.query.types 9,1,4,3
 
 spark.counterTable false
 spark.counterTable.cql