
Commit befca05

Merge pull request #12 from datastax/feature/ttl-writetime-fix

Feature/ttl writetime fix

2 parents 307c670 + 9667f17, commit befca05

4 files changed: 34 additions & 20 deletions

README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -2,7 +2,7 @@
 
 Spark jobs in this repo can be used for data migration and data validation.
 
-> :warning: Please note this job has been tested with spark version [2.4.8](https://downloads.apache.org/spark/spark-2.4.8/)
+> :warning: Please note this job has been tested with spark version [2.4.8](https://archive.apache.org/dist/spark/spark-2.4.8/)
 
 ## Prerequisite
 
```

pom.xml

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@
 
     <groupId>datastax.astra.migrate</groupId>
     <artifactId>cassandra-data-migrator</artifactId>
-    <version>1.5</version>
+    <version>1.6</version>
     <packaging>jar</packaging>
 
     <properties>
```

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 29 additions & 15 deletions

```diff
@@ -17,7 +17,7 @@
 public class AbstractJobSession extends BaseJobSession {
 
     public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
-
+
     protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
         this.sourceSession = sourceSession;
         this.astraSession = astraSession;
@@ -37,25 +37,26 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
 
         isPreserveTTLWritetime = Boolean.parseBoolean(sparkConf.get("spark.preserveTTLWriteTime", "false"));
         if (isPreserveTTLWritetime) {
-            String ttlColsStr = sparkConf.get("spark.source.ttl.cols");
+            String ttlColsStr = sparkConf.get("spark.preserveTTLWriteTime.ttl.cols");
             if (null != ttlColsStr && ttlColsStr.trim().length() > 0) {
                 for (String ttlCol : ttlColsStr.split(",")) {
                     ttlCols.add(Integer.parseInt(ttlCol));
                 }
             }
+
+            String writeTimestampColsStr = sparkConf.get("spark.preserveTTLWriteTime.writetime.cols");
+            if (null != writeTimestampColsStr && writeTimestampColsStr.trim().length() > 0) {
+                for (String writeTimeStampCol : writeTimestampColsStr.split(",")) {
+                    writeTimeStampCols.add(Integer.parseInt(writeTimeStampCol));
+                }
+            }
         }
 
         writeTimeStampFilter = Boolean
                 .parseBoolean(sparkConf.get("spark.source.writeTimeStampFilter", "false"));
         // batchsize set to 1 if there is a writeFilter
         if (writeTimeStampFilter) {
             batchSize = 1;
-            String writeTimestampColsStr = sparkConf.get("spark.source.writeTimeStampFilter.cols");
-            if (null != writeTimestampColsStr && writeTimestampColsStr.trim().length() > 0) {
-                for (String writeTimeStampCol : writeTimestampColsStr.split(",")) {
-                    writeTimeStampCols.add(Integer.parseInt(writeTimeStampCol));
-                }
-            }
         }
 
         String minWriteTimeStampFilterStr =
@@ -82,14 +83,27 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
 
         String selectCols = sparkConf.get("spark.query.source");
         String partionKey = sparkConf.get("spark.query.source.partitionKey");
+        String sourceSelectCondition = sparkConf.get("spark.query.condition", "");
+
+        final StringBuilder selectTTLWriteTimeCols = new StringBuilder();
+        if (isPreserveTTLWritetime) {
+            String[] allCols = selectCols.split(",");
+            ttlCols.forEach(col -> {
+                selectTTLWriteTimeCols.append(",ttl(" + allCols[col] + ")");
+            });
+            writeTimeStampCols.forEach(col -> {
+                selectTTLWriteTimeCols.append(",writetime(" + allCols[col] + ")");
+            });
+        }
+        String fullSelectQuery = "select " + selectCols + selectTTLWriteTimeCols.toString() + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
+                + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING";
+        sourceSelectStatement = sourceSession.prepare(fullSelectQuery);
+        logger.info("PARAM -- Query used: " + fullSelectQuery);
+
         selectColTypes = getTypes(sparkConf.get("spark.query.types"));
         String idCols = sparkConf.get("spark.query.destination.id", "");
         idColTypes = selectColTypes.subList(0, idCols.split(",").length);
-        String sourceSelectCondition = sparkConf.get("spark.query.condition", "");
-        sourceSelectStatement = sourceSession.prepare(
-                "select " + selectCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
-                        + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING");
-
+
         String insertCols = sparkConf.get("spark.query.destination", "");
         if (null == insertCols || insertCols.trim().isEmpty()) {
             insertCols = selectCols;
@@ -146,15 +160,15 @@ public List<MigrateDataType> getTypes(String types) {
     public int getLargestTTL(Row sourceRow) {
         int ttl = 0;
         for (Integer ttlCol : ttlCols) {
-            ttl = Math.max(ttl, sourceRow.getInt(ttlCol));
+            ttl = Math.max(ttl, sourceRow.getInt(selectColTypes.size() + ttlCol - 1));
         }
         return ttl;
     }
 
     public long getLargestWriteTimeStamp(Row sourceRow) {
         long writeTimestamp = 0;
         for (Integer writeTimeStampCol : writeTimeStampCols) {
-            writeTimestamp = Math.max(writeTimestamp, sourceRow.getLong(writeTimeStampCol));
+            writeTimestamp = Math.max(writeTimestamp, sourceRow.getLong(selectColTypes.size() + ttlCols.size() + writeTimeStampCol - 1));
         }
         return writeTimestamp;
     }
```
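With the query now assembled inside `AbstractJobSession`, the `ttl(...)` and `writetime(...)` terms are appended after the `spark.query.source` columns (all TTL terms first, then all writetime terms), which is what the new offsets in `getLargestTTL` and `getLargestWriteTimeStamp` index into. A standalone sketch of that layout, using the example values from `sparkConf.properties` (the class name is hypothetical, and `n` stands in for `selectColTypes.size()` on the assumption that there is one type entry per selected column):

```java
import java.util.Arrays;
import java.util.List;

public class TtlWritetimeLayout {
    public static void main(String[] args) {
        // Mirrors spark.query.source and the new config keys in sparkConf.properties
        List<String> selectCols = Arrays.asList("partition-key", "clustering-key", "order-date", "amount");
        List<Integer> ttlCols = Arrays.asList(2, 3);        // spark.preserveTTLWriteTime.ttl.cols
        List<Integer> writeTimeCols = Arrays.asList(2, 3);  // spark.preserveTTLWriteTime.writetime.cols

        // The job appends ttl(...) first, then writetime(...), to the select list
        StringBuilder extra = new StringBuilder();
        ttlCols.forEach(c -> extra.append(",ttl(").append(selectCols.get(c)).append(")"));
        writeTimeCols.forEach(c -> extra.append(",writetime(").append(selectCols.get(c)).append(")"));
        System.out.println("select " + String.join(",", selectCols) + extra + " from ...");

        // Row offsets exactly as computed in getLargestTTL / getLargestWriteTimeStamp;
        // n is an assumed stand-in for selectColTypes.size()
        int n = selectCols.size();
        for (int ttlCol : ttlCols) {
            System.out.println("ttl(" + selectCols.get(ttlCol) + ") read at row index " + (n + ttlCol - 1));
        }
        for (int wtCol : writeTimeCols) {
            System.out.println("writetime(" + selectCols.get(wtCol) + ") read at row index " + (n + ttlCols.size() + wtCol - 1));
        }
    }
}
```

Running it prints the generated select list and the row index each configured column is read from, matching the offsets the commit uses.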

src/resources/sparkConf.properties

Lines changed: 3 additions & 3 deletions

```diff
@@ -22,7 +22,7 @@ spark.batchSize 5
 spark.coveragePercent 100
 spark.printStatsAfter 100000
 
-spark.query.source partition-key,clustering-key,order-date,amount,writetime(order-date),writetime(amount),ttl(order-date),ttl(amount)
+spark.query.source partition-key,clustering-key,order-date,amount
 spark.query.source.partitionKey partition-key
 spark.query.destination partition-key,clustering-key,order-date,amount
 spark.query.destination.id partition-key,clustering-key
@@ -33,10 +33,10 @@ spark.counterTable.cql
 spark.counterTable.cql.index 0
 
 spark.preserveTTLWriteTime true
-spark.source.ttl.cols 6,7
+spark.preserveTTLWriteTime.ttl.cols 2,3
+spark.preserveTTLWriteTime.writetime.cols 2,3
 
 spark.source.writeTimeStampFilter false
-spark.source.writeTimeStampFilter.cols 4,5
 spark.source.minWriteTimeStampFilter 0
 spark.source.maxWriteTimeStampFilter 9223372036854775807
 
```
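Since `AbstractJobSession` uses these values directly as offsets into the split `spark.query.source` list (`allCols[col]`), `2,3` here refer to `order-date` and `amount`, counting from zero, rather than to positions of hand-written `ttl(...)`/`writetime(...)` columns as in the old `spark.source.ttl.cols 6,7`. A hypothetical variant that preserves TTL and writetime only for `amount` would look like:

```properties
# Hypothetical example: preserve TTL/writetime for 'amount' only,
# which sits at zero-based offset 3 of spark.query.source above
spark.preserveTTLWriteTime true
spark.preserveTTLWriteTime.ttl.cols 3
spark.preserveTTLWriteTime.writetime.cols 3
```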
