
Commit b02113e

Merge pull request #18 from Ankitp1342/feature/batch-size-one-fix
Fixed bug with batch-size one & some logging improvements
2 parents 92aca37 + a483829 · commit b02113e

File tree

5 files changed: +39 −48 lines changed

pom.xml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 
     <groupId>com.datastax.spark.example</groupId>
     <artifactId>migrate</artifactId>
-    <version>0.11</version>
+    <version>0.12</version>
     <packaging>jar</packaging>
 
     <properties>

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 27 additions & 25 deletions
@@ -39,7 +39,8 @@ public abstract class AbstractJobSession {
     protected Integer batchSize = 1;
     protected Integer printStatsAfter = 100000;
 
-    protected Boolean writeTimeStampFilter = false;
+    protected Boolean isPreserveTTLWritetime = Boolean.FALSE;
+    protected Boolean writeTimeStampFilter = Boolean.FALSE;
     protected Long minWriteTimeStampFilter = 0l;
     protected Long maxWriteTimeStampFilter = Long.MAX_VALUE;
 
@@ -51,11 +52,9 @@ public abstract class AbstractJobSession {
     protected String sourceKeyspaceTable;
     protected String astraKeyspaceTable;
 
-
     protected Boolean hasRandomPartitioner;
 
     protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
-
         this.sourceSession = sourceSession;
         this.astraSession = astraSession;
 
@@ -72,22 +71,43 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
         sourceKeyspaceTable = sparkConf.get("spark.migrate.source.keyspaceTable");
         astraKeyspaceTable = sparkConf.get("spark.migrate.astra.keyspaceTable");
 
+        isPreserveTTLWritetime = Boolean.parseBoolean(sparkConf.get("spark.migrate.preserveTTLWriteTime", "false"));
+        if (isPreserveTTLWritetime) {
+            String ttlColsStr = sparkConf.get("spark.migrate.source.ttl.cols");
+            if (null != ttlColsStr && ttlColsStr.trim().length() > 0) {
+                for (String ttlCol : ttlColsStr.split(",")) {
+                    ttlCols.add(Integer.parseInt(ttlCol));
+                }
+            }
+        }
+
         writeTimeStampFilter = Boolean
                 .parseBoolean(sparkConf.get("spark.migrate.source.writeTimeStampFilter", "false"));
-        minWriteTimeStampFilter = new Long(
-                sparkConf.get("spark.migrate.source.minWriteTimeStampFilter", "0"));
-        maxWriteTimeStampFilter = new Long(
-                sparkConf.get("spark.migrate.source.maxWriteTimeStampFilter", "" + Long.MAX_VALUE));
         // batchsize set to 1 if there is a writeFilter
         if (writeTimeStampFilter) {
             batchSize = 1;
+            String writeTimestampColsStr = sparkConf.get("spark.migrate.source.writeTimeStampFilter.cols");
+            if (null != writeTimestampColsStr && writeTimestampColsStr.trim().length() > 0) {
+                for (String writeTimeStampCol : writeTimestampColsStr.split(",")) {
+                    writeTimeStampCols.add(Integer.parseInt(writeTimeStampCol));
+                }
+            }
         }
+
+        minWriteTimeStampFilter = new Long(
+                sparkConf.get("spark.migrate.source.minWriteTimeStampFilter", "0"));
+        maxWriteTimeStampFilter = new Long(
+                sparkConf.get("spark.migrate.source.maxWriteTimeStampFilter", "" + Long.MAX_VALUE));
+
         logger.info(" DEFAULT -- Write Batch Size: " + batchSize);
         logger.info(" DEFAULT -- Source Keyspace Table: " + sourceKeyspaceTable);
         logger.info(" DEFAULT -- Astra Keyspace Table: " + astraKeyspaceTable);
         logger.info(" DEFAULT -- ReadRateLimit: " + readLimiter.getRate());
         logger.info(" DEFAULT -- WriteRateLimit: " + writeLimiter.getRate());
         logger.info(" DEFAULT -- WriteTimestampFilter: " + writeTimeStampFilter);
+        logger.info(" DEFAULT -- WriteTimestampFilterCols: " + writeTimeStampCols);
+        logger.info(" DEFAULT -- isPreserveTTLWritetime: " + isPreserveTTLWritetime);
+        logger.info(" DEFAULT -- TTLCols: " + ttlCols);
 
         hasRandomPartitioner = Boolean.parseBoolean(sparkConf.get("spark.migrate.source.hasRandomPartitioner", "false"));
 
@@ -96,20 +116,6 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
         counterDeltaMaxIndex = Integer
                 .parseInt(sparkConf.get("spark.migrate.source.counterTable.update.max.counter.index", "0"));
 
-        String writeTimestampColsStr = sparkConf.get("spark.migrate.source.writeTimeStampFilter.cols");
-        if (null != writeTimestampColsStr && writeTimestampColsStr.trim().length() > 0) {
-            for (String writeTimeStampCol : writeTimestampColsStr.split(",")) {
-                writeTimeStampCols.add(Integer.parseInt(writeTimeStampCol));
-            }
-        }
-
-        String ttlColsStr = sparkConf.get("spark.migrate.source.ttl.cols");
-        if (null != ttlColsStr && ttlColsStr.trim().length() > 0) {
-            for (String ttlCol : ttlColsStr.split(",")) {
-                ttlCols.add(Integer.parseInt(ttlCol));
-            }
-        }
-
         String partionKey = sparkConf.get("spark.migrate.query.cols.partitionKey");
         String idCols = sparkConf.get("spark.migrate.query.cols.id");
         idColTypes = getTypes(sparkConf.get("spark.migrate.query.cols.id.types"));
@@ -128,15 +134,13 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
         }
 
         sourceSelectCondition = sparkConf.get("spark.migrate.query.cols.select.condition", "");
-
         sourceSelectStatement = sourceSession.prepare(
                 "select " + selectCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
                         + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING");
 
         astraSelectStatement = astraSession.prepare(
                 "select " + selectCols + " from " + astraKeyspaceTable
                         + " where " + idBinds);
-
     }
 
     public List<MigrateDataType> getTypes(String types) {
@@ -146,7 +150,6 @@ public List<MigrateDataType> getTypes(String types) {
         }
 
         return dataTypes;
-
     }
 
     public int getLargestTTL(Row sourceRow) {
@@ -177,7 +180,6 @@ public BoundStatement selectFromAstra(PreparedStatement selectStatement, Row sou
     }
 
     public Object getData(MigrateDataType dataType, int index, Row sourceRow) {
-
         if (dataType.typeClass == Map.class) {
             return sourceRow.getMap(index, dataType.subTypes.get(0), dataType.subTypes.get(1));
         } else if (dataType.typeClass == List.class) {
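
A note on configuration (not part of the commit): the constructor above now reads the TTL and writetime column lists only when their corresponding feature flags are enabled, and parses the min/max writetime bounds unconditionally. The minimal sketch below shows how these properties might be supplied when building the job's SparkConf; the keyspace/table names and column indexes are hypothetical placeholders, and only the property keys are taken from the diff above.

import org.apache.spark.SparkConf;

public class MigrationConfSketch {
    public static void main(String[] args) {
        // Property keys come from AbstractJobSession; all values here are hypothetical examples.
        SparkConf sparkConf = new SparkConf()
                .set("spark.migrate.source.keyspaceTable", "source_ks.tbl")            // placeholder
                .set("spark.migrate.astra.keyspaceTable", "target_ks.tbl")             // placeholder
                .set("spark.migrate.preserveTTLWriteTime", "true")
                .set("spark.migrate.source.ttl.cols", "2,3")                           // placeholder column indexes
                .set("spark.migrate.source.writeTimeStampFilter", "true")
                .set("spark.migrate.source.writeTimeStampFilter.cols", "2,3")          // placeholder column indexes
                .set("spark.migrate.source.minWriteTimeStampFilter", "0")
                .set("spark.migrate.source.maxWriteTimeStampFilter", String.valueOf(Long.MAX_VALUE));
        // When spark.migrate.source.writeTimeStampFilter is "true", AbstractJobSession pins batchSize to 1.
        System.out.println(sparkConf.toDebugString());
    }
}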

src/main/java/datastax/astra/migrate/CopyJobSession.java

Lines changed: 0 additions & 20 deletions
@@ -24,7 +24,6 @@ public class CopyJobSession extends AbstractJobSession {
 
     protected List<MigrateDataType> insertColTypes = new ArrayList<MigrateDataType>();
     protected List<Integer> updateSelectMapping = new ArrayList<Integer>();
-    protected Boolean isPreserveTTLWritetime = Boolean.FALSE;
 
     public static CopyJobSession getInstance(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
         if (copyJobSession == null) {
@@ -53,26 +52,22 @@ protected CopyJobSession(CqlSession sourceSession, CqlSession astraSession, Spar
             }
             count++;
         }
-        isPreserveTTLWritetime = Boolean.parseBoolean(sparkConf.get("spark.migrate.preserveTTLWriteTime", "false"));
 
         if (isCounterTable) {
             String updateSelectMappingStr = sparkConf.get("spark.migrate.source.counterTable.update.select.index", "0");
             for (String updateSelectIndex : updateSelectMappingStr.split(",")) {
                 updateSelectMapping.add(Integer.parseInt(updateSelectIndex));
             }
 
-
             String counterTableUpdate = sparkConf.get("spark.migrate.source.counterTable.update.cql");
             astraInsertStatement = astraSession.prepare(counterTableUpdate);
         } else {
-
             if (isPreserveTTLWritetime) {
                 astraInsertStatement = astraSession.prepare("insert into " + astraKeyspaceTable + " (" + insertCols + ") VALUES (" + insertBinds + ") using TTL ? and TIMESTAMP ?");
             } else {
                 astraInsertStatement = astraSession.prepare("insert into " + astraKeyspaceTable + " (" + insertCols + ") VALUES (" + insertBinds + ")");
             }
         }
-
     }
 
     public void getDataAndInsert(BigInteger min, BigInteger max) {
@@ -81,9 +76,7 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
         for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) {
 
             try {
-
                 ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner? min : min.longValueExact(), hasRandomPartitioner? max : max.longValueExact()));
-
                 Collection<CompletionStage<AsyncResultSet>> writeResults = new ArrayList<CompletionStage<AsyncResultSet>>();
 
                 // cannot do batching if the writeFilter is greater than 0 or
@@ -99,7 +92,6 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
                                 || sourceWriteTimeStamp > maxWriteTimeStampFilter) {
                             continue;
                         }
-
                     }
 
                     writeLimiter.acquire(1);
@@ -118,41 +110,33 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
                         CompletionStage<AsyncResultSet> astraWriteResultSet = astraSession
                                 .executeAsync(bindInsert(astraInsertStatement, sourceRow, astraRow));
                         writeResults.add(astraWriteResultSet);
-
                     } else {
                         CompletionStage<AsyncResultSet> astraWriteResultSet = astraSession
                                 .executeAsync(bindInsert(astraInsertStatement, sourceRow));
                         writeResults.add(astraWriteResultSet);
                     }
-
                     if (writeResults.size() > 1000) {
                         iterateAndClearWriteResults(writeResults, 1);
                     }
                 }
 
                 // clear the write resultset in-case it didnt mod at 1000 above
                 iterateAndClearWriteResults(writeResults, 1);
-
             } else {
-                //
                 BatchStatement batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED);
                 for (Row row : resultSet) {
                     readLimiter.acquire(1);
                     writeLimiter.acquire(1);
                     if (readCounter.incrementAndGet() % 1000 == 0) {
                         logger.info("TreadID: " + Thread.currentThread().getId() + " Read Record Count: " + readCounter.get());
                     }
-
                     batchStatement = batchStatement.add(bindInsert(astraInsertStatement, row));
 
-
                     // if batch threshold is met, send the writes and clear the batch
                     if (batchStatement.size() >= batchSize) {
-
                         CompletionStage<AsyncResultSet> writeResultSet = astraSession.executeAsync(batchStatement);
                         writeResults.add(writeResultSet);
                         batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED);
-
                     }
 
                     if (writeResults.size() * batchSize > 1000) {
@@ -163,7 +147,6 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
                 // clear the write resultset in-case it didnt mod at 1000 above
                 iterateAndClearWriteResults(writeResults, batchSize);
 
-
                 // if there are any pending writes because the batchSize threshold was not met, then write and clear them
                 if (batchStatement.size() > 0) {
                     CompletionStage<AsyncResultSet> writeResultSet = astraSession.executeAsync(batchStatement);
@@ -187,7 +170,6 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
 
     }
 
-
     private void iterateAndClearWriteResults(Collection<CompletionStage<AsyncResultSet>> writeResults, int incrementBy) throws Exception{
         for (CompletionStage<AsyncResultSet> writeResult : writeResults) {
             //wait for the writes to complete for the batch. The Retry policy, if defined, should retry the write on timeouts.
@@ -199,8 +181,6 @@ private void iterateAndClearWriteResults(Collection<CompletionStage<AsyncResultS
         writeResults.clear();
     }
 
-
-
     public BoundStatement bindInsert(PreparedStatement insertStatement, Row sourceRow) {
         return bindInsert(insertStatement, sourceRow, null);
     }
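
The getDataAndInsert flow above can be summarized as follows: when batching is effectively disabled (batch size 1, for example because a writetime filter or a counter table is in play), every row gets its own async write; otherwise rows accumulate into unlogged batches of batchSize and are flushed when the threshold is met, with any remainder flushed at the end. The sketch below is a simplified, standalone paraphrase of that decision, not the class itself; SourceRow, writeOne, and writeBatch are hypothetical stand-ins for the driver Row and the executeAsync calls in the diff.

import java.util.ArrayList;
import java.util.List;

public class BatchingSketch {

    // Hypothetical stand-in for the driver's Row, carrying only the writetime used by the filter.
    record SourceRow(long writeTime) {}

    // Hypothetical sink; in the real job these are astraSession.executeAsync(...) calls.
    interface Writer {
        void writeOne(SourceRow row);
        void writeBatch(List<SourceRow> rows);
    }

    static void copyRows(Iterable<SourceRow> rows, int batchSize, boolean writeTimeStampFilter,
                         long minWriteTime, long maxWriteTime, Writer writer) {
        List<SourceRow> batch = new ArrayList<>();
        for (SourceRow row : rows) {
            // Rows outside the writetime window are skipped, mirroring the filter branch in the diff.
            if (writeTimeStampFilter && (row.writeTime() < minWriteTime || row.writeTime() > maxWriteTime)) {
                continue;
            }
            if (batchSize <= 1) {
                // Batch-size-one path: one statement per row.
                writer.writeOne(row);
            } else {
                batch.add(row);
                if (batch.size() >= batchSize) {
                    // Flush a full unlogged batch and start a new one.
                    writer.writeBatch(new ArrayList<>(batch));
                    batch.clear();
                }
            }
        }
        // Flush any remainder smaller than batchSize so no pending writes are lost.
        if (!batch.isEmpty()) {
            writer.writeBatch(batch);
        }
    }
}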

src/main/scala/datastax/astra/migrate/DiffData.scala

Lines changed: 6 additions & 1 deletion
@@ -5,6 +5,7 @@ import com.datastax.oss.driver.api.core.metadata.schema.TableMetadata
 import com.datastax.spark.connector._
 import com.datastax.spark.connector.cql.CassandraConnector
 import datastax.astra.migrate.Migrate.{astraPassword, astraReadConsistencyLevel, astraScbPath, astraUsername, sc, sourceHost, sourcePassword, sourceReadConsistencyLevel, sourceUsername}
+import org.apache.log4j.Logger
 import org.apache.spark.sql.{SaveMode, SparkSession}
 import org.apache.spark.sql.hive._
 import org.apache.spark.sql.cassandra._
@@ -17,6 +18,8 @@ import java.math.BigInteger
 
 object DiffData extends App {
 
+  val logger = Logger.getLogger(this.getClass.getName)
+
   val spark = SparkSession.builder
     .appName("Datastax Data Validation")
     .getOrCreate()
@@ -46,7 +49,7 @@ object DiffData extends App {
   val splitSize = sc.getConf.get("spark.migrate.splitSize","10000")
 
 
-  println("Started Data Validation App")
+  logger.info("Started Data Validation App")
 
   val isBeta = sc.getConf.get("spark.migrate.beta","false")
   val isCassandraToCassandra = sc.getConf.get("spark.migrate.ctoc", "false")
@@ -90,6 +93,8 @@ object DiffData extends App {
   private def diffTable(sourceConnection: CassandraConnector, astraConnection: CassandraConnector, minPartition:BigInteger, maxPartition:BigInteger) = {
     val partitions = SplitPartitions.getRandomSubPartitions(BigInteger.valueOf(Long.parseLong(splitSize)), minPartition, maxPartition)
     val parts = sc.parallelize(partitions.toSeq,partitions.size);
+
+    logger.info("Spark parallelize created : " + parts.count() + " parts!");
     parts.foreach(part => {
       sourceConnection.withSessionDo(sourceSession =>
         astraConnection.withSessionDo(astraSession =>

src/main/scala/datastax/astra/migrate/Migrate.scala

Lines changed: 5 additions & 1 deletion
@@ -4,6 +4,7 @@ import com.datastax.oss.driver.api.core.{CqlIdentifier, CqlSession}
 import com.datastax.oss.driver.api.core.metadata.schema.TableMetadata
 import com.datastax.spark.connector._
 import com.datastax.spark.connector.cql.CassandraConnector
+import org.apache.log4j.Logger
 import org.apache.spark.sql.{SaveMode, SparkSession}
 import org.apache.spark.sql.hive._
 import org.apache.spark.sql.cassandra._
@@ -18,6 +19,8 @@ import collection.JavaConversions._
 // http://www.russellspitzer.com/2016/02/16/Multiple-Clusters-SparkSql-Cassandra/
 
 object Migrate extends App {
+  val logger = Logger.getLogger(this.getClass.getName)
+
   val spark = SparkSession.builder
     .appName("Datastax Data Migration")
     .getOrCreate()
@@ -46,7 +49,7 @@ object Migrate extends App {
   val astraReadConsistencyLevel = sc.getConf.get("spark.cassandra.astra.read.consistency.level","LOCAL_QUORUM")
 
 
-  println("Started Migration App")
+  logger.info("Started Migration App")
 
   val isBeta = sc.getConf.get("spark.migrate.beta","false")
 
@@ -87,6 +90,7 @@ object Migrate extends App {
 
     val partitions = SplitPartitions.getRandomSubPartitions(BigInteger.valueOf(Long.parseLong(splitSize)), minPartition, maxPartition)
     val parts = sc.parallelize(partitions.toSeq,partitions.size);
+    logger.info("Spark parallelize created : " + parts.count() + " parts!");
     parts.foreach(part => {
       sourceConnection.withSessionDo(sourceSession => astraConnection.withSessionDo(astraSession=> CopyJobSession.getInstance(sourceSession,astraSession, sc.getConf).getDataAndInsert(part.getMin, part.getMax)))
     })
