Skip to content

Commit 307c670

Browse files
authored
Merge pull request #11 from datastax/feature/logging-fix
Feature/logging fix
2 parents 1c9ed15 + ceea02d commit 307c670

File tree

12 files changed

+53
-78
lines changed

12 files changed

+53
-78
lines changed

.idea/libraries/Maven__log4j_apache_log4j_extras_1_2_17.xml

Lines changed: 0 additions & 13 deletions
This file was deleted.

.idea/libraries/Maven__log4j_log4j_1_2_17.xml

Lines changed: 0 additions & 13 deletions
This file was deleted.

cassandra-data-migrator.iml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,6 @@
8383
<orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.16" level="project" />
8484
<orderEntry type="library" name="Maven: org.slf4j:jul-to-slf4j:1.7.16" level="project" />
8585
<orderEntry type="library" name="Maven: org.slf4j:jcl-over-slf4j:1.7.16" level="project" />
86-
<orderEntry type="library" name="Maven: log4j:log4j:1.2.17" level="project" />
8786
<orderEntry type="library" name="Maven: org.slf4j:slf4j-log4j12:1.7.16" level="project" />
8887
<orderEntry type="library" name="Maven: com.ning:compress-lzf:1.0.3" level="project" />
8988
<orderEntry type="library" name="Maven: org.xerial.snappy:snappy-java:1.1.8.2" level="project" />
@@ -160,7 +159,6 @@
160159
<orderEntry type="library" name="Maven: org.spark-project.hive:hive-exec:1.2.1.spark2" level="project" />
161160
<orderEntry type="library" name="Maven: commons-io:commons-io:2.4" level="project" />
162161
<orderEntry type="library" name="Maven: javolution:javolution:5.5.1" level="project" />
163-
<orderEntry type="library" name="Maven: log4j:apache-log4j-extras:1.2.17" level="project" />
164162
<orderEntry type="library" name="Maven: org.antlr:ST4:4.0.4" level="project" />
165163
<orderEntry type="library" name="Maven: com.googlecode.javaewah:JavaEWAH:0.3.2" level="project" />
166164
<orderEntry type="library" name="Maven: org.iq80.snappy:snappy:0.2" level="project" />
@@ -202,6 +200,9 @@
202200
<orderEntry type="library" name="Maven: com.github.spotbugs:spotbugs-annotations:3.1.12" level="project" />
203201
<orderEntry type="library" name="Maven: com.datastax.oss:java-driver-mapper-runtime:4.10.0" level="project" />
204202
<orderEntry type="library" name="Maven: com.datastax.oss:java-driver-query-builder:4.10.0" level="project" />
203+
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.19.0" level="project" />
204+
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-core:2.19.0" level="project" />
205+
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-to-slf4j:2.19.0" level="project" />
205206
<orderEntry type="library" scope="TEST" name="Maven: org.scalatest:scalatest_2.11:3.2.12" level="project" />
206207
<orderEntry type="library" scope="TEST" name="Maven: org.scalatest:scalatest-core_2.11:3.2.12" level="project" />
207208
<orderEntry type="library" scope="TEST" name="Maven: org.scalatest:scalatest-compatible:3.2.12" level="project" />

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
<groupId>datastax.astra.migrate</groupId>
55
<artifactId>cassandra-data-migrator</artifactId>
6-
<version>1.4</version>
6+
<version>1.5</version>
77
<packaging>jar</packaging>
88

99
<properties>

src/main/java/datastax/astra/migrate/BaseJobSession.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,11 @@
33
import com.datastax.oss.driver.api.core.CqlSession;
44
import com.datastax.oss.driver.api.core.cql.PreparedStatement;
55
import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter;
6-
import org.slf4j.Logger;
7-
import org.slf4j.LoggerFactory;
86

97
import java.util.ArrayList;
108
import java.util.List;
119

12-
public abstract class BaseJobSession {
10+
public abstract class BaseJobSession {
1311

1412
protected PreparedStatement sourceSelectStatement;
1513
protected PreparedStatement astraSelectStatement;

src/main/java/datastax/astra/migrate/CopyJobSession.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
6262
}
6363

6464
writeLimiter.acquire(1);
65-
if (readCounter.incrementAndGet() % 1000 == 0) {
65+
if (readCounter.incrementAndGet() % printStatsAfter == 0) {
6666
logger.info("TreadID: " + Thread.currentThread().getId() + " Read Record Count: "
6767
+ readCounter.get());
6868
}
@@ -88,7 +88,7 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
8888
for (Row sourceRow : resultSet) {
8989
readLimiter.acquire(1);
9090
writeLimiter.acquire(1);
91-
if (readCounter.incrementAndGet() % 1000 == 0) {
91+
if (readCounter.incrementAndGet() % printStatsAfter == 0) {
9292
logger.info("TreadID: " + Thread.currentThread().getId() + " Read Record Count: " + readCounter.get());
9393
}
9494
batchStatement = batchStatement.add(bindInsert(astraInsertStatement, sourceRow, null));
@@ -133,7 +133,7 @@ private void iterateAndClearWriteResults(Collection<CompletionStage<AsyncResultS
133133
for (CompletionStage<AsyncResultSet> writeResult : writeResults) {
134134
//wait for the writes to complete for the batch. The Retry policy, if defined, should retry the write on timeouts.
135135
writeResult.toCompletableFuture().get().one();
136-
if (writeCounter.addAndGet(incrementBy) % 1000 == 0) {
136+
if (writeCounter.addAndGet(incrementBy) % printStatsAfter == 0) {
137137
logger.info("TreadID: " + Thread.currentThread().getId() + " Write Record Count: " + writeCounter.get());
138138
}
139139
}

src/main/java/datastax/astra/migrate/DiffJobSession.java

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@
1616

1717
public class DiffJobSession extends CopyJobSession {
1818

19-
public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
2019
private static DiffJobSession diffJobSession;
21-
20+
public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
21+
protected Boolean autoCorrectMissing = false;
22+
protected Boolean autoCorrectMismatch = false;
2223
private AtomicLong readCounter = new AtomicLong(0);
2324
private AtomicLong mismatchCounter = new AtomicLong(0);
2425
private AtomicLong missingCounter = new AtomicLong(0);
@@ -27,8 +28,15 @@ public class DiffJobSession extends CopyJobSession {
2728
private AtomicLong validCounter = new AtomicLong(0);
2829
private AtomicLong skippedCounter = new AtomicLong(0);
2930

30-
protected Boolean autoCorrectMissing = false;
31-
protected Boolean autoCorrectMismatch = false;
31+
private DiffJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
32+
super(sourceSession, astraSession, sparkConf);
33+
34+
autoCorrectMissing = Boolean.parseBoolean(sparkConf.get("spark.destination.autocorrect.missing", "false"));
35+
logger.info("PARAM -- Autocorrect Missing: " + autoCorrectMissing);
36+
37+
autoCorrectMismatch = Boolean.parseBoolean(sparkConf.get("spark.destination.autocorrect.mismatch", "false"));
38+
logger.info("PARAM -- Autocorrect Mismatch: " + autoCorrectMismatch);
39+
}
3240

3341
public static DiffJobSession getInstance(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
3442
if (diffJobSession == null) {
@@ -42,13 +50,6 @@ public static DiffJobSession getInstance(CqlSession sourceSession, CqlSession as
4250
return diffJobSession;
4351
}
4452

45-
private DiffJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
46-
super(sourceSession, astraSession, sparkConf);
47-
48-
autoCorrectMissing = Boolean.parseBoolean(sparkConf.get("spark.destination.autocorrect.missing", "false"));
49-
autoCorrectMismatch = Boolean.parseBoolean(sparkConf.get("spark.destination.autocorrect.mismatch", "false"));
50-
}
51-
5253
public void getDataAndDiff(BigInteger min, BigInteger max) {
5354
ForkJoinPool customThreadPool = new ForkJoinPool();
5455
logger.info("TreadID: " + Thread.currentThread().getId() + " Processing min: " + min + " max:" + max);
@@ -58,7 +59,7 @@ public void getDataAndDiff(BigInteger min, BigInteger max) {
5859
try {
5960
// cannot do batching if the writeFilter is greater than 0
6061
ResultSet resultSet = sourceSession.execute(
61-
sourceSelectStatement.bind(hasRandomPartitioner? min : min.longValueExact(), hasRandomPartitioner? max : max.longValueExact()).setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM));
62+
sourceSelectStatement.bind(hasRandomPartitioner ? min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact()).setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM));
6263

6364
customThreadPool.submit(() -> {
6465
StreamSupport.stream(resultSet.spliterator(), true).forEach(sRow -> {
@@ -100,7 +101,7 @@ public void printCounts(String finalStr) {
100101
+ mismatchCounter.get());
101102
logger.info("TreadID: " + Thread.currentThread().getId() + " " + finalStr + " Corrected Mismatch Count: "
102103
+ correctedMismatchCounter.get());
103-
logger.info("TreadID: " + Thread.currentThread().getId() + " " + finalStr + " Read Missing Count: "
104+
logger.info("TreadID: " + Thread.currentThread().getId() + " " + finalStr + " Read Missing Count: "
104105
+ missingCounter.get());
105106
logger.info("TreadID: " + Thread.currentThread().getId() + " " + finalStr + " Corrected Missing Count: "
106107
+ correctedMissingCounter.get());

src/main/java/datastax/astra/migrate/SplitPartitions.java

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,70 +12,70 @@
1212

1313
public class SplitPartitions {
1414

15-
public static Logger logger = LoggerFactory.getLogger(SplitPartitions.class.getName());
1615
public final static Long MIN_PARTITION = Long.MIN_VALUE;
17-
public final static Long MAX_PARTITION = Long.MAX_VALUE;
16+
public final static Long MAX_PARTITION = Long.MAX_VALUE;
17+
public static Logger logger = LoggerFactory.getLogger(SplitPartitions.class.getName());
1818

19-
public static void main(String[] args){
19+
public static void main(String[] args) {
2020
Collection<Partition> partitions = getSubPartitions(new BigInteger("20"), BigInteger.valueOf(MIN_PARTITION),
2121
BigInteger.valueOf(MAX_PARTITION), 20);
22-
for(Partition partition: partitions){
22+
for (Partition partition : partitions) {
2323
System.out.println(partition);
2424
}
2525
}
2626

27-
public static Collection<Partition> getRandomSubPartitions(BigInteger splitSize, BigInteger min, BigInteger max, int coveragePercent){
27+
public static Collection<Partition> getRandomSubPartitions(BigInteger splitSize, BigInteger min, BigInteger max, int coveragePercent) {
2828
logger.info("TreadID: " + Thread.currentThread().getId() + " Splitting min: " + min + " max:" + max);
29-
List<Partition> partitions = getSubPartitions(splitSize,min,max, coveragePercent);
29+
List<Partition> partitions = getSubPartitions(splitSize, min, max, coveragePercent);
3030
Collections.shuffle(partitions);
3131
Collections.shuffle(partitions);
3232
Collections.shuffle(partitions);
3333
Collections.shuffle(partitions);
3434
return partitions;
3535
}
3636

37-
private static List<Partition> getSubPartitions(BigInteger splitSize, BigInteger min, BigInteger max, int coveragePercent){
37+
private static List<Partition> getSubPartitions(BigInteger splitSize, BigInteger min, BigInteger max, int coveragePercent) {
3838
if (coveragePercent < 1 || coveragePercent > 100) {
3939
coveragePercent = 100;
4040
}
4141
BigInteger curMax = new BigInteger(min.toString());
42-
BigInteger partitionSize = max.subtract(min).divide(splitSize);
42+
BigInteger partitionSize = max.subtract(min).divide(splitSize);
4343
List<Partition> partitions = new ArrayList<Partition>();
44-
if(partitionSize.compareTo(new BigInteger("0"))==0){
45-
partitionSize=new BigInteger("100000");
44+
if (partitionSize.compareTo(new BigInteger("0")) == 0) {
45+
partitionSize = new BigInteger("100000");
4646
}
4747
boolean exausted = false;
48-
while(curMax.compareTo(max) <=0){
48+
while (curMax.compareTo(max) <= 0) {
4949
BigInteger curMin = new BigInteger(curMax.toString());
5050
BigInteger newCurMax = curMin.add(partitionSize);
5151
if (newCurMax.compareTo(curMax) == -1) {
5252
newCurMax = new BigInteger(max.toString());
5353
exausted = true;
5454
}
55-
if (newCurMax.compareTo(max)==1){
55+
if (newCurMax.compareTo(max) == 1) {
5656
newCurMax = new BigInteger(max.toString());
57-
exausted=true;
57+
exausted = true;
5858
}
5959
curMax = newCurMax;
6060

6161
BigInteger range = curMax.subtract(curMin);
6262
BigInteger curRange = range.multiply(BigInteger.valueOf(coveragePercent)).divide(BigInteger.valueOf(100));
63-
partitions.add(new Partition(curMin,curMin.add(curRange)));
64-
if(exausted){
63+
partitions.add(new Partition(curMin, curMin.add(curRange)));
64+
if (exausted) {
6565
break;
6666
}
6767
}
6868

6969
return partitions;
7070
}
7171

72-
public static class Partition implements Serializable{
72+
public static class Partition implements Serializable {
7373
private static final long serialVersionUID = 1L;
7474

7575
private BigInteger min;
7676
private BigInteger max;
7777

78-
public Partition(BigInteger min, BigInteger max){
78+
public Partition(BigInteger min, BigInteger max) {
7979
this.min = min;
8080
this.max = max;
8181
}
@@ -88,8 +88,8 @@ public BigInteger getMax() {
8888
return max;
8989
}
9090

91-
public String toString(){
92-
return "Processing partition for token range "+ min + " to " + max;
91+
public String toString() {
92+
return "Processing partition for token range " + min + " to " + max;
9393
}
9494
}
9595
}

src/main/scala/datastax/astra/migrate/AbstractJob.scala

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
package datastax.astra.migrate
22

33
import com.datastax.spark.connector.cql.CassandraConnector
4-
import org.apache.spark.sql.SparkSession
5-
import org.slf4j.LoggerFactory
6-
7-
import java.math.BigInteger
8-
import java.lang.Long
94

105
class AbstractJob extends BaseJob {
116

7+
abstractLogger.info("PARAM -- Min Partition: " + minPartition)
8+
abstractLogger.info("PARAM -- Max Partition: " + maxPartition)
9+
abstractLogger.info("PARAM -- Split Size: " + splitSize)
10+
abstractLogger.info("PARAM -- Coverage Percent: " + coveragePercent)
11+
abstractLogger.info("PARAM Calculated -- Total Partitions: " + partitions.size())
12+
1213
var sourceConnection = getConnection(true, sourceIsAstra, sourceScbPath, sourceHost, sourceUsername, sourcePassword, sourceReadConsistencyLevel,
1314
sourceTrustStorePath, sourceTrustStorePassword, sourceTrustStoreType, sourceKeyStorePath, sourceKeyStorePassword, sourceEnabledAlgorithms);
1415

src/main/scala/datastax/astra/migrate/BaseJob.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,11 @@ class BaseJob extends App {
4141
val destinationKeyStorePassword = sc.getConf.get("spark.destination.keyStore.password", "")
4242
val destinationEnabledAlgorithms = sc.getConf.get("spark.destination.enabledAlgorithms", "")
4343

44-
val minPartition = new BigInteger(sc.getConf.get("spark.source.minPartition","-9223372036854775808"))
45-
val maxPartition = new BigInteger(sc.getConf.get("spark.source.maxPartition","9223372036854775807"))
44+
val minPartition = new BigInteger(sc.getConf.get("spark.source.minPartition", "-9223372036854775808"))
45+
val maxPartition = new BigInteger(sc.getConf.get("spark.source.maxPartition", "9223372036854775807"))
4646
val coveragePercent = sc.getConf.get("spark.coveragePercent", "100")
4747
val splitSize = sc.getConf.get("spark.splitSize", "10000")
48-
val partitions = SplitPartitions.getRandomSubPartitions(BigInteger.valueOf(Long.parseLong(splitSize)), minPartition, maxPartition,Integer.parseInt(coveragePercent))
48+
val partitions = SplitPartitions.getRandomSubPartitions(BigInteger.valueOf(Long.parseLong(splitSize)), minPartition, maxPartition, Integer.parseInt(coveragePercent))
4949

5050
protected def exitSpark() = {
5151
spark.stop()

0 commit comments

Comments (0)