Commit cda595b

Merge pull request #23 from datastax/feature/udt-diff-fix
DiffData UDT fix
2 parents 557a9eb + a24a700

5 files changed: +41 -26 lines changed

README.md

Lines changed: 10 additions & 5 deletions
@@ -4,6 +4,12 @@ Spark jobs in this repo can be used for data migration and data validation.
 
 > :warning: Please note this job has been tested with spark version [2.4.8](https://archive.apache.org/dist/spark/spark-2.4.8/)
 
+## Build
+1. Clone this repo
+2. Move to the repo folder `cd cassandra-data-migrator`
+3. Run the build `mvn clean package`
+4. The fat jar (`cassandra-data-migrator-2.x.jar`) file should now be present in the `target` folder
+
 ## Prerequisite
 
 Install Java8 as spark binaries are compiled with it.
@@ -19,13 +25,12 @@ tar -xvzf <spark downloaded file name>
 1. `sparkConf.properties` file needs to be configured as applicable for the environment
 > A sample Spark conf file configuration can be [found here](./src/resources/sparkConf.properties)
 2. Place the conf file where it can be accessed while running the job via spark-submit.
-3. Generate a fat jar (`cassandra-data-migrator-1.x.jar`) using command `mvn clean package`
-4. Run the 'Data Migration' job using `spark-submit` command as shown below:
+3. Run the 'Data Migration' job using `spark-submit` command as shown below:
 
 ```
 ./spark-submit --properties-file sparkConf.properties /
 --master "local[*]" /
---class datastax.astra.migrate.Migrate cassandra-data-migrator-1.x.jar &> logfile_name.txt
+--class datastax.astra.migrate.Migrate cassandra-data-migrator-2.x.jar &> logfile_name.txt
 ```
 
 Note: Above command also generates a log file `logfile_name.txt` to avoid log output on the console.
@@ -38,7 +43,7 @@ Note: Above command also generates a log file `logfile_name.txt` to avoid log ou
 ```
 ./spark-submit --properties-file sparkConf.properties /
 --master "local[*]" /
---class datastax.astra.migrate.DiffData cassandra-data-migrator-1.x.jar &> logfile_name.txt
+--class datastax.astra.migrate.DiffData cassandra-data-migrator-2.x.jar &> logfile_name.txt
 ```
 
 - Validation job will report differences as “ERRORS” in the log file as shown below
@@ -67,7 +72,7 @@ spark.target.autocorrect.mismatch true|false
 ```
 ./spark-submit --properties-file sparkConf.properties /
 --master "local[*]" /
---class datastax.astra.migrate.MigratePartitionsFromFile cassandra-data-migrator-1.x.jar &> logfile_name.txt
+--class datastax.astra.migrate.MigratePartitionsFromFile cassandra-data-migrator-2.x.jar &> logfile_name.txt
 ```
 
 When running in above mode the tool assumes a `partitions.csv` file to be present in the current folder in the below format, where each line (`min,max`) represents a partition-range
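
For illustration, a `partitions.csv` in that `min,max` format might look like the following (hypothetical values: a Murmur3 token ring split into four equal ranges; a real file would use ranges computed for the source cluster):

```
-9223372036854775808,-4611686018427387905
-4611686018427387904,-1
0,4611686018427387903
4611686018427387904,9223372036854775807
```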

pom.xml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 
     <groupId>datastax.astra.migrate</groupId>
     <artifactId>cassandra-data-migrator</artifactId>
-    <version>2.3</version>
+    <version>2.4</version>
     <packaging>jar</packaging>
 
     <properties>

src/main/java/datastax/astra/migrate/DiffJobSession.java

Lines changed: 10 additions & 1 deletion
@@ -5,6 +5,7 @@
 import com.datastax.oss.driver.api.core.cql.AsyncResultSet;
 import com.datastax.oss.driver.api.core.cql.ResultSet;
 import com.datastax.oss.driver.api.core.cql.Row;
+import com.datastax.oss.driver.api.core.data.UdtValue;
 import org.apache.spark.SparkConf;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -172,7 +173,15 @@ private String isDifferent(Row sourceRow, Row astraRow) {
 
                 boolean isDiff = dataType.diff(source, astra);
                 if (isDiff) {
-                    diffData.append("(Index: " + index + " Origin: " + source + " Target: " + astra + " ) ");
+                    if (dataType.typeClass.equals(UdtValue.class)) {
+                        String sourceUdtContent = ((UdtValue) source).getFormattedContents();
+                        String astraUdtContent = ((UdtValue) astra).getFormattedContents();
+                        if (!sourceUdtContent.equals(astraUdtContent)) {
+                            diffData.append("(Index: " + index + " Origin: " + sourceUdtContent + " Target: " + astraUdtContent + ") ");
+                        }
+                    } else {
+                        diffData.append("(Index: " + index + " Origin: " + source + " Target: " + astra + ") ");
+                    }
                 }
             });
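
The new branch only reports a UDT mismatch when the formatted contents actually differ, which suggests `dataType.diff` was over-reporting differences for `UdtValue` columns. A minimal standalone sketch of that comparison logic, with hypothetical names (`describeDiff` and `UdtDiffSketch` are illustrative; the real job builds up `diffData` inside `isDifferent`):

```
import com.datastax.oss.driver.api.core.data.UdtValue;

public class UdtDiffSketch {

    // Hypothetical helper mirroring the merged logic: for UDT values, trust
    // the driver's formatted contents rather than object equality, which can
    // flag a difference even when every field matches.
    static String describeDiff(int index, Object source, Object target) {
        if (source instanceof UdtValue && target instanceof UdtValue) {
            String sourceContents = ((UdtValue) source).getFormattedContents();
            String targetContents = ((UdtValue) target).getFormattedContents();
            if (sourceContents.equals(targetContents)) {
                return ""; // formatted contents match: not a real difference
            }
            return "(Index: " + index + " Origin: " + sourceContents
                    + " Target: " + targetContents + ") ";
        }
        return "(Index: " + index + " Origin: " + source
                + " Target: " + target + ") ";
    }
}
```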

src/main/java/datastax/astra/migrate/OriginCountJobSession.java

Lines changed: 18 additions & 17 deletions
@@ -16,9 +16,9 @@
 import java.util.concurrent.CompletionStage;
 import java.util.concurrent.atomic.AtomicLong;
 
-public class OriginCountJobSession extends BaseJobSession{
-    public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
+public class OriginCountJobSession extends BaseJobSession {
     private static OriginCountJobSession originCountJobSession;
+    public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
     protected AtomicLong readCounter = new AtomicLong(0);
     protected List<Integer> updateSelectMapping = new ArrayList<Integer>();
     protected Boolean checkTableforColSize;
@@ -28,17 +28,6 @@ public class OriginCountJobSession extends BaseJobSession{
     protected Integer filterColIndex;
     protected Integer fieldGuardraillimitMB;
     protected List<MigrateDataType> checkTableforColSizeTypes = new ArrayList<MigrateDataType>();
-    public static OriginCountJobSession getInstance(CqlSession sourceSession, SparkConf sparkConf) {
-        if (originCountJobSession == null) {
-            synchronized (OriginCountJobSession.class) {
-                if (originCountJobSession == null) {
-                    originCountJobSession = new OriginCountJobSession(sourceSession, sparkConf);
-                }
-            }
-        }
-
-        return originCountJobSession;
-    }
 
     protected OriginCountJobSession(CqlSession sourceSession, SparkConf sparkConf) {
         this.sourceSession = sourceSession;
@@ -59,8 +48,8 @@ protected OriginCountJobSession(CqlSession sourceSession, SparkConf sparkConf) {
         checkTableforColSizeTypes = getTypes(sparkConf.get("spark.origin.checkTableforColSize.cols.types"));
         filterColName = sparkConf.get("spark.origin.FilterColumn");
         filterColType = sparkConf.get("spark.origin.FilterColumnType");
-            filterColIndex = Integer.parseInt(sparkConf.get("spark.origin.FilterColumnIndex", "0"));
-            fieldGuardraillimitMB = Integer.parseInt(sparkConf.get("spark.fieldGuardraillimitMB", "0"));
+        filterColIndex = Integer.parseInt(sparkConf.get("spark.origin.FilterColumnIndex", "0"));
+        fieldGuardraillimitMB = Integer.parseInt(sparkConf.get("spark.fieldGuardraillimitMB", "0"));
 
         String partionKey = sparkConf.get("spark.query.cols.partitionKey");
         idColTypes = getTypes(sparkConf.get("spark.query.cols.id.types"));
@@ -77,6 +66,18 @@ protected OriginCountJobSession(CqlSession sourceSession, SparkConf sparkConf) {
 
     }
 
+    public static OriginCountJobSession getInstance(CqlSession sourceSession, SparkConf sparkConf) {
+        if (originCountJobSession == null) {
+            synchronized (OriginCountJobSession.class) {
+                if (originCountJobSession == null) {
+                    originCountJobSession = new OriginCountJobSession(sourceSession, sparkConf);
+                }
+            }
+        }
+
+        return originCountJobSession;
+    }
+
     public void getData(BigInteger min, BigInteger max) {
         logger.info("TreadID: " + Thread.currentThread().getId() + " Processing min: " + min + " max:" + max);
         int maxAttempts = maxRetries;
@@ -93,7 +94,7 @@ public void getData(BigInteger min, BigInteger max) {
             for (Row sourceRow : resultSet) {
                 readLimiter.acquire(1);
 
-                if(checkTableforColSize) {
+                if (checkTableforColSize) {
                     int rowColcnt = GetRowColumnLength(sourceRow, filterColType, filterColIndex);
                     String result = "";
                     if (rowColcnt > fieldGuardraillimitMB * 1048576) {
@@ -115,7 +116,7 @@ public void getData(BigInteger min, BigInteger max) {
                 readLimiter.acquire(1);
                 writeLimiter.acquire(1);
 
-                if(checkTableforColSize) {
+                if (checkTableforColSize) {
                     int rowColcnt = GetRowColumnLength(sourceRow, filterColType, filterColIndex);
                     String result = "";
                     if (rowColcnt > fieldGuardraillimitMB * 1048576) {
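
Most of this file's delta is the `getInstance` factory moving below the constructor, plus whitespace fixes. `getInstance` is a double-checked-locking singleton; a minimal self-contained sketch of that pattern follows. Note the `volatile` modifier, which the Java Memory Model requires for safe double-checked locking (the `originCountJobSession` field in the diff above is not declared `volatile`):

```
// Minimal sketch of the double-checked-locking singleton pattern used by
// getInstance. volatile is required for safe publication under the JMM;
// the field in the diff above omits it.
public class LazySingleton {
    private static volatile LazySingleton instance;

    private LazySingleton() {
    }

    public static LazySingleton getInstance() {
        if (instance == null) {                 // fast path: no lock once initialized
            synchronized (LazySingleton.class) {
                if (instance == null) {         // re-check while holding the lock
                    instance = new LazySingleton();
                }
            }
        }
        return instance;
    }
}
```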

src/main/scala/datastax/astra/migrate/OriginData.scala

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@ import org.slf4j.LoggerFactory
 
 import scala.collection.JavaConversions._
 
-object OriginData extends BaseJob { 
+object OriginData extends BaseJob {
 
   val logger = LoggerFactory.getLogger(this.getClass.getName)
   logger.info("Started Migration App")
@@ -65,7 +65,7 @@ object OriginData extends BaseJob {
   private def analyzeSourceTable(sourceConnection: CassandraConnector) = {
     val partitions = SplitPartitions.getRandomSubPartitions(splitSize, minPartition, maxPartition, Integer.parseInt(coveragePercent))
     logger.info("PARAM Calculated -- Total Partitions: " + partitions.size())
-    val parts = sContext.parallelize(partitions.toSeq,partitions.size);
+    val parts = sContext.parallelize(partitions.toSeq, partitions.size);
     logger.info("Spark parallelize created : " + parts.count() + " parts!");
 
     parts.foreach(part => {
