Commit 869b8fc

Merge pull request #36 from datastax/feature/filter-exclustion-copyjob

Added filtering code to CopyJob.

2 parents 7e8f61b + 6f551d9

7 files changed (+57 −27 lines changed)

pom.xml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 
     <groupId>datastax.astra.migrate</groupId>
     <artifactId>cassandra-data-migrator</artifactId>
-    <version>2.6</version>
+    <version>2.7</version>
     <packaging>jar</packaging>
 
     <properties>

src/main/java/datastax/astra/migrate/BaseJobSession.java

Lines changed: 18 additions & 0 deletions
@@ -48,7 +48,25 @@ public abstract class BaseJobSession {
     protected String astraKeyspaceTable;
 
     protected Boolean hasRandomPartitioner;
+    protected Boolean filterData;
+    protected String filterColName;
+    protected String filterColType;
+    protected Integer filterColIndex;
+    protected String filterColValue;
 
+    public String getKey(Row sourceRow) {
+        StringBuffer key = new StringBuffer();
+        for (int index = 0; index < idColTypes.size(); index++) {
+            MigrateDataType dataType = idColTypes.get(index);
+            if (index == 0) {
+                key.append(getData(dataType, index, sourceRow));
+            } else {
+                key.append(" %% " + getData(dataType, index, sourceRow));
+            }
+        }
+
+        return key.toString();
+    }
     public List<MigrateDataType> getTypes(String types) {
         List<MigrateDataType> dataTypes = new ArrayList<MigrateDataType>();
         for (String type : types.split(",")) {
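The getKey helper added here builds a human-readable identifier for a row by joining its primary-key column values with a " %% " separator; the new filtering code in CopyJobSession uses it to log which rows were skipped. A minimal standalone sketch of the joining logic, with the driver Row and the getData lookup replaced by a plain list of hypothetical values:

    import java.util.Arrays;
    import java.util.List;

    public class KeySketch {
        // Same separator scheme as getKey: first value bare, every
        // subsequent value prefixed with " %% ".
        static String buildKey(List<Object> idColValues) {
            StringBuilder key = new StringBuilder();
            for (int index = 0; index < idColValues.size(); index++) {
                if (index > 0) {
                    key.append(" %% ");
                }
                key.append(idColValues.get(index));
            }
            return key.toString();
        }

        public static void main(String[] args) {
            // A two-column primary key (text, int) renders as "abc %% 42".
            System.out.println(buildKey(Arrays.asList("abc", 42)));
        }
    }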

src/main/java/datastax/astra/migrate/CopyJobSession.java

Lines changed: 23 additions & 0 deletions
@@ -22,6 +22,11 @@ public class CopyJobSession extends AbstractJobSession {
 
     protected CopyJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) {
         super(sourceSession, astraSession, sc);
+        filterData = Boolean.parseBoolean(sc.get("spark.origin.FilterData", "false"));
+        filterColName = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumn");
+        filterColType = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumnType");
+        filterColIndex = Integer.parseInt(sc.get("spark.origin.FilterColumnIndex", "0"));
+        filterColValue = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumnValue");
     }
 
     public static CopyJobSession getInstance(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) {

@@ -52,6 +57,14 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
         for (Row sourceRow : resultSet) {
             readLimiter.acquire(1);
 
+            if (filterData) {
+                String col = (String) getData(new MigrateDataType(filterColType), filterColIndex, sourceRow);
+                if (col.trim().equalsIgnoreCase(filterColValue)) {
+                    logger.warn("Skipping row and filtering out: " + getKey(sourceRow));
+                    skippedCounter.incrementAndGet();
+                    continue;
+                }
+            }
             if (writeTimeStampFilter) {
                 // only process rows greater than writeTimeStampFilter
                 Long sourceWriteTimeStamp = getLargestWriteTimeStamp(sourceRow);

@@ -92,6 +105,16 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
                 if (readCounter.incrementAndGet() % printStatsAfter == 0) {
                     printCounts(false);
                 }
+
+                if (filterData) {
+                    String colValue = (String) getData(new MigrateDataType(filterColType), filterColIndex, sourceRow);
+                    if (colValue.trim().equalsIgnoreCase(filterColValue)) {
+                        logger.warn("Skipping row and filtering out: " + getKey(sourceRow));
+                        skippedCounter.incrementAndGet();
+                        continue;
+                    }
+                }
+
                 batchStatement = batchStatement.add(bindInsert(astraInsertStatement, sourceRow, null));
 
                 // if batch threshold is met, send the writes and clear the batch
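Both new hunks in getDataAndInsert apply the same exclusion rule at the two points where a source row is about to be bound for writing: when spark.origin.FilterData is true, the configured column is read as a String and compared, trimmed and case-insensitive, against spark.origin.FilterColumnValue; matches are logged via getKey, counted in skippedCounter, and skipped with continue. A hedged sketch of that predicate in isolation (in the real code colValue comes from getData(new MigrateDataType(filterColType), filterColIndex, sourceRow) and is not null-guarded):

    // Returns true when the row should be excluded from the copy.
    static boolean shouldSkip(boolean filterData, String colValue, String filterColValue) {
        return filterData
                && colValue != null   // guard added in this sketch only
                && colValue.trim().equalsIgnoreCase(filterColValue);
    }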

src/main/java/datastax/astra/migrate/DiffJobSession.java

Lines changed: 0 additions & 14 deletions
@@ -186,18 +186,4 @@ private String isDifferent(Row sourceRow, Row astraRow) {
         return diffData.toString();
     }
 
-    private String getKey(Row sourceRow) {
-        StringBuffer key = new StringBuffer();
-        for (int index = 0; index < idColTypes.size(); index++) {
-            MigrateDataType dataType = idColTypes.get(index);
-            if (index == 0) {
-                key.append(getData(dataType, index, sourceRow));
-            } else {
-                key.append(" %% " + getData(dataType, index, sourceRow));
-            }
-        }
-
-        return key.toString();
-    }
-
 }

src/main/java/datastax/astra/migrate/OriginCountJobSession.java

Lines changed: 2 additions & 5 deletions
@@ -23,9 +23,6 @@ public class OriginCountJobSession extends BaseJobSession {
     protected List<Integer> updateSelectMapping = new ArrayList<Integer>();
     protected Boolean checkTableforColSize;
     protected String checkTableforselectCols;
-    protected String filterColName;
-    protected String filterColType;
-    protected Integer filterColIndex;
     protected Integer fieldGuardraillimitMB;
     protected List<MigrateDataType> checkTableforColSizeTypes = new ArrayList<MigrateDataType>();
 

@@ -46,8 +43,8 @@ protected OriginCountJobSession(CqlSession sourceSession, SparkConf sparkConf) {
         checkTableforColSize = Boolean.parseBoolean(sparkConf.get("spark.origin.checkTableforColSize", "false"));
         checkTableforselectCols = sparkConf.get("spark.origin.checkTableforColSize.cols");
         checkTableforColSizeTypes = getTypes(sparkConf.get("spark.origin.checkTableforColSize.cols.types"));
-        filterColName = sparkConf.get("spark.origin.FilterColumn");
-        filterColType = sparkConf.get("spark.origin.FilterColumnType");
+        filterColName = Util.getSparkPropOrEmpty(sparkConf, "spark.origin.FilterColumn");
+        filterColType = Util.getSparkPropOrEmpty(sparkConf, "spark.origin.FilterColumnType");
         filterColIndex = Integer.parseInt(sparkConf.get("spark.origin.FilterColumnIndex", "0"));
         fieldGuardraillimitMB = Integer.parseInt(sparkConf.get("spark.fieldGuardraillimitMB", "0"));

src/main/scala/datastax/astra/migrate/BaseJob.scala

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ class BaseJob extends App {
   val maxPartition = new BigInteger(Util.getSparkPropOr(sc, "spark.origin.maxPartition", "9223372036854775807"))
   val coveragePercent = Util.getSparkPropOr(sc, "spark.coveragePercent", "100")
   val splitSize = Integer.parseInt(Util.getSparkPropOr(sc, "spark.splitSize", "10000"))
-
+
   protected def exitSpark() = {
     spark.stop()
     abstractLogger.info("################################################################################################")

src/resources/sparkConf.properties

Lines changed: 12 additions & 6 deletions
@@ -4,12 +4,6 @@ spark.origin.username some-username
 spark.origin.password some-secret-password
 spark.origin.read.consistency.level LOCAL_QUORUM
 spark.origin.keyspaceTable test.a1
-spark.origin.checkTableforColSize false
-spark.origin.checkTableforColSize.cols partition-key,clustering-key
-spark.origin.checkTableforColSize.cols.types 9,1
-spark.origin.FilterColumn test
-spark.origin.FilterColumnIndex 2
-spark.origin.FilterColumnType 6%16
 
 spark.target.isAstra true
 spark.target.scb file:///aaa/bbb/secure-connect-enterprise.zip

@@ -46,6 +40,18 @@ spark.origin.writeTimeStampFilter false
 spark.origin.minWriteTimeStampFilter 0
 spark.origin.maxWriteTimeStampFilter 9223372036854775807
 
+################### ONLY USE if needing to get record count of recs greater than 10MB from Origin ######################
+#spark.origin.checkTableforColSize false
+#spark.origin.checkTableforColSize.cols partition-key,clustering-key
+#spark.origin.checkTableforColSize.cols.types 9,1
+
+########################## ONLY USE if needing to filter data from Origin ###############################
+#spark.origin.FilterData false
+#spark.origin.FilterColumn test
+#spark.origin.FilterColumnIndex 2
+#spark.origin.FilterColumnType 6%16
+#spark.origin.FilterColumnValue test
+
 ########################## ONLY USE if SSL clientAuth is enabled on origin Cassandra/DSE ###############################
 #spark.origin.trustStore.path
 #spark.origin.trustStore.password
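To enable the new exclusion feature, the filter block above would be uncommented and spark.origin.FilterData flipped to true (CopyJobSession defaults it to false). With the sample values shown, any row whose column at index 2 reads as a string that trims and case-insensitively matches "test" is skipped:

    spark.origin.FilterData true
    spark.origin.FilterColumn test
    spark.origin.FilterColumnIndex 2
    spark.origin.FilterColumnType 6%16
    spark.origin.FilterColumnValue test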
