
Commit 18f04e6

CDM-87: Refactored exception handling and loading of token-range filters to use the same Migrate & DiffData jobs instead of separate jobs to reduce code & maintenance overhead
1 parent 6c8cbd9 commit 18f04e6

16 files changed: +92 -135 lines changed

README.md

Lines changed: 12 additions & 25 deletions
@@ -82,40 +82,27 @@ spark.cdm.autocorrect.mismatch false|true
 Note:
 - The validation job will never delete records from target i.e. it only adds or updates data on target
 
-# Migrating specific partition ranges
-- You can also use the tool to migrate specific partition ranges using class option `--class com.datastax.cdm.job.MigratePartitionsFromFile` as shown below
-```
-./spark-submit --properties-file cdm.properties /
---conf spark.cdm.schema.origin.keyspaceTable="<keyspace-name>.<table-name>" /
---master "local[*]" /
---class com.datastax.cdm.job.MigratePartitionsFromFile cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
-```
-
-When running in above mode the tool assumes a `partitions.csv` file to be present in the current folder in the below format, where each line (`min,max`) represents a partition-range
+# Migrating or Validating specific partition ranges
+- You can also use the tool to migrate or validate specific partition ranges by placing a partition file named `./<keyspace>.<tablename>_partitions.csv` (in the below format) in the current folder as input
 ```
 -507900353496146534,-107285462027022883
 -506781526266485690,1506166634797362039
 2637884402540451982,4638499294009575633
 798869613692279889,8699484505161403540
 ```
-This mode is specifically useful to processes a subset of partition-ranges that may have failed during a previous run.
-
-> **Note:**
-> A file ending with `*_partitions.csv` will be auto created by the Migration & Validation job in the above format containing any failed partition ranges. Just rename it as below & run the above job.
+Each line above represents a partition-range (`min,max`). Alternatively, you can pass the partition file via a command-line param as shown below
 
 ```
-mv <keyspace>.<table>_partitions.csv partitions.csv
-```
-# Data validation for specific partition ranges
-- You can also use the tool to validate data for a specific partition ranges using class option `--class com.datastax.cdm.job.DiffPartitionsFromFile` as shown below,
-```
-./spark-submit --properties-file cdm.properties /
---conf spark.cdm.schema.origin.keyspaceTable="<keyspace-name>.<table-name>" /
---master "local[*]" /
---class com.datastax.cdm.job.DiffPartitionsFromFile cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
+spark-submit --properties-file cdm.properties /
+--conf spark.cdm.schema.origin.keyspaceTable="test_ks.cat_promo" /
+--conf spark.tokenrange.partitionFile="/<path-to-file>.<csv-input-filename>" /
+--master "local[*]" /
+--class com.datastax.cdm.job.<Migrate|DiffData> cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
 ```
+This mode is specifically useful to process a subset of partition-ranges that may have failed during a previous run.
 
-When running in above mode the tool assumes a `partitions.csv` file to be present in the current folder.
+> **Note:**
+> A file named `./<keyspace>.<tablename>_partitions.csv` will be auto-created by the Migration & Validation job in the above format, containing any failed partition ranges. You can use this file as an input to process any failed partitions in a subsequent run.
 
 # Perform large-field Guardrail violation checks
 - The tool can be used to identify large fields from a table that may break your cluster guardrails (e.g. AstraDB has a 10MB limit for a single large field) `--class com.datastax.cdm.job.GuardrailCheck` as shown below
@@ -132,7 +119,7 @@ When running in above mode the tool assumes a `partitions.csv` file to be presen
 - Including counter table [Counter tables](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_using/useCountersConcept.html)
 - Preserve [writetimes](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__retrieving-the-datetime-a-write-occurred-p) and [TTLs](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__ref-select-ttl-p)
 - Supports migration/validation of advanced DataTypes ([Sets](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__set), [Lists](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__list), [Maps](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__map), [UDTs](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__udt))
-- Filter records from `Origin` using `writetimes` and/or CQL conditions and/or min/max token-range
+- Filter records from `Origin` using `writetimes` and/or CQL conditions and/or a list of token-ranges
 - Perform guardrail checks (identify large fields)
 - Supports adding `constants` as new columns on `Target`
 - Supports expanding `Map` columns on `Origin` into multiple records on `Target`
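
For clarity, here is a minimal standalone sketch (hypothetical class name, not part of this commit) showing how a `min,max` line in the partition-file format above maps to a token range:

```java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.math.BigInteger;

public class PartitionLineParsingDemo {
    public static void main(String[] args) throws IOException {
        // Sample lines in the same min,max format shown in the README above.
        BufferedReader reader = new BufferedReader(new StringReader(
                "-507900353496146534,-107285462027022883\n2637884402540451982,4638499294009575633"));
        String line;
        while ((line = reader.readLine()) != null) {
            String[] minMax = line.split(",");
            BigInteger min = new BigInteger(minMax[0].trim()); // lower token bound
            BigInteger max = new BigInteger(minMax[1].trim()); // upper token bound
            System.out.println("token range " + min + " to " + max);
        }
    }
}
```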

RELEASE.md

Lines changed: 3 additions & 0 deletions
@@ -1,4 +1,7 @@
 # Release Notes
+## [4.0.2] - 2023-06-20
+- Refactored exception handling and loading of token-range filters to use the same Migrate & DiffData jobs instead of separate jobs to reduce code & maintenance overhead
+
 ## [4.0.2] - 2023-06-16
 - Capture failed partitions in a file for easier reruns
 - Optimized mvn to reduce jar size

src/main/java/com/datastax/cdm/job/AbstractJobSession.java

Lines changed: 4 additions & 4 deletions
@@ -19,10 +19,12 @@ public abstract class AbstractJobSession<T> extends BaseJobSession {
     protected EnhancedSession targetSession;
     protected Guardrail guardrailFeature;
     protected boolean guardrailEnabled;
-    protected String tokenRangeExceptionDir;
+    protected String partitionFile = SplitPartitions.getPartitionFile(propertyHelper);
+
     protected AbstractJobSession(CqlSession originSession, CqlSession targetSession, SparkConf sc) {
         this(originSession, targetSession, sc, false);
     }
+
     protected AbstractJobSession(CqlSession originSession, CqlSession targetSession, SparkConf sc, boolean isJobMigrateRowsFromFile) {
         super(sc);
 
@@ -41,10 +43,8 @@ protected AbstractJobSession(CqlSession originSession, CqlSession targetSession,
         rateLimiterTarget = RateLimiter.create(propertyHelper.getInteger(KnownProperties.PERF_RATELIMIT_TARGET));
         maxRetries = propertyHelper.getInteger(KnownProperties.MAX_RETRIES);
 
-        tokenRangeExceptionDir = propertyHelper.getString(KnownProperties.TOKEN_RANGE_EXCEPTION_DIR);
-
         logger.info("PARAM -- Max Retries: {}", maxRetries);
-        logger.info("PARAM -- Token range exception dir: {}", tokenRangeExceptionDir);
+        logger.info("PARAM -- Partition file: {}", partitionFile);
         logger.info("PARAM -- Origin Rate Limit: {}", rateLimiterOrigin.getRate());
         logger.info("PARAM -- Target Rate Limit: {}", rateLimiterTarget.getRate());

src/main/java/com/datastax/cdm/job/BaseJobSession.java

Lines changed: 17 additions & 17 deletions
@@ -3,6 +3,7 @@
 import com.datastax.cdm.feature.Feature;
 import com.datastax.cdm.feature.FeatureFactory;
 import com.datastax.cdm.feature.Featureset;
+import com.datastax.cdm.properties.KnownProperties;
 import com.datastax.cdm.properties.PropertyHelper;
 import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter;
 import org.apache.commons.lang3.StringUtils;
@@ -25,14 +26,13 @@ public abstract class BaseJobSession {
 
     public static final String THREAD_CONTEXT_LABEL = "ThreadLabel";
     protected static final String NEW_LINE = System.lineSeparator();
+    private final Logger logger = LoggerFactory.getLogger(this.getClass().getName());
     protected PropertyHelper propertyHelper = PropertyHelper.getInstance();
     protected Map<Featureset, Feature> featureMap;
-
     protected RateLimiter rateLimiterOrigin;
     protected RateLimiter rateLimiterTarget;
     protected Integer maxRetries = 10;
     protected Integer printStatsAfter = 100000;
-    private final Logger logger = LoggerFactory.getLogger(this.getClass().getName());
 
     protected BaseJobSession(SparkConf sc) {
         propertyHelper.initializeSparkConf(sc);
@@ -68,26 +68,26 @@ protected String getThreadLabel(BigInteger min, BigInteger max) {
         return formattedMin + ":" + formattedMax;
     }
 
-    private void appendToFile(Path path, String content)
-            throws IOException {
-        // if file not exists, create and write, else append
-        Files.write(path, content.getBytes(StandardCharsets.UTF_8),
-                StandardOpenOption.CREATE,
-                StandardOpenOption.APPEND);
-    }
+    private void appendToFile(String filePath, String content) throws IOException {
+        if (StringUtils.isAllBlank(filePath)) {
+            filePath = "./" + propertyHelper.getString(KnownProperties.ORIGIN_KEYSPACE_TABLE) + "_partitions.csv";
 
-    private void FileAppend(String dir, String fileName, String content) throws IOException {
-        if (StringUtils.isAllBlank(dir)) {
-            dir = "./"; // use current folder by default
         }
-        Files.createDirectories(Paths.get(dir));
-        Path path = Paths.get(dir + "/" + fileName + "_partitions.csv");
-        appendToFile(path, content + NEW_LINE);
+        Path path = Paths.get(filePath);
+        if (StringUtils.isNotBlank(path.getParent().toString())) {
+            Files.createDirectories(path.getParent());
+        } else {
+            path = Paths.get("./" + filePath);
+        }
+
+        Files.write(path, (content + NEW_LINE).getBytes(StandardCharsets.UTF_8),
+                StandardOpenOption.CREATE,
+                StandardOpenOption.APPEND);
     }
 
-    protected void logFailedPartitionsInFile(String dir, String fileName, BigInteger min, BigInteger max) {
+    protected void logFailedPartitionsInFile(String partitionFile, BigInteger min, BigInteger max) {
         try {
-            FileAppend(dir, fileName, min + "," + max);
+            appendToFile(partitionFile, min + "," + max);
         } catch (Exception ee) {
             logger.error("Error occurred while writing to token range file min: {} max: {}", min, max, ee);
         }
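
As a side note, here is a minimal standalone sketch (hypothetical class and file name, not part of this commit) of the `CREATE` + `APPEND` semantics the refactored `appendToFile` relies on: the file is created when absent and appended to otherwise, so each failed range becomes one `min,max` line.

```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;

public class AppendSemanticsDemo {
    public static void main(String[] args) throws IOException {
        // Hypothetical file name; the job itself resolves this from
        // spark.tokenrange.partitionFile or defaults to
        // ./<keyspace>.<table>_partitions.csv in the current folder.
        Path path = Paths.get("./demo_partitions.csv");
        String line = "-507900353496146534,-107285462027022883" + System.lineSeparator();
        // CREATE + APPEND: create the file if it does not exist, append if it does.
        Files.write(path, line.getBytes(StandardCharsets.UTF_8),
                StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    }
}
```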

src/main/java/com/datastax/cdm/job/CopyJobSession.java

Lines changed: 2 additions & 4 deletions
@@ -6,7 +6,6 @@
 import com.datastax.cdm.data.PKFactory;
 import com.datastax.cdm.data.Record;
 import com.datastax.cdm.feature.Guardrail;
-import com.datastax.cdm.properties.KnownProperties;
 import com.datastax.oss.driver.api.core.CqlSession;
 import com.datastax.oss.driver.api.core.cql.*;
 import org.apache.logging.log4j.ThreadContext;
@@ -26,14 +25,14 @@ public class CopyJobSession extends AbstractJobSession<SplitPartitions.Partition
     private final PKFactory pkFactory;
     private final boolean isCounterTable;
     private final Integer fetchSize;
+    private final Integer batchSize;
     public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
     protected AtomicLong readCounter = new AtomicLong(0);
     protected AtomicLong skippedCounter = new AtomicLong(0);
     protected AtomicLong writeCounter = new AtomicLong(0);
     protected AtomicLong errorCounter = new AtomicLong(0);
     private TargetUpsertStatement targetUpsertStatement;
     private TargetSelectByPKStatement targetSelectByPKStatement;
-    private final Integer batchSize;
 
     protected CopyJobSession(CqlSession originSession, CqlSession targetSession, SparkConf sc) {
         super(originSession, targetSession, sc);
@@ -128,8 +127,7 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
                 writeCounter.addAndGet(flushedWriteCnt);
                 skippedCounter.addAndGet(skipCnt);
                 errorCounter.addAndGet(readCnt - flushedWriteCnt - skipCnt);
-                logFailedPartitionsInFile(tokenRangeExceptionDir,
-                        propertyHelper.getString(KnownProperties.ORIGIN_KEYSPACE_TABLE), min, max);
+                logFailedPartitionsInFile(partitionFile, min, max);
             }
             logger.error("Error occurred during Attempt#: {}", attempts, e);
             logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Attempt# {}",

src/main/java/com/datastax/cdm/job/CopyPKJobSession.java

Lines changed: 10 additions & 11 deletions
@@ -1,12 +1,12 @@
 package com.datastax.cdm.job;
 
-import com.datastax.cdm.feature.Guardrail;
-import com.datastax.oss.driver.api.core.CqlSession;
-import com.datastax.oss.driver.api.core.cql.Row;
+import com.datastax.cdm.cql.statement.OriginSelectByPKStatement;
 import com.datastax.cdm.data.EnhancedPK;
 import com.datastax.cdm.data.PKFactory;
 import com.datastax.cdm.data.Record;
-import com.datastax.cdm.cql.statement.OriginSelectByPKStatement;
+import com.datastax.cdm.feature.Guardrail;
+import com.datastax.oss.driver.api.core.CqlSession;
+import com.datastax.oss.driver.api.core.cql.Row;
 import org.apache.spark.SparkConf;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -20,15 +20,14 @@
 public class CopyPKJobSession extends AbstractJobSession<SplitPartitions.PKRows> {
 
     private static CopyPKJobSession copyJobSession;
+    private final PKFactory pkFactory;
+    private final List<Class> originPKClasses;
+    private final boolean isCounterTable;
     public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
     protected AtomicLong readCounter = new AtomicLong(0);
     protected AtomicLong missingCounter = new AtomicLong(0);
     protected AtomicLong skipCounter = new AtomicLong(0);
     protected AtomicLong writeCounter = new AtomicLong(0);
-
-    private final PKFactory pkFactory;
-    private final List<Class> originPKClasses;
-    private final boolean isCounterTable;
     private OriginSelectByPKStatement originSelectByPKStatement;
 
     protected CopyPKJobSession(CqlSession originSession, CqlSession targetSession, SparkConf sc) {
@@ -37,7 +36,7 @@ protected CopyPKJobSession(CqlSession originSession, CqlSession targetSession, S
         isCounterTable = this.originSession.getCqlTable().isCounterTable();
         originPKClasses = this.originSession.getCqlTable().getPKClasses();
 
-        logger.info("CQL -- origin select: {}",this.originSession.getOriginSelectByPKStatement().getCQL());
+        logger.info("CQL -- origin select: {}", this.originSession.getOriginSelectByPKStatement().getCQL());
     }
 
     @Override
@@ -47,7 +46,7 @@ public void processSlice(SplitPartitions.PKRows slice) {
 
     public void getRowAndInsert(SplitPartitions.PKRows rowsList) {
         originSelectByPKStatement = originSession.getOriginSelectByPKStatement();
-        for (String row : rowsList.pkRows) {
+        for (String row : rowsList.getPkRows()) {
             readCounter.incrementAndGet();
             EnhancedPK pk = toEnhancedPK(row);
             if (null == pk || pk.isError()) {
@@ -110,7 +109,7 @@ private EnhancedPK toEnhancedPK(String rowString) {
         String[] pkFields = rowString.split(" %% ");
         List<Object> values = new ArrayList<>(originPKClasses.size());
         if (logger.isDebugEnabled()) logger.debug("rowString={}, pkFields={}", rowString, pkFields);
-        for (int i=0; i<pkFields.length; i++) {
+        for (int i = 0; i < pkFields.length; i++) {
             PropertyEditor editor = PropertyEditorManager.findEditor(originPKClasses.get(i));
             editor.setAsText(pkFields[i]);
             values.add(editor.getValue());
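
For context, a minimal standalone sketch (hypothetical class name and sample values) of the `PropertyEditor` conversion `toEnhancedPK` performs per PK field. Primitive types such as `int.class` and `boolean.class` are guaranteed default editors by `java.beans`; the real code looks up editors for whatever classes `getPKClasses()` reports.

```java
import java.beans.PropertyEditor;
import java.beans.PropertyEditorManager;

public class PkFieldParsingDemo {
    public static void main(String[] args) {
        // Rows are split on " %% "; each field is converted from text
        // to the Java type of the corresponding PK column.
        String[] pkFields = "42 %% true".split(" %% ");

        PropertyEditor intEditor = PropertyEditorManager.findEditor(int.class);
        intEditor.setAsText(pkFields[0]);
        System.out.println(intEditor.getValue()); // 42, boxed as Integer

        PropertyEditor boolEditor = PropertyEditorManager.findEditor(boolean.class);
        boolEditor.setAsText(pkFields[1]);
        System.out.println(boolEditor.getValue()); // true, boxed as Boolean
    }
}
```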

src/main/java/com/datastax/cdm/job/DiffJobSession.java

Lines changed: 1 addition & 2 deletions
@@ -155,8 +155,7 @@ public void getDataAndDiff(BigInteger min, BigInteger max) {
             logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Attempt# {}",
                     Thread.currentThread().getId(), min, max, attempts);
             if (attempts == maxAttempts) {
-                logFailedPartitionsInFile(tokenRangeExceptionDir,
-                        propertyHelper.getString(KnownProperties.ORIGIN_KEYSPACE_TABLE), min, max);
+                logFailedPartitionsInFile(partitionFile, min, max);
             }
         }
     }

src/main/java/com/datastax/cdm/job/SplitPartitions.java

Lines changed: 26 additions & 14 deletions
@@ -1,12 +1,13 @@
 package com.datastax.cdm.job;
 
+import com.datastax.cdm.properties.KnownProperties;
+import com.datastax.cdm.properties.PropertyHelper;
+import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.*;
 import java.math.BigInteger;
-import java.nio.file.Files;
-import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -116,10 +117,30 @@ private static List<Partition> getSubPartitions(int numSplits, BigInteger min, B
         return partitions;
     }
 
+    private static BufferedReader getfileReader(String fileName) {
+        try {
+            return new BufferedReader(new FileReader(fileName));
+        } catch (FileNotFoundException fnfe) {
+            throw new RuntimeException("No '" + fileName + "' file found!! Add this file in the current folder & rerun!");
+        }
+    }
+
+    public static String getPartitionFile(PropertyHelper propertyHelper) {
+        String filePath = propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE);
+        if (StringUtils.isAllBlank(filePath)) {
+            filePath = "./" + propertyHelper.getString(KnownProperties.ORIGIN_KEYSPACE_TABLE) + "_partitions.csv";
+        }
+
+        return filePath;
+    }
+
     public static class PKRows implements Serializable {
         private static final long serialVersionUID = 1L;
+        private List<String> pkRows;
 
-        List<String> pkRows;
+        public List<String> getPkRows() {
+            return pkRows;
+        }
 
         public PKRows(List<String> rows) {
             pkRows = new ArrayList<>(rows);
@@ -129,8 +150,8 @@ public PKRows(List<String> rows) {
     public static class Partition implements Serializable {
         private static final long serialVersionUID = 1L;
 
-        private BigInteger min;
-        private BigInteger max;
+        private final BigInteger min;
+        private final BigInteger max;
 
         public Partition(BigInteger min, BigInteger max) {
             this.min = min;
@@ -149,13 +170,4 @@ public String toString() {
             return "Processing partition for token range " + min + " to " + max;
         }
     }
-
-    private static BufferedReader getfileReader(String fileName) {
-        try {
-            return new BufferedReader(new FileReader(fileName));
-        } catch (FileNotFoundException fnfe) {
-            throw new RuntimeException("No '" + fileName + "' file found!! Add this file in the current folder & rerun!");
-        }
-    }
-
 }
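
A minimal standalone mirror (hypothetical class name, plain JDK checks instead of commons-lang `StringUtils`) of the fallback logic in `getPartitionFile`: use the configured `spark.tokenrange.partitionFile` when present, else default to `./<keyspace>.<table>_partitions.csv` in the current folder.

```java
public class PartitionFileResolutionDemo {
    static String resolve(String configuredPath, String keyspaceTable) {
        // Blank or missing property -> default file in the current folder.
        if (configuredPath == null || configuredPath.trim().isEmpty()) {
            return "./" + keyspaceTable + "_partitions.csv";
        }
        return configuredPath;
    }

    public static void main(String[] args) {
        System.out.println(resolve(null, "test_ks.cat_promo"));              // ./test_ks.cat_promo_partitions.csv
        System.out.println(resolve("/tmp/ranges.csv", "test_ks.cat_promo")); // /tmp/ranges.csv
    }
}
```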

src/main/java/com/datastax/cdm/properties/KnownProperties.java

Lines changed: 2 additions & 5 deletions
@@ -26,7 +26,6 @@ public enum PropertyType {
     public static final String CONNECT_ORIGIN_USERNAME = "spark.cdm.connect.origin.username";
     public static final String CONNECT_ORIGIN_PASSWORD = "spark.cdm.connect.origin.password";
 
-
     public static final String CONNECT_TARGET_HOST = "spark.cdm.connect.target.host";
     public static final String CONNECT_TARGET_PORT = "spark.cdm.connect.target.port";
     public static final String CONNECT_TARGET_SCB = "spark.cdm.connect.target.scb";
@@ -53,7 +52,6 @@ public enum PropertyType {
         defaults.put(CONNECT_TARGET_USERNAME, "cassandra");
         types.put(CONNECT_TARGET_PASSWORD, PropertyType.STRING);
         defaults.put(CONNECT_TARGET_PASSWORD, "cassandra");
-
     }
 
     //==========================================================================
@@ -76,7 +74,6 @@ public enum PropertyType {
         types.put(ORIGIN_WRITETIME_NAMES, PropertyType.STRING_LIST);
         types.put(ORIGIN_WRITETIME_AUTO, PropertyType.BOOLEAN);
         defaults.put(ORIGIN_WRITETIME_AUTO, "true");
-
         types.put(ORIGIN_COLUMN_NAMES_TO_TARGET, PropertyType.STRING_LIST);
     }
 
@@ -139,9 +136,9 @@ public enum PropertyType {
     //==========================================================================
     // Error handling
     //==========================================================================
-    public static final String TOKEN_RANGE_EXCEPTION_DIR = "spark.tokenRange.exceptionDir";
+    public static final String TOKEN_RANGE_PARTITION_FILE = "spark.tokenrange.partitionFile";
     static {
-        types.put(TOKEN_RANGE_EXCEPTION_DIR, PropertyType.STRING);
+        types.put(TOKEN_RANGE_PARTITION_FILE, PropertyType.STRING);
     }
     //==========================================================================
     // Guardrails and Transformations
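
A minimal standalone sketch (hypothetical class name) of the declare-then-register pattern `KnownProperties` uses, applied to the renamed property:

```java
import java.util.HashMap;
import java.util.Map;

public class PropertyRegistryDemo {
    enum PropertyType { STRING }

    static final Map<String, PropertyType> types = new HashMap<>();

    // Declare the key, then register its type in a static block so
    // lookups can validate configured values against the expected type.
    public static final String TOKEN_RANGE_PARTITION_FILE = "spark.tokenrange.partitionFile";
    static {
        types.put(TOKEN_RANGE_PARTITION_FILE, PropertyType.STRING);
    }

    public static void main(String[] args) {
        System.out.println(types.get(TOKEN_RANGE_PARTITION_FILE)); // STRING
    }
}
```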
