
Commit 6a2a5c3

Merge pull request #105 from datastax/feature/field-size-check
Refactored CDM field-size guardrail check reporting
2 parents 2f6d160 + b1077b4 · commit 6a2a5c3

17 files changed (+256 −317 lines)

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ ENV MAVEN_HOME /usr/share/maven
 ENV MAVEN_CONFIG "$USER_HOME_DIR/.m2"
 COPY ./src /assets/src
 COPY ./pom.xml /assets/pom.xml
-COPY ./src/resources/sparkConf.properties /assets/
+COPY src/resources/cdm.properties /assets/
 COPY ./src/resources/partitions.csv /assets/
 COPY ./src/resources/primary_key_rows.csv /assets/
 COPY ./src/resources/runCommands.txt /assets/

README.md

Lines changed: 16 additions & 6 deletions
@@ -24,13 +24,13 @@ tar -xvzf spark-3.3.1-bin-hadoop3.tgz
 
 # Steps for Data-Migration:
 
-1. `sparkConf.properties` file needs to be configured as applicable for the environment
-> A sample Spark conf file configuration can be [found here](./src/resources/sparkConf.properties)
+1. The `cdm.properties` file needs to be configured as applicable for the environment
+> A sample properties file can be [found here](./src/resources/cdm.properties)
 2. Place the properties file where it can be accessed while running the job via spark-submit.
 3. Run the job using the `spark-submit` command as shown below:
 
 ```
-./spark-submit --properties-file sparkConf.properties \
+./spark-submit --properties-file cdm.properties \
 --master "local[*]" \
 --class datastax.astra.migrate.Migrate cassandra-data-migrator-3.x.x.jar &> logfile_name.txt
 ```
@@ -39,7 +39,7 @@ Note:
 - The above command generates a log file `logfile_name.txt` to avoid log output on the console.
 - Add the option `--driver-memory 25G --executor-memory 25G` as shown below if the table being migrated is large (over 100GB)
 ```
-./spark-submit --properties-file sparkConf.properties \
+./spark-submit --properties-file cdm.properties \
 --master "local[*]" --driver-memory 25G --executor-memory 25G \
 --class datastax.astra.migrate.Migrate cassandra-data-migrator-3.x.x.jar &> logfile_name.txt
 ```
@@ -49,7 +49,7 @@ Note:
 - To run the job in Data validation mode, use class option `--class datastax.astra.migrate.DiffData` as shown below
 
 ```
-./spark-submit --properties-file sparkConf.properties \
+./spark-submit --properties-file cdm.properties \
 --master "local[*]" \
 --class datastax.astra.migrate.DiffData cassandra-data-migrator-3.x.x.jar &> logfile_name.txt
 ```
@@ -79,7 +79,7 @@ Note:
 # Migrating specific partition ranges
 - You can also use the tool to migrate specific partition ranges using class option `--class datastax.astra.migrate.MigratePartitionsFromFile` as shown below
 ```
-./spark-submit --properties-file sparkConf.properties \
+./spark-submit --properties-file cdm.properties \
 --master "local[*]" \
 --class datastax.astra.migrate.MigratePartitionsFromFile cassandra-data-migrator-3.x.x.jar &> logfile_name.txt
 ```
@@ -93,11 +93,21 @@ When running in above mode the tool assumes a `partitions.csv` file to be presen
 ```
 This mode is specifically useful to process a subset of partition-ranges that may have failed during a previous run.
 
+# Perform large-field Guardrail violation checks
+- The tool can be used to identify large fields in a table that may break your cluster guardrails (e.g. AstraDB has a 10MB limit for a single large field) using class option `--class datastax.astra.migrate.Guardrail` as shown below
+```
+./spark-submit --properties-file cdmGuardrail.properties \
+--master "local[*]" \
+--class datastax.astra.migrate.Guardrail cassandra-data-migrator-3.x.x.jar &> logfile_name.txt
+```
+> A sample Guardrail properties file can be [found here](./src/resources/cdmGuardrail.properties)
+
 # Features
 - Supports migration/validation of [Counter tables](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_using/useCountersConcept.html)
 - Preserve [writetimes](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__retrieving-the-datetime-a-write-occurred-p) and [TTLs](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__ref-select-ttl-p)
 - Supports migration/validation of advanced DataTypes ([Sets](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__set), [Lists](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__list), [Maps](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__map), [UDTs](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__udt))
 - Filter records from `Origin` using `writetimes` and/or CQL conditions and/or min/max token-range
+- Perform guardrail checks (identify large fields)
 - Supports adding `constants` as new columns on `Target`
 - Fully containerized (Docker and K8s friendly)
 - SSL Support (including custom cipher algorithms)
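
For orientation, here is a minimal sketch of what a `cdm.properties` file might look like after this rename. It is assembled only from property keys that appear in the Java changes in this commit; the keyspace/table, column names, and type codes below are hypothetical placeholders, and the shipped sample at `./src/resources/cdm.properties` remains the authoritative template.

```properties
# Hypothetical sketch only -- keys come from the Util.getSparkProp* calls
# in this commit; every value below is a placeholder.
spark.origin.keyspaceTable      my_ks.my_table
spark.target.keyspaceTable      my_ks.my_table

# Columns to read from Origin, and the partition key among them
spark.query.origin              key_col,val_col
spark.query.origin.partitionKey key_col
# CDM-specific type codes for the columns above (placeholder values)
spark.query.types               9,0
# Primary-key column(s) on Target
spark.query.target.id           key_col

# Throughput knobs, shown with the defaults used in the code
spark.batchSize                 5
spark.readRateLimit             20000
spark.writeRateLimit            40000
spark.printStatsAfter           100000
```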

pom.xml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 
     <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-        <revision>3.2.3</revision>
+        <revision>3.3.1</revision>
        <scala.version>2.12.17</scala.version>
        <scala.main.version>2.12</scala.main.version>
        <spark.version>3.3.1</spark.version>

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 5 additions & 22 deletions
@@ -19,6 +19,8 @@
 public class AbstractJobSession extends BaseJobSession {
 
     public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
+    protected CqlSession sourceSession;
+    protected CqlSession astraSession;
 
     protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) {
         this(sourceSession, astraSession, sc, false);
@@ -34,18 +36,11 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
         this.sourceSession = sourceSession;
         this.astraSession = astraSession;
 
-        batchSize = new Integer(Util.getSparkPropOr(sc, "spark.batchSize", "5"));
-        fetchSizeInRows = new Integer(Util.getSparkPropOr(sc, "spark.read.fetch.sizeInRows", "1000"));
-        printStatsAfter = new Integer(Util.getSparkPropOr(sc, "spark.printStatsAfter", "100000"));
-        if (printStatsAfter < 1) {
-            printStatsAfter = 100000;
-        }
+        batchSize = Integer.parseInt(Util.getSparkPropOr(sc, "spark.batchSize", "5"));
+        fetchSizeInRows = Integer.parseInt(Util.getSparkPropOr(sc, "spark.read.fetch.sizeInRows", "1000"));
 
-        readLimiter = RateLimiter.create(new Integer(Util.getSparkPropOr(sc, "spark.readRateLimit", "20000")));
-        writeLimiter = RateLimiter.create(new Integer(Util.getSparkPropOr(sc, "spark.writeRateLimit", "40000")));
+        writeLimiter = RateLimiter.create(Integer.parseInt(Util.getSparkPropOr(sc, "spark.writeRateLimit", "40000")));
         maxRetries = Integer.parseInt(sc.get("spark.maxRetries", "0"));
-
-        sourceKeyspaceTable = Util.getSparkProp(sc, "spark.origin.keyspaceTable");
         astraKeyspaceTable = Util.getSparkProp(sc, "spark.target.keyspaceTable");
 
         String ttlColsStr = Util.getSparkPropOrEmpty(sc, "spark.query.ttl.cols");
@@ -105,24 +100,13 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
                     Instant.ofEpochMilli(maxWriteTimeStampFilter / 1000));
         }
 
-        String selectCols = Util.getSparkProp(sc, "spark.query.origin");
-        String partitionKey = Util.getSparkProp(sc, "spark.query.origin.partitionKey");
-        String sourceSelectCondition = Util.getSparkPropOrEmpty(sc, "spark.query.condition");
-        if (!sourceSelectCondition.isEmpty() && !sourceSelectCondition.trim().toUpperCase().startsWith("AND")) {
-            sourceSelectCondition = " AND " + sourceSelectCondition;
-        }
-
         final StringBuilder selectTTLWriteTimeCols = new StringBuilder();
-        allCols = selectCols.split(",");
         ttlCols.forEach(col -> {
             selectTTLWriteTimeCols.append(",ttl(" + allCols[col] + ")");
         });
         writeTimeStampCols.forEach(col -> {
             selectTTLWriteTimeCols.append(",writetime(" + allCols[col] + ")");
         });
-        selectColTypes = getTypes(Util.getSparkProp(sc, "spark.query.types"));
-        String idCols = Util.getSparkPropOrEmpty(sc, "spark.query.target.id");
-        idColTypes = selectColTypes.subList(0, idCols.split(",").length);
 
         String insertCols = Util.getSparkPropOrEmpty(sc, "spark.query.target");
         if (null == insertCols || insertCols.trim().isEmpty()) {
@@ -152,7 +136,6 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
                 "select " + insertCols + " from " + astraKeyspaceTable
                         + " where " + insertBinds);
 
-        hasRandomPartitioner = Boolean.parseBoolean(Util.getSparkPropOr(sc, "spark.origin.hasRandomPartitioner", "false"));
         isCounterTable = Boolean.parseBoolean(Util.getSparkPropOr(sc, "spark.counterTable", "false"));
         if (isCounterTable) {
             String updateSelectMappingStr = Util.getSparkPropOr(sc, "spark.counterTable.cql.index", "0");
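
A side note on the constructor cleanup above: every `new Integer(...)` call is swapped for `Integer.parseInt(...)`. A minimal standalone illustration of why (the class name here is hypothetical):

```java
public class ParseVsBox {
    public static void main(String[] args) {
        // new Integer("5") is a deprecated boxing constructor (since Java 9):
        // it always allocates a fresh wrapper object, only to be unboxed again.
        // Integer.parseInt parses straight to a primitive int.
        int batchSize = Integer.parseInt("5");

        // If a boxed value is genuinely needed, Integer.valueOf uses the
        // small-value cache instead of allocating every time.
        Integer boxed = Integer.valueOf("5");

        System.out.println(batchSize + " " + boxed);
    }
}
```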

src/main/java/datastax/astra/migrate/BaseJobSession.java

Lines changed: 38 additions & 11 deletions
@@ -1,16 +1,18 @@
 package datastax.astra.migrate;
 
 import com.datastax.oss.driver.api.core.ConsistencyLevel;
-import com.datastax.oss.driver.api.core.CqlSession;
 import com.datastax.oss.driver.api.core.cql.PreparedStatement;
 import com.datastax.oss.driver.api.core.cql.Row;
 import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter;
+import org.apache.commons.lang.SerializationUtils;
 import org.apache.spark.SparkConf;
 
+import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.atomic.AtomicLong;
 
 public abstract class BaseJobSession {
 
@@ -29,9 +31,8 @@ public abstract class BaseJobSession {
     protected RateLimiter readLimiter;
     protected RateLimiter writeLimiter;
     protected Integer maxRetries = 10;
+    protected AtomicLong readCounter = new AtomicLong(0);
 
-    protected CqlSession sourceSession;
-    protected CqlSession astraSession;
     protected List<MigrateDataType> selectColTypes = new ArrayList<MigrateDataType>();
     protected List<MigrateDataType> idColTypes = new ArrayList<MigrateDataType>();
     protected List<Integer> updateSelectMapping = new ArrayList<Integer>();
@@ -47,7 +48,7 @@ public abstract class BaseJobSession {
 
     protected List<Integer> writeTimeStampCols = new ArrayList<Integer>();
     protected List<Integer> ttlCols = new ArrayList<Integer>();
-    protected Boolean isCounterTable;
+    protected Boolean isCounterTable = false;
 
     protected String sourceKeyspaceTable;
     protected String astraKeyspaceTable;
@@ -59,13 +60,35 @@ public abstract class BaseJobSession {
     protected Integer filterColIndex;
     protected String filterColValue;
 
+    protected String selectCols;
+    protected String partitionKey;
+    protected String sourceSelectCondition;
     protected String[] allCols;
+    protected String idCols;
     protected String tsReplaceValStr;
     protected long tsReplaceVal;
 
     protected BaseJobSession(SparkConf sc) {
         readConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.read"));
         writeConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.write"));
+        readLimiter = RateLimiter.create(Integer.parseInt(Util.getSparkPropOr(sc, "spark.readRateLimit", "20000")));
+        sourceKeyspaceTable = sc.get("spark.origin.keyspaceTable");
+        hasRandomPartitioner = Boolean.parseBoolean(Util.getSparkPropOr(sc, "spark.origin.hasRandomPartitioner", "false"));
+
+        selectCols = Util.getSparkProp(sc, "spark.query.origin");
+        allCols = selectCols.split(",");
+        partitionKey = Util.getSparkProp(sc, "spark.query.origin.partitionKey");
+        sourceSelectCondition = Util.getSparkPropOrEmpty(sc, "spark.query.condition");
+        if (!sourceSelectCondition.isEmpty() && !sourceSelectCondition.trim().toUpperCase().startsWith("AND")) {
+            sourceSelectCondition = " AND " + sourceSelectCondition;
+        }
+        selectColTypes = getTypes(Util.getSparkProp(sc, "spark.query.types"));
+        idCols = Util.getSparkPropOrEmpty(sc, "spark.query.target.id");
+        idColTypes = selectColTypes.subList(0, idCols.split(",").length);
+        printStatsAfter = Integer.parseInt(Util.getSparkPropOr(sc, "spark.printStatsAfter", "100000"));
+        if (printStatsAfter < 1) {
+            printStatsAfter = 100000;
+        }
     }
 
     public String getKey(Row sourceRow) {
@@ -91,20 +114,24 @@ public List<MigrateDataType> getTypes(String types) {
         return dataTypes;
     }
 
-    public Object getData(MigrateDataType dataType, int index, Row sourceRow) {
+    public Object getData(MigrateDataType dataType, int index, Row row) {
         if (dataType.typeClass == Map.class) {
-            return sourceRow.getMap(index, dataType.subTypes.get(0), dataType.subTypes.get(1));
+            return row.getMap(index, dataType.subTypes.get(0), dataType.subTypes.get(1));
         } else if (dataType.typeClass == List.class) {
-            return sourceRow.getList(index, dataType.subTypes.get(0));
+            return row.getList(index, dataType.subTypes.get(0));
         } else if (dataType.typeClass == Set.class) {
-            return sourceRow.getSet(index, dataType.subTypes.get(0));
+            return row.getSet(index, dataType.subTypes.get(0));
         } else if (isCounterTable && dataType.typeClass == Long.class) {
-            Object data = sourceRow.get(index, dataType.typeClass);
+            Object data = row.get(index, dataType.typeClass);
             if (data == null) {
-                return new Long(0);
+                return Long.valueOf(0);
             }
         }
 
-        return sourceRow.get(index, dataType.typeClass);
+        return row.get(index, dataType.typeClass);
+    }
+
+    public int getFieldSize(MigrateDataType dataType, int index, Row row) {
+        return SerializationUtils.serialize((Serializable) getData(dataType, index, row)).length;
     }
 }
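
To make the new helper concrete, here is a minimal sketch of how `getFieldSize` could drive a guardrail check, assuming the 10MB AstraDB single-field limit cited in the README; `GuardrailSketch`, `exceedsGuardrail`, and the threshold constant are hypothetical names, not part of this commit. Note that `SerializationUtils.serialize` measures the Java-serialized size of the already-deserialized value, which approximates but does not equal the CQL on-wire size.

```java
import java.io.Serializable;

import org.apache.commons.lang.SerializationUtils;

// Hypothetical sketch built around the same measurement getFieldSize uses.
public class GuardrailSketch {

    // The 10 MB single-field limit mentioned in the README for AstraDB.
    private static final int GUARDRAIL_BYTES = 10 * 1024 * 1024;

    // Serialize the field value and compare the byte count to the limit,
    // mirroring BaseJobSession.getFieldSize(...).length.
    static boolean exceedsGuardrail(Serializable fieldValue) {
        if (fieldValue == null) {
            return false; // nothing to measure
        }
        return SerializationUtils.serialize(fieldValue).length > GUARDRAIL_BYTES;
    }

    public static void main(String[] args) {
        System.out.println(exceedsGuardrail("small value")); // false
    }
}
```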

src/main/java/datastax/astra/migrate/CopyJobSession.java

Lines changed: 0 additions & 3 deletions
@@ -16,7 +16,6 @@ public class CopyJobSession extends AbstractJobSession {
 
     private static CopyJobSession copyJobSession;
     public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
-    protected AtomicLong readCounter = new AtomicLong(0);
     protected AtomicLong skippedCounter = new AtomicLong(0);
     protected AtomicLong writeCounter = new AtomicLong(0);
     protected AtomicLong errorCounter = new AtomicLong(0);
@@ -50,7 +49,6 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
         long readCnt = 0;
         long writeCnt = 0;
         long skipCnt = 0;
-        long errCnt = 0;
         try {
             ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ?
                     min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact())
@@ -155,7 +153,6 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
                 CompletionStage<AsyncResultSet> writeResultSet = astraSession.executeAsync(batchStatement);
                 writeResults.add(writeResultSet);
                 writeCnt += iterateAndClearWriteResults(writeResults, batchStatement.size());
-                batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED);
             }
         }
 

src/main/java/datastax/astra/migrate/DiffJobSession.java

Lines changed: 6 additions & 7 deletions
@@ -22,16 +22,15 @@
 public class DiffJobSession extends CopyJobSession {
 
     private static DiffJobSession diffJobSession;
+    private final AtomicLong mismatchCounter = new AtomicLong(0);
+    private final AtomicLong missingCounter = new AtomicLong(0);
+    private final AtomicLong correctedMissingCounter = new AtomicLong(0);
+    private final AtomicLong correctedMismatchCounter = new AtomicLong(0);
+    private final AtomicLong validCounter = new AtomicLong(0);
+    private final AtomicLong skippedCounter = new AtomicLong(0);
     public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
     protected Boolean autoCorrectMissing = false;
     protected Boolean autoCorrectMismatch = false;
-    private AtomicLong readCounter = new AtomicLong(0);
-    private AtomicLong mismatchCounter = new AtomicLong(0);
-    private AtomicLong missingCounter = new AtomicLong(0);
-    private AtomicLong correctedMissingCounter = new AtomicLong(0);
-    private AtomicLong correctedMismatchCounter = new AtomicLong(0);
-    private AtomicLong validCounter = new AtomicLong(0);
-    private AtomicLong skippedCounter = new AtomicLong(0);
 
     private DiffJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) {
         super(sourceSession, astraSession, sc);
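
One more note on the counter consolidation: the duplicated `readCounter` is gone (it now lives once in `BaseJobSession`), and the remaining counters are `final AtomicLong`s. A minimal illustration of the pattern, with hypothetical names, since multiple Spark task threads in one executor JVM may increment the same counter concurrently:

```java
import java.util.concurrent.atomic.AtomicLong;

// Hypothetical sketch of the counter pattern used by DiffJobSession.
public class CounterSketch {

    // A plain long field would lose updates under concurrent increments;
    // AtomicLong.incrementAndGet is an atomic, lock-free read-modify-write.
    private final AtomicLong mismatchCounter = new AtomicLong(0);

    void recordMismatch() {
        long soFar = mismatchCounter.incrementAndGet();
        if (soFar % 100_000 == 0) { // echoes the printStatsAfter cadence
            System.out.println("Mismatched records so far: " + soFar);
        }
    }
}
```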
