Commit 264e5ed

Merge pull request #126 from datastax/feature/CDM-3-auto-discover-schema
Implemented auto-discovery of schema (partition-key, clustering-key, …
2 parents 0102e4e + 4051431 commit 264e5ed

16 files changed: +425, −305 lines

README.md

Lines changed: 9 additions & 0 deletions
@@ -31,6 +31,7 @@ tar -xvzf spark-3.3.1-bin-hadoop3.tgz
 
 ```
 ./spark-submit --properties-file cdm.properties /
+--conf spark.origin.keyspaceTable="<keyspace-name>.<table-name>" /
 --master "local[*]" /
 --class datastax.astra.migrate.Migrate cassandra-data-migrator-3.x.x.jar &> logfile_name.txt
 ```
@@ -40,6 +41,7 @@ Note:
 - Add option `--driver-memory 25G --executor-memory 25G` as shown below if the table migrated is large (over 100GB)
 ```
 ./spark-submit --properties-file cdm.properties /
+--conf spark.origin.keyspaceTable="<keyspace-name>.<table-name>" /
 --master "local[*]" --driver-memory 25G --executor-memory 25G /
 --class datastax.astra.migrate.Migrate cassandra-data-migrator-3.x.x.jar &> logfile_name.txt
 ```
@@ -50,6 +52,7 @@ Note:
 
 ```
 ./spark-submit --properties-file cdm.properties /
+--conf spark.origin.keyspaceTable="<keyspace-name>.<table-name>" /
 --master "local[*]" /
 --class datastax.astra.migrate.DiffData cassandra-data-migrator-3.x.x.jar &> logfile_name.txt
 ```
@@ -80,6 +83,7 @@ Note:
 - You can also use the tool to migrate specific partition ranges using class option `--class datastax.astra.migrate.MigratePartitionsFromFile` as shown below
 ```
 ./spark-submit --properties-file cdm.properties /
+--conf spark.origin.keyspaceTable="<keyspace-name>.<table-name>" /
 --master "local[*]" /
 --class datastax.astra.migrate.MigratePartitionsFromFile cassandra-data-migrator-3.x.x.jar &> logfile_name.txt
 ```
@@ -97,12 +101,14 @@ This mode is specifically useful to process a subset of partition-ranges that
 - The tool can be used to identify large fields from a table that may break your cluster guardrails (e.g. AstraDB has a 10MB limit for a single large field) `--class datastax.astra.migrate.Guardrail` as shown below
 ```
 ./spark-submit --properties-file cdmGuardrail.properties /
+--conf spark.origin.keyspaceTable="<keyspace-name>.<table-name>" /
 --master "local[*]" /
 --class datastax.astra.migrate.Guardrail cassandra-data-migrator-3.x.x.jar &> logfile_name.txt
 ```
 > A sample Guardrail properties file can be [found here](./src/resources/cdmGuardrail.properties)
 
 # Features
+- Auto-detects table schema (column names, types, id fields, collections, UDTs, etc.)
 - Supports migration/validation of [Counter tables](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_using/useCountersConcept.html)
 - Preserves [writetimes](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__retrieving-the-datetime-a-write-occurred-p) and [TTLs](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__ref-select-ttl-p)
 - Supports migration/validation of advanced DataTypes ([Sets](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__set), [Lists](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__list), [Maps](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__map), [UDTs](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__udt))
@@ -116,6 +122,9 @@ This mode is specifically useful to process a subset of partition-ranges that
 - Validate migration accuracy and performance using a smaller randomized data-set
 - Supports adding custom fixed `writetime`
 
+# Known Limitations
+- This tool does not migrate `ttl` & `writetime` at the field-level (for optimization reasons). It instead finds the field with the highest `ttl` & the field with the highest `writetime` within an `origin` row and uses those values on the entire `target` row.
+
 # Building Jar for local development
 1. Clone this repo
 2. Move to the repo folder `cd cassandra-data-migrator`
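To make the new Known Limitations entry concrete, here is a minimal sketch of the row-level rule it describes, assuming the origin SELECT appends `writetime(col)` and `ttl(col)` columns for the non-key fields; the helper class, method names, and column positions are hypothetical, not CDM's actual code.

```java
import com.datastax.oss.driver.api.core.cql.Row;
import java.util.List;

// Hypothetical helper illustrating the row-level writetime/TTL rule:
// take the max across the per-column values and apply it to the whole target row.
public class RowTimestamps {

    // writetimeCols holds the positions of the writetime(...) columns
    // appended to the origin SELECT; writetime is microseconds since epoch.
    static long maxWritetime(Row originRow, List<Integer> writetimeCols) {
        long max = 0L;
        for (int idx : writetimeCols) {
            max = Math.max(max, originRow.getLong(idx));
        }
        return max;
    }

    // Same idea for TTL: ttl(...) columns are INTs (seconds remaining).
    static int maxTtl(Row originRow, List<Integer> ttlCols) {
        int max = 0;
        for (int idx : ttlCols) {
            max = Math.max(max, originRow.getInt(idx));
        }
        return max;
    }
}
```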

pom.xml

Lines changed: 7 additions & 1 deletion
@@ -8,7 +8,7 @@
 
     <properties>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-        <revision>3.3.1</revision>
+        <revision>3.4.0</revision>
         <scala.version>2.12.17</scala.version>
         <scala.main.version>2.12</scala.main.version>
         <spark.version>3.3.1</spark.version>
@@ -89,6 +89,12 @@
             <artifactId>log4j-to-slf4j</artifactId>
             <version>2.19.0</version>
         </dependency>
+        <dependency>
+            <groupId>org.projectlombok</groupId>
+            <artifactId>lombok</artifactId>
+            <version>1.18.26</version>
+            <scope>provided</scope>
+        </dependency>
 
         <!-- Test Dependencies -->
         <dependency>
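The `provided` scope is appropriate here because Lombok only generates boilerplate (getters, `equals`, `toString`) at compile time and is not needed on the runtime classpath. A minimal sketch of the pattern the new schema classes appear to rely on, given the `getTypeInfo()`/`getTypeClass()` accessors seen in the diffs below; the class and field names are illustrative, not the PR's actual code.

```java
import lombok.Data;

// Illustrative Lombok-annotated POJO: @Data generates getName()/setName(),
// getCqlType()/setCqlType(), equals(), hashCode(), and toString().
@Data
public class ColumnInfoSketch {
    private String name;
    private String cqlType;
}
```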

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 76 additions & 74 deletions
Large diff not shown.

src/main/java/datastax/astra/migrate/BaseJobSession.java

Lines changed: 26 additions & 49 deletions
@@ -4,6 +4,8 @@
 import com.datastax.oss.driver.api.core.cql.PreparedStatement;
 import com.datastax.oss.driver.api.core.cql.Row;
 import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter;
+import datastax.astra.migrate.schema.TableInfo;
+import datastax.astra.migrate.schema.TypeInfo;
 import org.apache.commons.lang.SerializationUtils;
 import org.apache.spark.SparkConf;
 
@@ -33,8 +35,6 @@ public abstract class BaseJobSession {
     protected Integer maxRetries = 10;
     protected AtomicLong readCounter = new AtomicLong(0);
 
-    protected List<MigrateDataType> selectColTypes = new ArrayList<MigrateDataType>();
-    protected List<MigrateDataType> idColTypes = new ArrayList<MigrateDataType>();
     protected List<Integer> updateSelectMapping = new ArrayList<Integer>();
 
     protected Integer batchSize = 1;
@@ -46,8 +46,6 @@ public abstract class BaseJobSession {
     protected Long maxWriteTimeStampFilter = Long.MAX_VALUE;
     protected Long customWritetime = 0l;
 
-    protected List<Integer> writeTimeStampCols = new ArrayList<Integer>();
-    protected List<Integer> ttlCols = new ArrayList<Integer>();
     protected Boolean isCounterTable = false;
 
     protected String sourceKeyspaceTable;
@@ -59,79 +57,58 @@ public abstract class BaseJobSession {
     protected String filterColType;
     protected Integer filterColIndex;
     protected String filterColValue;
-
-    protected String selectCols;
-    protected String partitionKey;
     protected String sourceSelectCondition;
-    protected String[] allCols;
-    protected String idCols;
-    protected String tsReplaceValStr;
-    protected long tsReplaceVal;
 
     protected BaseJobSession(SparkConf sc) {
         readConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.read"));
         writeConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.write"));
         readLimiter = RateLimiter.create(Integer.parseInt(Util.getSparkPropOr(sc, "spark.readRateLimit", "20000")));
         sourceKeyspaceTable = sc.get("spark.origin.keyspaceTable");
         hasRandomPartitioner = Boolean.parseBoolean(Util.getSparkPropOr(sc, "spark.origin.hasRandomPartitioner", "false"));
-
-        selectCols = Util.getSparkProp(sc, "spark.query.origin");
-        allCols = selectCols.split(",");
-        partitionKey = Util.getSparkProp(sc, "spark.query.origin.partitionKey");
         sourceSelectCondition = Util.getSparkPropOrEmpty(sc, "spark.query.condition");
         if (!sourceSelectCondition.isEmpty() && !sourceSelectCondition.trim().toUpperCase().startsWith("AND")) {
             sourceSelectCondition = " AND " + sourceSelectCondition;
         }
-        selectColTypes = getTypes(Util.getSparkProp(sc, "spark.query.types"));
-        idCols = Util.getSparkPropOrEmpty(sc, "spark.query.target.id");
-        idColTypes = selectColTypes.subList(0, idCols.split(",").length);
+
        printStatsAfter = Integer.parseInt(Util.getSparkPropOr(sc, "spark.printStatsAfter", "100000"));
         if (printStatsAfter < 1) {
             printStatsAfter = 100000;
         }
     }
 
-    public String getKey(Row sourceRow) {
-        StringBuffer key = new StringBuffer();
-        for (int index = 0; index < idColTypes.size(); index++) {
-            MigrateDataType dataType = idColTypes.get(index);
-            if (index == 0) {
-                key.append(getData(dataType, index, sourceRow));
-            } else {
-                key.append(" %% " + getData(dataType, index, sourceRow));
+    public Object getData(TypeInfo typeInfo, int index, Row row) {
+        if (typeInfo.getTypeClass() == Map.class) {
+            return row.getMap(index, typeInfo.getSubTypes().get(0), typeInfo.getSubTypes().get(1));
+        } else if (typeInfo.getTypeClass() == List.class) {
+            return row.getList(index, typeInfo.getSubTypes().get(0));
+        } else if (typeInfo.getTypeClass() == Set.class) {
+            return row.getSet(index, typeInfo.getSubTypes().get(0));
+        } else if (isCounterTable && typeInfo.getTypeClass() == Long.class) {
+            Object data = row.get(index, typeInfo.getTypeClass());
+            if (data == null) {
+                return Long.valueOf(0);
             }
         }
 
-        return key.toString();
+        return row.get(index, typeInfo.getTypeClass());
     }
 
-    public List<MigrateDataType> getTypes(String types) {
-        List<MigrateDataType> dataTypes = new ArrayList<MigrateDataType>();
-        for (String type : types.split(",")) {
-            dataTypes.add(new MigrateDataType(type));
-        }
-
-        return dataTypes;
+    public int getFieldSize(TypeInfo typeInfo, int index, Row row) {
+        return SerializationUtils.serialize((Serializable) getData(typeInfo, index, row)).length;
     }
 
-    public Object getData(MigrateDataType dataType, int index, Row row) {
-        if (dataType.typeClass == Map.class) {
-            return row.getMap(index, dataType.subTypes.get(0), dataType.subTypes.get(1));
-        } else if (dataType.typeClass == List.class) {
-            return row.getList(index, dataType.subTypes.get(0));
-        } else if (dataType.typeClass == Set.class) {
-            return row.getSet(index, dataType.subTypes.get(0));
-        } else if (isCounterTable && dataType.typeClass == Long.class) {
-            Object data = row.get(index, dataType.typeClass);
-            if (data == null) {
-                return Long.valueOf(0);
+    public String getKey(Row sourceRow, TableInfo tableInfo) {
+        StringBuffer key = new StringBuffer();
+        for (int index = 0; index < tableInfo.getKeyColumns().size(); index++) {
+            TypeInfo typeInfo = tableInfo.getIdColumns().get(index).getTypeInfo();
+            if (index == 0) {
+                key.append(getData(typeInfo, index, sourceRow));
+            } else {
+                key.append(" %% " + getData(typeInfo, index, sourceRow));
             }
         }
 
-        return row.get(index, dataType.typeClass);
+        return key.toString();
     }
 
-    public int getFieldSize(MigrateDataType dataType, int index, Row row) {
-        return SerializationUtils.serialize((Serializable) getData(dataType, index, row)).length;
-    }
 }
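The removed `spark.query.*` parsing is what the new `TableInfo`/`TypeInfo` classes replace: instead of the user listing columns and types by hand, the schema is read from cluster metadata. A sketch of how such discovery can be done with the Java driver's metadata API; this illustrates the idea only and is not the PR's actual `TableInfo` implementation.

```java
import com.datastax.oss.driver.api.core.CqlSession;
import com.datastax.oss.driver.api.core.metadata.schema.ColumnMetadata;
import com.datastax.oss.driver.api.core.metadata.schema.TableMetadata;

// Illustrative schema discovery via driver metadata: partition key,
// clustering columns, and all columns with their CQL types.
public class SchemaDiscovery {
    public static void describe(CqlSession session, String keyspace, String table) {
        TableMetadata tm = session.getMetadata()
                .getKeyspace(keyspace).orElseThrow(IllegalArgumentException::new)
                .getTable(table).orElseThrow(IllegalArgumentException::new);

        // Partition key plus clustering columns form the "id" columns.
        tm.getPartitionKey().forEach(c ->
                System.out.println("partition-key: " + c.getName() + " " + c.getType()));
        tm.getClusteringColumns().keySet().forEach(c ->
                System.out.println("clustering-key: " + c.getName() + " " + c.getType()));

        // All columns, including collections and UDTs, with their types.
        for (ColumnMetadata c : tm.getColumns().values()) {
            System.out.println("column: " + c.getName() + " " + c.getType());
        }
    }
}
```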

src/main/java/datastax/astra/migrate/CopyJobSession.java

Lines changed: 5 additions & 4 deletions
@@ -2,6 +2,7 @@
 
 import com.datastax.oss.driver.api.core.CqlSession;
 import com.datastax.oss.driver.api.core.cql.*;
+import datastax.astra.migrate.schema.TypeInfo;
 import org.apache.spark.SparkConf;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -68,9 +69,9 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
                 }
 
                 if (filterData) {
-                    String col = (String) getData(new MigrateDataType(filterColType), filterColIndex, sourceRow);
+                    String col = (String) getData(new TypeInfo(filterColType), filterColIndex, sourceRow);
                     if (col.trim().equalsIgnoreCase(filterColValue)) {
-                        logger.warn("Skipping row and filtering out: {}", getKey(sourceRow));
+                        logger.warn("Skipping row and filtering out: {}", getKey(sourceRow, tableInfo));
                         skipCnt++;
                         continue;
                     }
@@ -117,9 +118,9 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
                 }
 
                 if (filterData) {
-                    String colValue = (String) getData(new MigrateDataType(filterColType), filterColIndex, sourceRow);
+                    String colValue = (String) getData(new TypeInfo(filterColType), filterColIndex, sourceRow);
                     if (colValue.trim().equalsIgnoreCase(filterColValue)) {
-                        logger.warn("Skipping row and filtering out: {}", getKey(sourceRow));
+                        logger.warn("Skipping row and filtering out: {}", getKey(sourceRow, tableInfo));
                         skipCnt++;
                         continue;
                     }
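The `filterData` branch above skips any row whose filter column matches a configured value, comparing after trimming and case-insensitively. A self-contained sketch of that compare-trim-skip pattern; the sample values are made up for illustration.

```java
// Standalone sketch of the filter-and-skip pattern in getDataAndInsert().
public class FilterDemo {
    public static void main(String[] args) {
        String filterColValue = "tombstoned"; // e.g. sourced from Spark config
        String[] colValues = {"active", "Tombstoned ", "archived"};

        int skipCnt = 0;
        for (String col : colValues) {
            if (col.trim().equalsIgnoreCase(filterColValue)) {
                skipCnt++; // mirrors "skipCnt++; continue;" in the job
                continue;
            }
            System.out.println("migrating row with value: " + col);
        }
        System.out.println("skipped: " + skipCnt);
    }
}
```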

src/main/java/datastax/astra/migrate/CopyPKJobSession.java

Lines changed: 3 additions & 2 deletions
@@ -4,6 +4,7 @@
 import com.datastax.oss.driver.api.core.cql.BoundStatement;
 import com.datastax.oss.driver.api.core.cql.ResultSet;
 import com.datastax.oss.driver.api.core.cql.Row;
+import datastax.astra.migrate.schema.ColumnInfo;
 import org.apache.spark.SparkConf;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -44,8 +45,8 @@ public void getRowAndInsert(List<SplitPartitions.PKRows> rowsList) {
             String[] pkFields = row.split(" %% ");
             int idx = 0;
             BoundStatement bspk = sourceSelectStatement.bind().setConsistencyLevel(readConsistencyLevel);
-            for (MigrateDataType tp : idColTypes) {
-                bspk = bspk.set(idx, convert(tp.typeClass, pkFields[idx]), tp.typeClass);
+            for (ColumnInfo ci : tableInfo.getIdColumns()) {
+                bspk = bspk.set(idx, convert(ci.getTypeInfo().getTypeClass(), pkFields[idx]), ci.getTypeInfo().getTypeClass());
                 idx++;
             }
             Row pkRow = sourceSession.execute(bspk).one();
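`getRowAndInsert` consumes the same `" %% "`-delimited key strings that `getKey` produces, splitting each line and binding the fields by position. A tiny round-trip sketch of that format; the key values are invented for illustration.

```java
// Round-trip sketch of the " %% "-delimited key format: getKey() joins
// id-column values with " %% ", and getRowAndInsert() splits them back.
public class KeyFormatDemo {
    public static void main(String[] args) {
        String key = "user-42 %% 2023-03-01"; // e.g. (partition key, clustering key)
        String[] pkFields = key.split(" %% ");

        // Each field is then converted to the column's Java type and bound
        // by position, mirroring the loop over tableInfo.getIdColumns().
        for (int idx = 0; idx < pkFields.length; idx++) {
            System.out.println("bind position " + idx + " -> " + pkFields[idx]);
        }
    }
}
```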

src/main/java/datastax/astra/migrate/DiffJobSession.java

Lines changed: 14 additions & 13 deletions
@@ -6,6 +6,7 @@
 import com.datastax.oss.driver.api.core.cql.ResultSet;
 import com.datastax.oss.driver.api.core.cql.Row;
 import com.datastax.oss.driver.api.core.data.UdtValue;
+import datastax.astra.migrate.schema.TypeInfo;
 import org.apache.spark.SparkConf;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -107,7 +108,7 @@ private void diffAndClear(Map<Row, CompletionStage<AsyncResultSet>> srcToTargetRowMap) {
                 Row targetRow = srcToTargetRowMap.get(srcRow).toCompletableFuture().get().one();
                 diff(srcRow, targetRow);
             } catch (Exception e) {
-                logger.error("Could not perform diff for Key: {}", getKey(srcRow), e);
+                logger.error("Could not perform diff for Key: {}", getKey(srcRow, tableInfo), e);
             }
         }
         srcToTargetRowMap.clear();
@@ -134,13 +135,13 @@ public synchronized void printCounts(boolean isFinal) {
     private void diff(Row sourceRow, Row astraRow) {
         if (astraRow == null) {
             missingCounter.incrementAndGet();
-            logger.error("Missing target row found for key: {}", getKey(sourceRow));
+            logger.error("Missing target row found for key: {}", getKey(sourceRow, tableInfo));
             //correct data
 
             if (autoCorrectMissing) {
                 astraSession.execute(bindInsert(astraInsertStatement, sourceRow, null));
                 correctedMissingCounter.incrementAndGet();
-                logger.error("Inserted missing row in target: {}", getKey(sourceRow));
+                logger.error("Inserted missing row in target: {}", getKey(sourceRow, tableInfo));
             }
 
             return;
@@ -149,7 +150,7 @@ private void diff(Row sourceRow, Row astraRow) {
         String diffData = isDifferent(sourceRow, astraRow);
         if (!diffData.isEmpty()) {
             mismatchCounter.incrementAndGet();
-            logger.error("Mismatch row found for key: {} Mismatch: {}", getKey(sourceRow), diffData);
+            logger.error("Mismatch row found for key: {} Mismatch: {}", getKey(sourceRow, tableInfo), diffData);
 
             if (autoCorrectMismatch) {
                 if (isCounterTable) {
@@ -158,7 +159,7 @@ private void diff(Row sourceRow, Row astraRow) {
                     astraSession.execute(bindInsert(astraInsertStatement, sourceRow, null));
                 }
                 correctedMismatchCounter.incrementAndGet();
-                logger.error("Updated mismatch row in target: {}", getKey(sourceRow));
+                logger.error("Updated mismatch row in target: {}", getKey(sourceRow, tableInfo));
             }
 
             return;
@@ -169,21 +170,21 @@
 
     private String isDifferent(Row sourceRow, Row astraRow) {
         StringBuffer diffData = new StringBuffer();
-        IntStream.range(0, selectColTypes.size()).parallel().forEach(index -> {
-            MigrateDataType dataTypeObj = selectColTypes.get(index);
-            Object source = getData(dataTypeObj, index, sourceRow);
-            if (index < idColTypes.size()) {
-                Optional<Object> optionalVal = handleBlankInPrimaryKey(index, source, dataTypeObj.typeClass, sourceRow, false);
+        IntStream.range(0, tableInfo.getAllColumns().size()).parallel().forEach(index -> {
+            TypeInfo typeInfo = tableInfo.getColumns().get(index).getTypeInfo();
+            Object source = getData(typeInfo, index, sourceRow);
+            if (index < tableInfo.getKeyColumns().size()) {
+                Optional<Object> optionalVal = handleBlankInPrimaryKey(index, source, typeInfo.getTypeClass(), sourceRow, false);
                 if (optionalVal.isPresent()) {
                     source = optionalVal.get();
                 }
             }
 
-            Object astra = getData(dataTypeObj, index, astraRow);
+            Object astra = getData(typeInfo, index, astraRow);
 
-            boolean isDiff = dataTypeObj.diff(source, astra);
+            boolean isDiff = typeInfo.diff(source, astra);
             if (isDiff) {
-                if (dataTypeObj.typeClass.equals(UdtValue.class)) {
+                if (typeInfo.getTypeClass().equals(UdtValue.class)) {
                     String sourceUdtContent = ((UdtValue) source).getFormattedContents();
                     String astraUdtContent = ((UdtValue) astra).getFormattedContents();
                     if (!sourceUdtContent.equals(astraUdtContent)) {
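`isDifferent` fans the column comparison out with a parallel `IntStream` and tallies results through thread-safe counters. A standalone sketch of that pattern, using plain lists in place of driver `Row`s and `Objects.equals` in place of `TypeInfo.diff`.

```java
import java.util.List;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.IntStream;

// Every column index is checked concurrently; mismatches are counted
// with an AtomicInteger so the parallel lambda stays thread-safe.
public class ParallelDiffDemo {
    public static void main(String[] args) {
        List<Object> source = List.of("a", 1, 9L);
        List<Object> target = List.of("a", 2, 9L);

        AtomicInteger mismatches = new AtomicInteger();
        IntStream.range(0, source.size()).parallel().forEach(index -> {
            if (!Objects.equals(source.get(index), target.get(index))) {
                mismatches.incrementAndGet();
                System.out.println("Mismatch at column index " + index);
            }
        });
        System.out.println("Total mismatches: " + mismatches.get());
    }
}
```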
