Commit 85ca927

Track each run and allow stop, resume, rerun functions from a previous unsuccessful job (#269)
* Implemented initial version of TrackRun feature
* Added track-run feature for DiffData as well
* Minor refactor
* Implemented track-run feature with stop & resume function based on a previous run-id
* Minor refactor
* Implemented operations on cdm-run-info table
* Met test coverage requirements
* Capturing run-type, validation diff status, updated some defaults & added docs
* Readme changes
1 parent 4bca5cf commit 85ca927

33 files changed: +1254 additions, -796 deletions

README.md

Lines changed: 13 additions & 35 deletions
@@ -83,50 +83,22 @@ spark.cdm.autocorrect.mismatch false|true
 Note:
 - The validation job will never delete records from target i.e. it only adds or updates data on target
 
-# Migrating or Validating specific partition ranges
-- You can also use the tool to Migrate or Validate specific partition ranges by using a partition-file with the name `./<keyspacename>.<tablename>_partitions.csv` in the below format, placed in the current folder, as input
-```
--507900353496146534,-107285462027022883
--506781526266485690,1506166634797362039
-2637884402540451982,4638499294009575633
-798869613692279889,8699484505161403540
-```
-Each line above represents a partition-range (`min,max`). Alternatively, you can also pass the partition-file via a command-line param as shown below
-
-```
-./spark-submit --properties-file cdm.properties \
---conf spark.cdm.schema.origin.keyspaceTable="<keyspacename>.<tablename>" \
---conf spark.cdm.tokenrange.partitionFile.input="/<path-to-file>/<csv-input-filename>" \
---master "local[*]" --driver-memory 25G --executor-memory 25G \
---class com.datastax.cdm.job.<Migrate|DiffData> cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
-```
-This mode is specifically useful to process a subset of partition-ranges that may have failed during a previous run.
+# Rerun (previously incomplete) Migration or Validation
+- You can rerun a Migration or Validation job to complete a previous run that stopped for any reason. This mode skips any token-ranges from the previous run that were already migrated or validated successfully. Enable it by passing the `spark.cdm.trackRun.previousRunId` param as shown below
 
-A file named `./<keyspacename>.<tablename>_partitions.csv` is auto-generated by the Migration & Validation jobs in the above format containing any failed partition ranges. No file is created if there are no failed partitions. This file can be used as an input to process any failed partitions in a subsequent run. You can also specify a different output file using the `spark.cdm.tokenrange.partitionFile.output` option.
 ```
 ./spark-submit --properties-file cdm.properties \
 --conf spark.cdm.schema.origin.keyspaceTable="<keyspacename>.<tablename>" \
---conf spark.cdm.tokenrange.partitionFile.input="/<path-to-file>/<csv-input-filename>" \
---conf spark.cdm.tokenrange.partitionFile.output="/<path-to-file>/<csv-output-filename>" \
+--conf spark.cdm.trackRun.previousRunId=<prev_run_id> \
 --master "local[*]" --driver-memory 25G --executor-memory 25G \
 --class com.datastax.cdm.job.<Migrate|DiffData> cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
 ```
-
-For the Data-Validation step, use the conf option `--conf spark.cdm.tokenrange.partitionFile.appendOnDiff` as shown below. This allows the partition range to be output whenever there are differences, not just failures.
-```
-./spark-submit --properties-file cdm.properties \
---conf spark.cdm.schema.origin.keyspaceTable="<keyspacename>.<tablename>" \
---conf spark.cdm.tokenrange.partitionFile.input="/<path-to-file>/<csv-input-filename>" \
---conf spark.cdm.tokenrange.partitionFile.output="/<path-to-file>/<csv-output-filename>" \
---conf spark.cdm.tokenrange.partitionFile.appendOnDiff=true \
---master "local[*]" --driver-memory 25G --executor-memory 25G \
---class com.datastax.cdm.job.<Migrate|DiffData> cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
-```
-
-If `spark.cdm.tokenrange.partitionFile.input` or `spark.cdm.tokenrange.partitionFile.output` are not specified, the system will use `./<keyspacename>.<tablename>_partitions.csv` as the default file.
+Note:
+- This feature replaces and improves upon an older similar feature (using the param `spark.cdm.tokenrange.partitionFile`) that is now deprecated and will be removed soon.
 
 # Perform large-field Guardrail violation checks
 - The tool can be used to identify large fields from a table that may break your cluster guardrails (e.g. AstraDB has a 10MB limit for a single large field) `--class com.datastax.cdm.job.GuardrailCheck` as shown below
+
 ```
 ./spark-submit --properties-file cdm.properties \
 --conf spark.cdm.schema.origin.keyspaceTable="<keyspacename>.<tablename>" \
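The rerun mode above needs a `<prev_run_id>` value. One way to find it is to list earlier runs recorded in the `cdm_run_info` table on the target keyspace. The following is a minimal sketch, not part of this commit: it assumes driver configuration comes from `application.conf`, uses placeholder keyspace/table names, and relies only on the `cdm_run_info` schema created by `TargetUpsertRunDetailsStatement` later in this diff.

```java
import com.datastax.oss.driver.api.core.CqlSession;
import com.datastax.oss.driver.api.core.cql.Row;

public class ListCdmRuns {
    public static void main(String[] args) {
        try (CqlSession session = CqlSession.builder().build()) {
            // cdm_run_info is partitioned by table_name and clustered by run_id,
            // so this lists every tracked run for the given table.
            for (Row row : session.execute(
                    "SELECT run_id, run_type, start_time, end_time, run_info "
                            + "FROM my_keyspace.cdm_run_info WHERE table_name = 'my_table'")) {
                System.out.printf("run_id=%d type=%s start=%s end=%s info=%s%n",
                        row.getLong("run_id"), row.getString("run_type"),
                        row.getInstant("start_time"), row.getInstant("end_time"),
                        row.getString("run_info"));
            }
        }
    }
}
```

Since run ids are generated from `System.currentTimeMillis()` (see the new statement class below), the largest `run_id` is the most recent run.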
@@ -138,6 +110,8 @@ If `spark.cdm.tokenrange.partitionFile.input` or `spark.cdm.tokenrange.partition
 # Features
 - Auto-detects table schema (column names, types, keys, collections, UDTs, etc.)
 - Including [counter tables](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_using/useCountersConcept.html)
+- Rerun a job from where the previous job stopped for any reason (killed, had exceptions, etc.)
+- If you rerun a `validation` job, it will include any token-ranges that had differences in the previous run
 - Preserve [writetimes](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__retrieving-the-datetime-a-write-occurred-p) and [TTLs](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__ref-select-ttl-p)
 - Supports migration/validation of advanced DataTypes ([Sets](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__set), [Lists](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__list), [Maps](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__map), [UDTs](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__udt))
 - Filter records from `Origin` using `writetimes` and/or CQL conditions and/or a list of token-ranges
@@ -150,14 +124,18 @@ If `spark.cdm.tokenrange.partitionFile.input` or `spark.cdm.tokenrange.partition
 - Supports migration/validation from and to [Azure Cosmos Cassandra](https://learn.microsoft.com/en-us/azure/cosmos-db/cassandra)
 - Validate migration accuracy and performance using a smaller randomized data-set
 - Supports adding custom fixed `writetime`
+- Track run information (start-time, end-time, status, etc.) in tables (`cdm_run_info` and `cdm_run_details`) on the target keyspace
 - Validation - Log partition-range level exceptions, use the exceptions file as input for rerun
 
 # Things to know
+- Each run (Migration or Validation) can be tracked (when enabled). You can find a summary and details of each run in the `cdm_run_info` and `cdm_run_details` tables in the target keyspace.
 - CDM does not migrate `ttl` & `writetime` at the field-level (for optimization reasons). It instead finds the field with the highest `ttl` & the field with the highest `writetime` within an `origin` row and uses those values on the entire `target` row.
 - CDM ignores `ttl` & `writetime` on collection and UDT fields while computing the highest value
 - If a table has only collection and/or UDT non-key columns and no table-level `ttl` configuration, the target will have no `ttl`, which can lead to inconsistencies between `origin` and `target` as rows expire on `origin` due to `ttl` expiry.
 - If a table has only collection and/or UDT non-key columns, the `writetime` used on target will be the time the job was run. Alternatively, if needed, the param `spark.cdm.transform.custom.writetime` can be used to set a static custom value for `writetime`.
-- When CDM migration (or validation with autocorrect) is run multiple times on the same table (for whatever reason), it could lead to duplicate entries in `list` type columns. Note this is [due to a Cassandra/DSE bug](https://issues.apache.org/jira/browse/CASSANDRA-11368) and not a CDM issue. This issue can be addressed by enabling and setting a positive value for the `spark.cdm.transform.custom.writetime.incrementBy` param. This param was specifically added to address this issue.
+- When CDM migration (or validation with autocorrect) is run multiple times on the same table (for whatever reason), it could lead to duplicate entries in `list` type columns. Note this is [due to a Cassandra/DSE bug](https://issues.apache.org/jira/browse/CASSANDRA-11368) and not a CDM issue. This issue can be addressed by enabling and setting a positive value for the `spark.cdm.transform.custom.writetime.incrementBy` param. This param was specifically added to address this issue.
+- When you rerun a job to resume from a previous run, the run metrics (read, write, skipped, etc.) captured in the `cdm_run_info` table will cover only the current run. If the previous run was killed for some reason, its run metrics may not have been saved. If the previous run completed (was not killed), even if with errors, you will have all run metrics from the previous run as well.
+
 
 # Building Jar for local development
 1. Clone this repo
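To see which token-ranges a rerun would actually pick up, you can query `cdm_run_details` directly. This is a hedged sketch (placeholder keyspace, table, and run id) that reuses the exact status predicate the new `TargetUpsertRunDetailsStatement` class issues below:

```java
import com.datastax.oss.driver.api.core.CqlSession;
import com.datastax.oss.driver.api.core.cql.Row;

public class ListPendingRanges {
    public static void main(String[] args) {
        try (CqlSession session = CqlSession.builder().build()) {
            // Ranges whose status is anything other than a clean pass are the
            // ones a rerun with spark.cdm.trackRun.previousRunId will reprocess.
            for (Row row : session.execute(
                    "SELECT token_min, token_max, status FROM my_keyspace.cdm_run_details "
                            + "WHERE table_name = 'my_table' AND run_id = 1721325000000 "
                            + "AND status IN ('NOT_STARTED', 'STARTED', 'FAIL', 'DIFF') ALLOW FILTERING")) {
                System.out.printf("[%d, %d] %s%n",
                        row.getLong("token_min"), row.getLong("token_max"),
                        row.getString("status"));
            }
        }
    }
}
```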

RELEASE.md

Lines changed: 5 additions & 0 deletions
@@ -1,4 +1,9 @@
 # Release Notes
+## [4.3.0] - 2024-07-18
+- Added the `spark.cdm.trackRun` feature to support stop and resume functionality for Migration and Validation jobs
+- Validation jobs run with the `auto-correct` feature disabled can now be rerun with the `auto-correct` feature enabled in a more optimal way that corrects only the token-ranges with validation errors during the rerun
+- Records a summary and details of each run in tables (`cdm_run_info` and `cdm_run_details`) on the `target` keyspace
+
 ## [4.2.0] - 2024-07-09
 - Upgraded `constant-column` feature to support `replace` and `remove` of constant columns
 - Fixed `constant-column` feature to support any data-types within the PK columns
- Fixed `constant-column` feature to support any data-types within the PK columns

src/main/java/com/datastax/cdm/cql/EnhancedSession.java

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ private CqlSession initSession(PropertyHelper propertyHelper, CqlSession session
 GenericType<?> javaType = codec.getJavaType();
 if (logDebug) logger.debug("Registering Codec {} for CQL type {} and Java type {}", codec.getClass().getSimpleName(), dataType, javaType);
 try {
-    TypeCodec<?> existingCodec = registry.codecFor(dataType, javaType);
+    registry.codecFor(dataType, javaType);
 } catch (CodecNotFoundException e) {
     registry.register(codec);
 }
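The change above only drops an unused local variable; the register-if-absent pattern around it is unchanged: probe the registry, and register the codec only when the lookup throws. A standalone sketch of that pattern (class and method names are illustrative; the driver types are the java-driver 4.x API already used in this file):

```java
import com.datastax.oss.driver.api.core.type.DataType;
import com.datastax.oss.driver.api.core.type.codec.CodecNotFoundException;
import com.datastax.oss.driver.api.core.type.codec.TypeCodec;
import com.datastax.oss.driver.api.core.type.codec.registry.MutableCodecRegistry;
import com.datastax.oss.driver.api.core.type.reflect.GenericType;

public class CodecGuard {
    /** Registers the codec only if the registry cannot already resolve the mapping. */
    static void registerIfAbsent(MutableCodecRegistry registry, TypeCodec<?> codec) {
        DataType dataType = codec.getCqlType();
        GenericType<?> javaType = codec.getJavaType();
        try {
            registry.codecFor(dataType, javaType); // throws if no codec covers this mapping
        } catch (CodecNotFoundException e) {
            registry.register(codec);
        }
    }
}
```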

src/main/java/com/datastax/cdm/cql/statement/OriginSelectByPKStatement.java

Lines changed: 0 additions & 4 deletions
@@ -23,12 +23,8 @@
 import com.datastax.oss.driver.api.core.cql.BoundStatement;
 import com.datastax.oss.driver.api.core.cql.ResultSet;
 import com.datastax.oss.driver.api.core.cql.Row;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 public class OriginSelectByPKStatement extends OriginSelectStatement {
-    public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
-
     public OriginSelectByPKStatement(IPropertyHelper propertyHelper, EnhancedSession session) {
         super(propertyHelper, session);
     }

src/main/java/com/datastax/cdm/cql/statement/OriginSelectByPartitionRangeStatement.java

Lines changed: 0 additions & 4 deletions
@@ -23,14 +23,10 @@
 import com.datastax.cdm.properties.PropertyHelper;
 import com.datastax.oss.driver.api.core.cql.BoundStatement;
 import com.datastax.oss.driver.api.core.cql.PreparedStatement;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import java.math.BigInteger;
 
 public class OriginSelectByPartitionRangeStatement extends OriginSelectStatement {
-    public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
-
     public OriginSelectByPartitionRangeStatement(IPropertyHelper propertyHelper, EnhancedSession session) {
         super(propertyHelper, session);
     }

src/main/java/com/datastax/cdm/cql/statement/TargetSelectByPKStatement.java

Lines changed: 0 additions & 4 deletions
@@ -26,14 +26,10 @@
 import com.datastax.oss.driver.api.core.cql.BoundStatement;
 import com.datastax.oss.driver.api.core.cql.ResultSet;
 import com.datastax.oss.driver.api.core.cql.Row;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import java.util.concurrent.CompletionStage;
 
 public class TargetSelectByPKStatement extends BaseCdmStatement {
-    public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
-
     public TargetSelectByPKStatement(IPropertyHelper propertyHelper, EnhancedSession session) {
         super(propertyHelper, session);
         this.statement = buildStatement();
src/main/java/com/datastax/cdm/cql/statement/TargetUpsertRunDetailsStatement.java

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@ (new file)
/*
 * Copyright DataStax, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datastax.cdm.cql.statement;

import java.math.BigInteger;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;

import com.datastax.cdm.feature.TrackRun;
import com.datastax.cdm.feature.TrackRun.RUN_TYPE;
import com.datastax.cdm.job.SplitPartitions;
import com.datastax.cdm.job.SplitPartitions.Partition;
import com.datastax.oss.driver.api.core.CqlSession;
import com.datastax.oss.driver.api.core.cql.BoundStatement;
import com.datastax.oss.driver.api.core.cql.ResultSet;

public class TargetUpsertRunDetailsStatement {
    private CqlSession session;
    private String keyspaceName;
    private String tableName;
    private long runId;
    private long prevRunId;
    private BoundStatement boundInitInfoStatement;
    private BoundStatement boundInitStatement;
    private BoundStatement boundUpdateInfoStatement;
    private BoundStatement boundUpdateStatement;
    private BoundStatement boundUpdateStartStatement;
    private BoundStatement boundSelectStatement;

    public TargetUpsertRunDetailsStatement(CqlSession session, String keyspaceTable) {
        this.session = session;
        String[] ksTab = keyspaceTable.split("\\.");
        this.keyspaceName = ksTab[0];
        this.tableName = ksTab[1];
        String cdmKsTabInfo = this.keyspaceName + ".cdm_run_info";
        String cdmKsTabDetails = this.keyspaceName + ".cdm_run_details";

        this.session.execute("create table if not exists " + cdmKsTabInfo
                + " (table_name text, run_id bigint, run_type text, prev_run_id bigint, start_time timestamp, end_time timestamp, run_info text, primary key (table_name, run_id))");
        this.session.execute("create table if not exists " + cdmKsTabDetails
                + " (table_name text, run_id bigint, start_time timestamp, token_min bigint, token_max bigint, status text, primary key ((table_name, run_id), token_min))");

        boundInitInfoStatement = bindStatement("INSERT INTO " + cdmKsTabInfo
                + " (table_name, run_id, run_type, prev_run_id, start_time) VALUES (?, ?, ?, ?, dateof(now()))");
        boundInitStatement = bindStatement("INSERT INTO " + cdmKsTabDetails
                + " (table_name, run_id, token_min, token_max, status) VALUES (?, ?, ?, ?, ?)");
        boundUpdateInfoStatement = bindStatement("UPDATE " + cdmKsTabInfo
                + " SET end_time = dateof(now()), run_info = ? WHERE table_name = ? AND run_id = ?");
        boundUpdateStatement = bindStatement(
                "UPDATE " + cdmKsTabDetails + " SET status = ? WHERE table_name = ? AND run_id = ? AND token_min = ?");
        boundUpdateStartStatement = bindStatement("UPDATE " + cdmKsTabDetails
                + " SET start_time = dateof(now()), status = ? WHERE table_name = ? AND run_id = ? AND token_min = ?");
        boundSelectStatement = bindStatement("SELECT token_min, token_max FROM " + cdmKsTabDetails
                + " WHERE table_name = ? AND run_id = ? and status in ('NOT_STARTED', 'STARTED', 'FAIL', 'DIFF') ALLOW FILTERING");
    }

    public Collection<SplitPartitions.Partition> getPendingPartitions(long prevRunId) {
        this.prevRunId = prevRunId;
        if (prevRunId == 0) {
            return new ArrayList<SplitPartitions.Partition>();
        }

        final Collection<SplitPartitions.Partition> pendingParts = new ArrayList<SplitPartitions.Partition>();
        ResultSet rs = session
                .execute(boundSelectStatement.setString("table_name", tableName).setLong("run_id", prevRunId));
        rs.forEach(row -> {
            Partition part = new Partition(BigInteger.valueOf(row.getLong("token_min")),
                    BigInteger.valueOf(row.getLong("token_max")));
            pendingParts.add(part);
        });

        return pendingParts;
    }

    public long initCdmRun(Collection<SplitPartitions.Partition> parts, RUN_TYPE runType) {
        runId = System.currentTimeMillis();
        session.execute(boundInitInfoStatement.setString("table_name", tableName).setLong("run_id", runId)
                .setString("run_type", runType.toString()).setLong("prev_run_id", prevRunId));
        parts.forEach(part -> initCdmRun(part));
        return runId;
    }

    private void initCdmRun(Partition partition) {
        session.execute(boundInitStatement.setString("table_name", tableName).setLong("run_id", runId)
                .setLong("token_min", partition.getMin().longValue())
                .setLong("token_max", partition.getMax().longValue())
                .setString("status", TrackRun.RUN_STATUS.NOT_STARTED.toString()));
    }

    public void updateCdmRunInfo(String runInfo) {
        session.execute(boundUpdateInfoStatement.setString("table_name", tableName).setLong("run_id", runId)
                .setString("run_info", runInfo));
    }

    public void updateCdmRun(BigInteger min, TrackRun.RUN_STATUS status) {
        if (TrackRun.RUN_STATUS.STARTED.equals(status)) {
            session.execute(boundUpdateStartStatement.setString("table_name", tableName).setLong("run_id", runId)
                    .setLong("token_min", min.longValue()).setString("status", status.toString()));
        } else {
            session.execute(boundUpdateStatement.setString("table_name", tableName).setLong("run_id", runId)
                    .setLong("token_min", min.longValue()).setString("status", status.toString()));
        }
    }

    private BoundStatement bindStatement(String stmt) {
        if (null == session)
            throw new RuntimeException("Session is not set");
        return session.prepare(stmt).bind().setTimeout(Duration.ofSeconds(10));
    }
}
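For orientation, here is a hypothetical sketch of how a job could drive this class end-to-end, using only the methods defined above. The actual wiring lives in the `com.datastax.cdm.job` classes changed elsewhere in this commit; `RUN_TYPE.MIGRATE` and `RUN_STATUS.PASS` are assumed enum constants (only `NOT_STARTED`, `STARTED`, `FAIL`, and `DIFF` appear in this file), and the keyspace/table names are placeholders.

```java
import java.util.Collection;

import com.datastax.cdm.cql.statement.TargetUpsertRunDetailsStatement;
import com.datastax.cdm.feature.TrackRun;
import com.datastax.cdm.job.SplitPartitions;
import com.datastax.oss.driver.api.core.CqlSession;

public class TrackRunSketch {
    static void runWithTracking(CqlSession session,
            Collection<SplitPartitions.Partition> plannedParts, long prevRunId) {
        TargetUpsertRunDetailsStatement trackRun =
                new TargetUpsertRunDetailsStatement(session, "my_keyspace.my_table"); // placeholders

        // On a rerun, process only the ranges the previous run left pending or failed.
        Collection<SplitPartitions.Partition> parts = trackRun.getPendingPartitions(prevRunId);
        if (parts.isEmpty()) {
            parts = plannedParts; // fresh run: prevRunId == 0 yields an empty collection
        }

        long runId = trackRun.initCdmRun(parts, TrackRun.RUN_TYPE.MIGRATE); // assumed constant
        for (SplitPartitions.Partition part : parts) {
            trackRun.updateCdmRun(part.getMin(), TrackRun.RUN_STATUS.STARTED);
            // ... migrate or validate rows in [part.getMin(), part.getMax()] ...
            trackRun.updateCdmRun(part.getMin(), TrackRun.RUN_STATUS.PASS); // assumed constant
        }
        trackRun.updateCdmRunInfo("Processed " + parts.size() + " token-ranges in run " + runId);
    }
}
```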

src/main/java/com/datastax/cdm/data/PKFactory.java

Lines changed: 0 additions & 1 deletion
@@ -19,7 +19,6 @@
 import com.datastax.cdm.schema.CqlTable;
 import com.datastax.oss.driver.api.core.cql.BoundStatement;
 import com.datastax.oss.driver.api.core.cql.Row;
-import com.datastax.cdm.properties.KnownProperties;
 import com.datastax.cdm.properties.PropertyHelper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

src/main/java/com/datastax/cdm/data/Record.java

Lines changed: 0 additions & 1 deletion
@@ -32,7 +32,6 @@ public enum Diff {
     private Row originRow;
     private Row targetRow;
     private CompletionStage<AsyncResultSet> targetFutureRow;
-    private Diff diff = Diff.UNKNOWN;
 
     public Record(EnhancedPK pk, Row originRow, Row targetRow, CompletionStage<AsyncResultSet> targetFutureRow) {
         if (null == pk || (null == originRow && null == targetRow && null == targetFutureRow)) {
