
Commit ed7ba7c

Merge pull request #162 from datastax/feature/CDM-69sit

fix issue in handling null values in target PK

2 parents 4528363 + 2234a6c

File tree

14 files changed: +181 -63 lines changed


Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ ENV MAVEN_HOME /usr/share/maven
 ENV MAVEN_CONFIG "$USER_HOME_DIR/.m2"
 COPY ./src /assets/src
 COPY ./pom.xml /assets/pom.xml
-COPY ./src/resources/sparkConf.properties /assets/
+COPY ./src/resources/cdm.properties /assets/
 COPY ./src/resources/partitions.csv /assets/
 COPY ./src/resources/primary_key_rows.csv /assets/
 COPY ./src/resources/runCommands.txt /assets/
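Reviewer note: with this change the image ships `cdm.properties` instead of `sparkConf.properties` under `/assets/`. A quick check against a pulled image, as a sketch: the image name comes from the project's DockerHub repo (per the README); the tag and entrypoint behavior are assumptions.

```
# List the bundled assets in the published image (tag omitted = latest;
# --entrypoint ls guards against any entrypoint the image may define)
docker run --rm --entrypoint ls datastax/cassandra-data-migrator /assets/
```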

README.md

Lines changed: 55 additions & 19 deletions
@@ -8,8 +8,8 @@ Migrate and Validate Tables between Origin and Target Cassandra Clusters.
 > :warning: Please note this job has been tested with spark version [3.3.1](https://archive.apache.org/dist/spark/spark-3.3.1/)
 
 ## Install as a Container
-- Get the latest image that includes all dependencies from [DockerHub](https://hub.docker.com/r/datastax/cassandra-data-migrator)
-- All migration tools (`cassandra-data-migrator` + `dsbulk` + `cqlsh`) would be available in the `/assets/` folder of the container
+- Get the latest image that includes all dependencies from [DockerHub](https://hub.docker.com/r/datastax/cassandra-data-migrator)
+- All migration tools (`cassandra-data-migrator` + `dsbulk` + `cqlsh`) would be available in the `/assets/` folder of the container
 
 ## Install as a JAR file
 - Download the latest jar file from the GitHub [packages area here](https://github.com/orgs/datastax/packages?repo_name=cassandra-data-migrator)
@@ -26,34 +26,37 @@ tar -xvzf spark-3.3.1-bin-hadoop3.tgz
 
 > :warning: Note that Version 4 of the tool is not backward-compatible with .properties files created in previous versions, and that package names have changed.
 
-1. `sparkConf.properties` file needs to be configured as applicable for the environment. Parameter descriptions and defaults are described in the file.
-> A sample Spark conf file configuration can be [found here](./src/resources/sparkConf.properties)
-2. Place the conf file where it can be accessed while running the job via spark-submit.
+1. `cdm.properties` file needs to be configured as applicable for the environment. Parameter descriptions and defaults are described in the file.
+> A sample properties file configuration can be [found here](./src/resources/cdm.properties)
+2. Place the properties file where it can be accessed while running the job via spark-submit.
 3. Run the job using the `spark-submit` command as shown below:
 
 ```
-./spark-submit --properties-file sparkConf.properties /
+./spark-submit --properties-file cdm.properties /
+--conf spark.cdm.schema.origin.keyspaceTable="<keyspace-name>.<table-name>" /
 --master "local[*]" /
---class datastax.cdm.job.Migrate cassandra-data-migrator-4.x.x.jar &> logfile_name.txt
+--class datastax.cdm.job.Migrate cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
 ```
 
-Note:
+Note:
 - Above command generates a log file `logfile_name.txt` to avoid log output on the console.
 - Add option `--driver-memory 25G --executor-memory 25G` as shown below if the table migrated is large (over 100GB)
 ```
-./spark-submit --properties-file sparkConf.properties /
+./spark-submit --properties-file cdm.properties /
+--conf spark.cdm.schema.origin.keyspaceTable="<keyspace-name>.<table-name>" /
 --master "local[*]" --driver-memory 25G --executor-memory 25G /
---class datastax.cdm.job.Migrate cassandra-data-migrator-4.x.x.jar &> logfile_name.txt
+--class datastax.cdm.job.Migrate cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
 ```
 
 # Steps for Data-Validation:
 
 - To run the job in Data validation mode, use class option `--class datastax.cdm.job.DiffData` as shown below
 
 ```
-./spark-submit --properties-file sparkConf.properties /
+./spark-submit --properties-file cdm.properties /
+--conf spark.cdm.schema.origin.keyspaceTable="<keyspace-name>.<table-name>" /
 --master "local[*]" /
---class datastax.cdm.job.DiffData cassandra-data-migrator-4.x.x.jar &> logfile_name.txt
+--class datastax.cdm.job.DiffData cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
 ```
 
 - Validation job will report differences as "ERRORS" in the log file as shown below
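Reviewer note: the commands above now timestamp the log file name. The shell substitution expands at launch time, for example:

```
# Illustrative expansion of the new log-file suffix
echo "logfile_name_$(date +%Y%m%d_%H_%M).txt"
# -> logfile_name_20230602_14_05.txt (output depends on the current time)
```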
@@ -66,10 +69,10 @@ Note:
 ```
 
 - Please grep for all `ERROR` from the output log files to get the list of missing and mismatched records.
-- Note that it lists differences by primary-key values.
+  - Note that it lists differences by primary-key values.
 - The Validation job can also be run in an AutoCorrect mode. This mode can
-- Add any missing records from origin to target
-- Update any mismatched records between origin and target (makes target same as origin).
+  - Add any missing records from origin to target
+  - Update any mismatched records between origin and target (makes target same as origin).
 - Enable/disable this feature using one or both of the below settings in the config file
 ```
 spark.cdm.autocorrect.missing false|true
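Reviewer note: the hunk cuts off after the first autocorrect setting; the companion `spark.cdm.autocorrect.mismatch` key appears in this commit's SIT properties fixture further down. A minimal sketch of enabling both in `cdm.properties`, assuming those key names:

```
# Minimal autocorrect configuration; key names taken from this commit's
# migrate.properties test fixture (see the SIT files below)
cat >> cdm.properties <<'EOF'
spark.cdm.autocorrect.missing  true
spark.cdm.autocorrect.mismatch true
EOF
```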
@@ -81,12 +84,13 @@ Note:
 # Migrating specific partition ranges
 - You can also use the tool to migrate specific partition ranges using class option `--class datastax.cdm.job.MigratePartitionsFromFile` as shown below
 ```
-./spark-submit --properties-file sparkConf.properties /
+./spark-submit --properties-file cdm.properties /
+--conf spark.cdm.schema.origin.keyspaceTable="<keyspace-name>.<table-name>" /
 --master "local[*]" /
---class datastax.cdm.job.MigratePartitionsFromFile cassandra-data-migrator-4.x.x.jar &> logfile_name.txt
+--class datastax.cdm.job.MigratePartitionsFromFile cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
 ```
 
-When running in above mode the tool assumes a `partitions.csv` file to be present in the current folder in the below format, where each line (`min,max`) represents a partition-range
+When running in the above mode, the tool assumes a `partitions.csv` file to be present in the current folder in the below format, where each line (`min,max`) represents a partition-range
 ```
 -507900353496146534,-107285462027022883
 -506781526266485690,1506166634797362039
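Reviewer note: a `partitions.csv` in this format can also be assembled by hand; a sketch using the two sample token ranges shown above:

```
# Create a partitions.csv with the two sample min,max token ranges above
cat > partitions.csv <<'EOF'
-507900353496146534,-107285462027022883
-506781526266485690,1506166634797362039
EOF
```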
@@ -95,11 +99,40 @@ When running in the above mode, the tool assumes a `partitions.csv` file to be presen
 ```
 This mode is specifically useful to process a subset of partition-ranges that may have failed during a previous run.
 
+> **Note:**
+> Here is a quick tip to prepare `partitions.csv` from the log file:
+
+```
+grep "ERROR CopyJobSession: Error with PartitionRange" /path/to/logfile_name.txt | awk '{print $13","$15}' > partitions.csv
+```
+# Data validation for specific partition ranges
+- You can also use the tool to validate data for specific partition ranges using class option `--class datastax.cdm.job.DiffPartitionsFromFile` as shown below
+```
+./spark-submit --properties-file cdm.properties /
+--conf spark.cdm.schema.origin.keyspaceTable="<keyspace-name>.<table-name>" /
+--master "local[*]" /
+--class datastax.cdm.job.DiffPartitionsFromFile cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
+```
+
+When running in the above mode, the tool assumes a `partitions.csv` file to be present in the current folder.
+
+# Perform large-field Guardrail violation checks
+- The tool can be used to identify large fields from a table that may break your cluster guardrails (e.g. AstraDB has a 10MB limit for a single large field) using class option `--class datastax.cdm.job.GuardrailCheck` as shown below
+```
+./spark-submit --properties-file cdmGuardrail.properties /
+--conf spark.cdm.schema.origin.keyspaceTable="<keyspace-name>.<table-name>" /
+--master "local[*]" /
+--class datastax.cdm.job.GuardrailCheck cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
+```
+> A sample Guardrail properties file can be [found here](./src/resources/cdmGuardrail.properties)
+
 # Features
-- Supports migration/validation of [Counter tables](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_using/useCountersConcept.html)
+- Auto-detects table schema (column names, types, keys, collections, UDTs, etc.)
+  - Including [Counter tables](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_using/useCountersConcept.html)
 - Preserve [writetimes](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__retrieving-the-datetime-a-write-occurred-p) and [TTLs](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/cql_commands/cqlSelect.html#cqlSelect__ref-select-ttl-p)
 - Supports migration/validation of advanced DataTypes ([Sets](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__set), [Lists](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__list), [Maps](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__map), [UDTs](https://docs.datastax.com/en/dse/6.8/cql/cql/cql_reference/refDataTypes.html#refDataTypes__udt))
 - Filter records from `Origin` using `writetimes` and/or CQL conditions and/or min/max token-range
+- Perform guardrail checks (identify large fields)
 - Supports adding `constants` as new columns on `Target`
 - Supports expanding `Map` columns on `Origin` into multiple records on `Target`
 - Fully containerized (Docker and K8s friendly)
@@ -109,6 +142,9 @@ This mode is specifically useful to process a subset of partition-ranges that
 - Validate migration accuracy and performance using a smaller randomized data-set
 - Supports adding custom fixed `writetime`
 
+# Known Limitations
+- This tool does not migrate `ttl` & `writetime` at the field-level (for optimization reasons). It instead finds the field with the highest `ttl` & the field with the highest `writetime` within an `origin` row and uses those values on the entire `target` row.
+
 # Building Jar for local development
 1. Clone this repo
 2. Move to the repo folder `cd cassandra-data-migrator`
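Reviewer note on the new Known Limitations entry: since the tool applies one row-level `writetime` (the highest across eligible columns), inspecting per-column writetimes on origin helps anticipate which rows will differ after migration. A sketch using this commit's regression table (table and column names come from the SIT fixtures below; `writetime()` applies only to non-primary-key columns):

```
# Compare per-column writetimes on origin; after migration the whole target
# row carries a single writetime (the max across origin's eligible columns)
cqlsh -e "SELECT key, writetime(value) FROM origin.regression_null_ts_in_pk;"
```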

RELEASE.md

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+# Release Notes
+
+## [4.0.0] - 2023-06-02
+This release is a major code refactor of Cassandra Data Migrator, focused on internal code structure and organization.
+Automated testing (both unit and integration) was introduced and incorporated into the build process. It includes all
+features of the previous version, but the properties specified within the configuration (.properties) file have been
+re-organized and renamed; therefore, the configuration file from the previous version will not work with this version.
+
+New features were also introduced with this release, on top of the 3.4.5 version.
+### Added
+- New features:
+  - `Column renaming`: Column names can differ between Origin and Target
+  - `Migrate UDTs across keyspaces`: UDTs can be migrated from Origin to Target, even when the keyspace names differ
+  - `Data Type Conversion`: Some predefined Codecs support type conversion between Origin and Target; custom Codecs can be added
+  - `Separate Writetime and TTL configuration`: Writetime columns can differ from TTL columns
+  - `Subset of columns can be specified with Writetime and TTL`: Not all eligible columns need to be used to compute the origin value
+  - `Automatic RandomPartitioner min/max`: Partition min/max values no longer need to be manually configured
+  - `Populate Target columns with constant values`: New columns can be added to the Target table, and populated with constant values
+  - `Explode Origin Map Column into Target rows`: A Map in Origin can be expanded into multiple rows in Target when the Map key is part of the Target primary key
+
+## [3.x.x]
+Previous releases of the project have not been documented in this file
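Reviewer note: the constant-columns feature above is property-driven. The sketch below is purely illustrative; the property keys are hypothetical placeholders, not confirmed by this commit, and the sample `cdm.properties` in `src/resources` is the authoritative reference:

```
# HYPOTHETICAL property names - for illustration only; consult the sample
# cdm.properties shipped with this release for the real keys
cat >> cdm.properties <<'EOF'
spark.cdm.feature.constantColumns.names   source_cluster
spark.cdm.feature.constantColumns.values  'legacy-dc1'
EOF
```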
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+#!/bin/bash -e
+
+cat <<EOF
+!!!!!!!!
+!!!!!!!! Testing Migrate
+!!!!!!!!
+EOF
+
+/local/cdm.sh -c
+spark-submit \
+  --properties-file /smoke/01_basic_kvp/migrate.properties \
+  --master "local[*]" \
+  --class datastax.astra.migrate.Migrate /local/cassandra-data-migrator.jar
+
+cat <<EOF
+!!!!!!!!
+!!!!!!!! Testing DiffData
+!!!!!!!!
+EOF
+
+spark-submit \
+  --properties-file /smoke/01_basic_kvp/migrate.properties \
+  --master "local[*]" \
+  --class datastax.astra.migrate.DiffData /local/cassandra-data-migrator.jar
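Reviewer note: this smoke script submits `datastax.astra.migrate.Migrate`, while the scenario file in the next section references `com.datastax.cdm.job.Migrate`. A quick way to see which class names the built jar actually contains, as a sketch (jar path taken from the script itself):

```
# List Migrate/DiffData classes present in the jar to verify the package name
unzip -l /local/cassandra-data-migrator.jar | grep -E '(Migrate|DiffData)\.class'
```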
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+migrateData com.datastax.cdm.job.Migrate migrate.properties
+validateData com.datastax.cdm.job.DiffData migrate.properties
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+#!/bin/bash -e
+
+workingDir="$1"
+cd "$workingDir"
+
+for scenario in $(cat cdm.txt | awk '{print $1}'); do
+  /local/cdm.sh -f cdm.txt -s $scenario -d "$workingDir"
+done
+
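Reviewer note: the loop above takes scenario names from the first column of `cdm.txt` (the file in the previous section). To preview what will run:

```
# Print the scenario names the runner will iterate over
awk '{print $1}' cdm.txt
# migrateData
# validateData
```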
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+SELECT * FROM target.regression_null_ts_in_pk;
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+
+ key  | ts                              | value
+------+---------------------------------+--------
+ key1 | 2023-06-01 00:00:00.000000+0000 | valueA
+ key2 | 2023-06-02 12:00:00.000000+0000 | valueB
+
+(2 rows)
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+spark.cdm.origin.connect.host                    cdm-sit-cass
+spark.cdm.target.connect.host                    cdm-sit-cass
+
+spark.cdm.schema.origin.keyspaceTable            origin.regression_null_ts_in_pk
+spark.cdm.schema.target.keyspaceTable            target.regression_null_ts_in_pk
+spark.cdm.perfops.numParts                       1
+
+spark.cdm.autocorrect.missing                    true
+spark.cdm.autocorrect.mismatch                   true
+
+spark.cdm.transform.missing.key.ts.replace.value 1685577600000
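Reviewer note: `spark.cdm.transform.missing.key.ts.replace.value` is an epoch-milliseconds value, and 1685577600000 corresponds exactly to the `ts` shown for `key1` in the expected-results file above. To verify (GNU `date` syntax assumed):

```
# Convert the replacement value (ms -> s) back to a human-readable UTC time
date -u -d @1685577600 '+%Y-%m-%d %H:%M:%S'
# -> 2023-06-01 00:00:00
```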
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+DROP TABLE IF EXISTS origin.regression_null_ts_in_pk;
+CREATE TABLE origin.regression_null_ts_in_pk(key text, ts timestamp, value text, PRIMARY KEY (key));
+INSERT INTO origin.regression_null_ts_in_pk(key,value) VALUES ('key1','valueA');
+INSERT INTO origin.regression_null_ts_in_pk(key,ts,value) VALUES ('key2','2023-06-02 12:00:00','valueB');
+
+DROP TABLE IF EXISTS target.regression_null_ts_in_pk;
+CREATE TABLE target.regression_null_ts_in_pk(key text, ts timestamp, value text, PRIMARY KEY (key, ts));
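Reviewer note: this fixture captures the bug being fixed. `key1` is inserted into origin with a null `ts`, but `ts` is a clustering key in the target table, and Cassandra does not allow null primary-key components, hence the replace-value transform in the properties file above. To see the offending row:

```
# key1 has a null ts in origin; without the missing-key replacement the
# migrated row could not form a valid (key, ts) primary key in target
cqlsh -e "SELECT key, ts, value FROM origin.regression_null_ts_in_pk WHERE key='key1';"
```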
