
Commit 86f938d

Merge branch 'main' into feature/exception_handler_refactor

# Conflicts:
#	src/main/scala/com/datastax/cdm/job/BaseJob.scala

2 parents: 2313986 + ed7ba7c

117 files changed, +3259 −1054 lines
(Large commit: some content is hidden by default, so only a subset of the changed files appears below.)
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+# GitHub Action CI
+# Snyk clean-up when PR is merged/closed
+
+on:
+  pull_request:
+    types:
+      - closed
+    branches:
+      - main
+  workflow_dispatch:
+

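The hunk above adds only the workflow's triggers. Since types: [closed] fires both for merged and for closed-but-unmerged pull requests, a job can check the merged flag if it should only run after a merge. The sketch below is hypothetical (the job name, runner, and placeholder step are assumptions, not part of this commit); it only illustrates how such a trigger is commonly consumed:

jobs:
  snyk-cleanup:                       # hypothetical job name
    # run on manual dispatch, or when the closed PR was actually merged
    if: github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      # placeholder only: the actual Snyk clean-up command is not shown in this excerpt
      - run: echo "Snyk project clean-up would run here"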

.github/workflows/snyk-cli-scan.yml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# GitHub action CI
+# Triggered by:
+#   any push on any protected branch: main, v6.8, releases/**
+#   any PR created against any protected branch: main, v6.8, releases/**
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+
+env:
+  SNYK_SEVERITY_THRESHOLD_LEVEL: high

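Only the triggers and the environment block are shown above. As a rough, hypothetical sketch of how a job could consume the SNYK_SEVERITY_THRESHOLD_LEVEL variable (the job layout, the Snyk CLI install step, and the SNYK_TOKEN secret name are assumptions, not part of this commit; snyk test --severity-threshold is the standard Snyk CLI flag):

jobs:
  snyk-test:                          # hypothetical job name
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Install Snyk CLI
        run: npm install -g snyk
      - name: Fail on issues at or above the configured severity
        run: snyk test --severity-threshold="$SNYK_SEVERITY_THRESHOLD_LEVEL"
        env:
          SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }}   # assumed repository secret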

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -6,3 +6,4 @@ dependency-reduced-pom.xml
 .idea/*
 cassandra-data-migrator.iml
 SIT/local
+*.DS_Store

.snyk

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+# .snyk
+# Snyk (https://snyk.io) policy file, patches or ignores known vulnerabilities.
+# See https://docs.snyk.io/scan-cloud-deployment/snyk-infrastructure-as-code/snyk-cli-for-infrastructure-as-code/iac-ignores-using-the-.snyk-policy-file for details.
+version: v1.22.2
+python: '3.7'
+patch: {}
+# ignores vulnerabilities until expiry date; change duration by modifying expiry date
+ignore:
+

.snyk.ignore.example

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+# .snyk.ignore.example
+# Snyk (https://snyk.io) policy file, patches or ignores known vulnerabilities.
+version: v1.22.2
+python: '3.7'
+patch: {}
+# ignores vulnerabilities until expiry date; change duration by modifying expiry date
+ignore:
+  SNYK-PYTHON-URLLIB3-1533435:
+    - '*':
+        reason: state your ignore reason here
+        expires: 2030-01-01T00:00:00.000Z
+        created: 2022-03-21T13:19:22.196Z

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ ENV MAVEN_HOME /usr/share/maven
 ENV MAVEN_CONFIG "$USER_HOME_DIR/.m2"
 COPY ./src /assets/src
 COPY ./pom.xml /assets/pom.xml
-COPY ./src/resources/sparkConf.properties /assets/
+COPY ./src/resources/cdm.properties /assets/
 COPY ./src/resources/partitions.csv /assets/
 COPY ./src/resources/primary_key_rows.csv /assets/
 COPY ./src/resources/runCommands.txt /assets/

PERF/cdm-v3.properties

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
+# Origin cluster credentials (use "host + port" OR "secure-connect-bundle" but not both)
+spark.origin.host cass-origin
+spark.origin.port 9042
+#spark.origin.scb file:///aaa/bbb/secure-connect-enterprise.zip
+spark.origin.username cassandra
+spark.origin.password cassandra
+
+# Target cluster credentials (use "host + port" OR "secure-connect-bundle" but not both)
+spark.target.host cass-target
+#spark.target.port 9042
+#spark.target.scb file:///aaa/bbb/secure-connect-enterprise.zip
+spark.target.username cassandra
+spark.target.password cassandra
+
+# Add 'missing' rows (during 'Validation') in 'Target' from 'Origin'. N/A for 'Migration'
+spark.target.autocorrect.missing false
+# Update 'mismatched' rows (during 'Validation') in 'Target' to match 'Origin'. N/A for 'Migration'
+spark.target.autocorrect.mismatch false
+
+# Read & Write rate-limits(rows/second). Higher value will improve performance and put more load on cluster
+spark.readRateLimit 5000
+spark.writeRateLimit 5000
+
+# Used to split Cassandra token-range into slices and migrate random slices one at a time
+# 10K splits usually works for tables up to 100GB (uncompressed) with balanced token distribution
+# For larger tables, test on 1% volume (using param coveragePercent) and increase the number-of-splits as needed
+spark.numSplits 10000
+
+# Use a value of 1 (disable batching) when primary-key and partition-key are same
+# For tables with high avg count of rows/partition, use higher value to improve performance
+spark.batchSize 10
+
+# ENABLE ONLY IF YOU WANT SOME COLUMNS FROM ORIGIN TO MIGRATE (default auto-detects schema & migrates all columns)
+# COMMA SEPARATED LIST OF COLUMN NAMES (MUST INCLUDE ALL PRIMARY-KEY FIELDS)
+#spark.query.origin comma-separated-partition-key,comma-separated-clustering-key,comma-separated-other-columns
+
+# ENABLE ONLY IF COLUMN NAMES ON TARGET ARE DIFFERENT FROM ORIGIN (default assumes target schema to be same as origin)
+#spark.query.target comma-separated-partition-key,comma-separated-clustering-key,comma-separated-other-columns
+
+############################### EXAMPLE MAPPING USING A DEMO TABLE ##########################################
+# If the origin table schema is as below
+#   CREATE TABLE cycling.cyclist_name (
+#     pk1 uuid,
+#     pk2 date,
+#     cc1 boolean,
+#     firstname text,
+#     middlename text,   // You do not want to migrate this column
+#     lastname text,
+#     phones list<text>,
+#     PRIMARY KEY((pk1,pk2),cc1)
+#   );
+# then, our origin mapping would look like below
+#   spark.query.origin pk1,pk2,cc1,firstname,lastname,phones
+#
+# And target table schema is as below
+#   CREATE TABLE cycling.cyclist_name (
+#     pk1 uuid,
+#     pk2 date,
+#     cc1 boolean,
+#     fn text,           // Column has different name than origin
+#     ln text,           // Column has different name than origin
+#     phones list<text>,
+#     PRIMARY KEY((pk1,pk2),cc1)
+#   );
+# then, our target mapping would look like below
+#   spark.query.target pk1,pk2,cc1,fn,ln,phones
+#############################################################################################################
+
+# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE ROWS BASED ON A VALID CQL FILTER
+#spark.query.condition
+
+# ENABLE ONLY IF YOU WANT TO FILTER BASED ON WRITE-TIME (values must be in microseconds)
+#spark.origin.writeTimeStampFilter false
+#spark.origin.minWriteTimeStampFilter 0
+#spark.origin.maxWriteTimeStampFilter 4102444800000000
+
+# ENABLE ONLY IF retries needed (Retry a slice of token-range if an exception occurs)
+#spark.maxRetries 0
+
+# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE SOME % OF ROWS (NOT 100%)
+#spark.coveragePercent 100
+
+# ENABLE ONLY IF WANT LOG STATS MORE OR LESS FREQUENTLY THAN DEFAULT
+#spark.printStatsAfter 100000
+
+# ENABLE ONLY IF YOU WANT TO USE READ AND/OR WRITE CONSISTENCY OTHER THAN LOCAL_QUORUM
+#spark.consistency.read LOCAL_QUORUM
+#spark.consistency.write LOCAL_QUORUM
+
+# ENABLE ONLY IF YOU WANT TO REDUCE FETCH-SIZE TO AVOID FrameTooLongException
+#spark.read.fetch.sizeInRows 1000
+
+# ENABLE ONLY IF YOU WANT TO USE CUSTOM FIXED WRITETIME VALUE ON TARGET
+#spark.target.writeTime.fixedValue 0
+
+# ENABLE ONLY IF YOU WANT TO INCREMENT SOURCE WRITETIME VALUE
+# DUPLICATES IN LIST FIELDS: USE THIS WORKAROUND FOR CASSANDRA BUG https://issues.apache.org/jira/browse/CASSANDRA-11368
+#spark.target.writeTime.incrementBy 0
+
+# ONLY USE when running in Guardrail mode to identify large fields
+#spark.guardrail.colSizeInKB 1024
+
+# ENABLE ONLY TO filter data from Origin
+#spark.origin.FilterData false
+#spark.origin.FilterColumn test
+#spark.origin.FilterColumnIndex 2
+#spark.origin.FilterColumnType 6%16
+#spark.origin.FilterColumnValue test
+
+# ONLY USE if SSL is enabled on origin Cassandra/DSE (e.g. Azure Cosmos Cassandra DB)
+#spark.origin.ssl.enabled true
+
+# ONLY USE if SSL clientAuth is enabled on origin Cassandra/DSE
+#spark.origin.trustStore.path
+#spark.origin.trustStore.password
+#spark.origin.trustStore.type JKS
+#spark.origin.keyStore.path
+#spark.origin.keyStore.password
+#spark.origin.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA
+
+# ONLY USE if SSL is enabled on target Cassandra/DSE
+#spark.target.ssl.enabled true
+
+# ONLY USE if SSL clientAuth is enabled on target Cassandra/DSE
+#spark.target.trustStore.path
+#spark.target.trustStore.password
+#spark.target.trustStore.type JKS
+#spark.target.keyStore.path
+#spark.target.keyStore.password
+#spark.target.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA

src/resources/sparkConf.properties renamed to PERF/cdm-v4.properties

Lines changed: 55 additions & 20 deletions
@@ -31,13 +31,13 @@
 #
 # You must set either .host or .scb.
 #-----------------------------------------------------------------------------------------------------------
-spark.cdm.origin.connect.host localhost
+spark.cdm.origin.connect.host cass-origin
 spark.cdm.origin.connect.port 9042
 #spark.cdm.origin.connect.scb file:///aaa/bbb/secure-connect-enterprise.zip
 spark.cdm.origin.connect.username cassandra
 spark.cdm.origin.connect.password cassandra

-spark.cdm.target.connect.host localhost
+spark.cdm.target.connect.host cass-target
 spark.cdm.target.connect.port 9042
 #spark.cdm.target.connect.scb file:///aaa/bbb/secure-connect-enterprise.zip
 spark.cdm.target.connect.username cassandra
@@ -53,14 +53,29 @@ spark.cdm.target.connect.password cassandra
 # Recommended Parameters:
 # spark.cdm.schema.origin
 #   .column
-#     .ttl.names      : Default is empty. Names from .column.names to be combined using the MAX
-#                       function to determine the TTL of the entire migrated record. Will use target
-#                       table default when not set. The names cannot include any columns listed in
-#                       partition-key,clustering-key.
-#     .writetime.names: Default is empty. Names from .column.names to be combined using the MAX
-#                       function to determine the TIMESTAMP of the entire migrated record. Will use
-#                       target table default when not set. The names cannot include any columns
-#                       listed in the primary key e.g. partition-key,clustering-key
+#     .ttl
+#       .automatic    : Default is true, unless .ttl.names is specified. When true, the TTL of the
+#                       target record will be determined by finding the maximum TTL of
+#                       all origin columns that can have TTL set (which excludes partition key,
+#                       clustering key, collections/UDT/tuple, and frozen columns). When false, and
+#                       .names is not set, the target record will have the TTL determined by the target
+#                       table configuration.
+#       .names        : Default is empty, meaning they will be determined automatically if that is set
+#                       (see above). Specify a subset of eligible columns that are used to calculate
+#                       the TTL of the target record.
+#     .writetime
+#       .automatic    : Default is true, unless .writetime.names is specified. When true, the WRITETIME of
+#                       the target record will be determined by finding the maximum WRITETIME of
+#                       all origin columns that can have WRITETIME set (which excludes partition key,
+#                       clustering key, collections/UDT/tuple, and frozen columns). When false, and
+#                       .names is not set, the target record will have the WRITETIME determined by the target
+#                       table configuration.
+#
+#                       *** Note spark.cdm.transform.custom.writetime overrides this setting ***
+#
+#       .names        : Default is empty, meaning they will be determined automatically if that is set
+#                       (see above). Specify a subset of eligible columns that are used to calculate
+#                       the WRITETIME of the target record.
 #
 # Other Parameters:
 # spark.cdm.schema.origin
@@ -70,10 +85,12 @@ spark.cdm.target.connect.password cassandra
 #                       origin_column_name:target_column_name. The list is comma-separated. Only renamed
 #                       columns need to be listed.
 #-----------------------------------------------------------------------------------------------------------
-spark.cdm.schema.origin.keyspaceTable keyspace_name.table_name
-spark.cdm.schema.origin.column.ttl.names data_col1,data_col2,...
-spark.cdm.schema.origin.column.writetime.names data_col1,data_col2,...
-#spark.cdm.schema.origin.column.names.to.target partition_col1:partition_col_1,partition_col2:partition_col_2,...
+spark.cdm.schema.origin.keyspaceTable devices.sensor_data
+#spark.cdm.schema.origin.column.ttl.automatic true
+#spark.cdm.schema.origin.column.ttl.names data_col1,data_col2,...
+#spark.cdm.schema.origin.column.writetime.automatic true
+#spark.cdm.schema.origin.column.writetime.names data_col1,data_col2,...
+#spark.cdm.schema.origin.column.names.to.target partition_col1:partition_col_1,partition_col2:partition_col_2,...

 #===========================================================================================================
 # Details about the Target Schema
@@ -82,7 +99,7 @@ spark.cdm.schema.origin.column.writetime.names data_col1,data_col2,...
 # spark.cdm.schema.target
 #   .keyspaceTable  : <keyspace>.<table_name> of the table to be migrated. Table must exist in Target.
 #-----------------------------------------------------------------------------------------------------------
-spark.cdm.schema.target.keyspaceTable keyspace_name.table_name
+spark.cdm.schema.target.keyspaceTable devices.sensor_data

 #===========================================================================================================
 # Autocorrection parameters allow CDM to correct data differences found between Origin and Target when
@@ -126,7 +143,7 @@ spark.cdm.autocorrect.mismatch false
 # spark.cdm.perfops
 #   .numParts       : Defaults is 10000. In standard operation, the full token range (-2^63..2^63-1)
 #                     is divided into a number of parts which will be parallel-processed. You should
-#                     aim for each part to comprise a total of ≈1-10GB of data to migrate. During
+#                     aim for each part to comprise a total of ≈1-10GB of data to migrate. During
 #                     intial testing, you may want this to be a small number (even 1).
 #   .batchSize      : Defaults is 5. When writing to Target, this comprises the number of records that
 #                     will be put into an UNLOGGED batch. CDM will tend to work on the same partition
@@ -158,8 +175,8 @@ spark.cdm.autocorrect.mismatch false
 #-----------------------------------------------------------------------------------------------------------
 spark.cdm.perfops.numParts 10000
 spark.cdm.perfops.batchSize 5
-spark.cdm.perfops.readRateLimit 20000
-spark.cdm.perfops.writeRateLimit 40000
+spark.cdm.perfops.readRateLimit 5000
+spark.cdm.perfops.writeRateLimit 5000
 #spark.cdm.perfops.consistency.read LOCAL_QUORUM
 #spark.cdm.perfops.consistency.write LOCAL_QUORUM
 #spark.cdm.perfops.printStatsAfter 100000
@@ -177,12 +194,18 @@ spark.cdm.perfops.writeRateLimit 40000
 #                     MigrateData operation would fail. This parameter allows a crude
 #                     constant value to be used in its place, separate from the Constant
 #                     Values feature.
-#   .custom.writetime Default is 0 (diabled). Timestamp value in microseconds to use as the
+#   .custom
+#     .writetime      Default is 0 (disabled). Timestamp value in microseconds to use as the
 #                     WRITETIME for the target record. This is useful when the WRITETIME of
 #                     the record in Origin cannot be determined (such as the only non-key
 #                     columns are collections). This parameter allows a crude constant value
 #                     to be used in its place, and overrides
 #                     .schema.origin.column.writetime.indexes.
+#     .writetime.incrementBy Default is 0. This is useful when you have a List that is not frozen,
+#                     and are updating this via the autocorrect feature. Lists are not idempotent,
+#                     and subsequent UPSERTs would add duplicates to the list. Future versions
+#                     of CDM may tombstone the previous list, but for now this solution is
+#                     viable and, crucially, more performant.
 #   .codecs           Default is empty. A comma-separated list of additional codecs to
 #                     enable. Current codecs are:
 #                       INT_STRING    : int stored in a String
@@ -202,7 +225,8 @@ spark.cdm.perfops.writeRateLimit 40000
 #     .string.zone    Default is UTC ; Must be in ZoneRulesProvider.getAvailableZoneIds()
 #-----------------------------------------------------------------------------------------------------------
 #spark.cdm.transform.missing.key.ts.replace.value
-#spark.cdm.transform.custom.writetime 0
+#spark.cdm.transform.custom.writetime 0
+#spark.cdm.transform.custom.writetime.incrementBy 0
 #spark.cdm.transform.codecs
 #spark.cdm.transform.codecs.timestamp.string.format yyyyMMddHHmmss
 #spark.cdm.transform.codecs.timestamp.string.zone UTC
@@ -287,6 +311,16 @@ spark.cdm.perfops.writeRateLimit 40000
 #spark.cdm.feature.explodeMap.target.name.key my_map_key
 #spark.cdm.feature.explodeMap.target.name.value my_map_value

+#===========================================================================================================
+# Guardrail feature manages records that exceed guardrail checks. The Guardrail job will generate
+# a report; other jobs will skip records that exceed the guardrail.
+#
+# spark.cdm.feature.guardrail
+#   .colSizeInKB    Default 0, meaning the check is not done. Records with one or more fields that
+#                   exceed this size will be flagged. Note this is kB (base 10), not kiB (base 2).
+#
+#===========================================================================================================
+#spark.cdm.feature.guardrail.colSizeInKB 1000

 #===========================================================================================================
 # TLS (SSL) connection parameters, if so configured. Note that Secure Bundles embed these details.
@@ -320,3 +354,4 @@ spark.cdm.perfops.writeRateLimit 40000
 #spark.cdm.target.connect.tls.keyStore.password
 #spark.cdm.target.connect.tls.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA

+
0 commit comments
