Commit 1d00d88

Merge pull request #130 from datastax/docs_update
Docs update
2 parents 264e5ed + 1104ca9 commit 1d00d88

File tree

4 files changed: +31 −5 lines changed


.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+* @datastax/cdm-core

README.md

Lines changed: 8 additions & 0 deletions
@@ -97,6 +97,14 @@ When running in above mode the tool assumes a `partitions.csv` file to be present
 ```
 This mode is specifically useful to process a subset of partition-ranges that may have failed during a previous run.
 
+> **Note:**
+> Here is a quick tip to prepare `partitions.csv` from the log file:
+
+```
+grep "ERROR CopyJobSession: Error with PartitionRange" /path/to/logfile_name.txt | awk '{print $13","$15}' > partitions.csv
+```
+
+
 # Perform large-field Guardrail violation checks
 - The tool can be used to identify large fields from a table that may break your cluster guardrails (e.g. AstraDB has a 10MB limit for a single large field) `--class datastax.astra.migrate.Guardrail` as shown below
 ```
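The grep/awk one-liner above assumes the failed partition-range bounds land in the 13th and 15th whitespace-separated fields of each matching log line. A minimal Python sketch of the same extraction, run against a hypothetical log line (the real log format may differ):

```python
# Hypothetical log line -- constructed so fields 13 and 15 hold the range bounds,
# as the awk program above assumes. The actual CDM log layout may differ.
line = ("22/01/01 10:00:00 ERROR CopyJobSession: Error with PartitionRange "
        "-- Attempt# 1 -- min: -9223372036854775808 max: -4611686018427387904")

fields = line.split()                    # awk's default whitespace splitting
csv_row = f"{fields[12]},{fields[14]}"   # $13 and $15 (awk fields are 1-indexed)
print(csv_row)                           # one min,max row for partitions.csv
```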

src/resources/cdm.properties

Lines changed: 17 additions & 0 deletions
@@ -74,6 +74,23 @@ spark.batchSize 10
 #spark.counterTable.cql
 #spark.counterTable.cql.index 0
 
+############################### EXAMPLE MAPPING USING A DEMO counter column TABLE ###########################
+# CREATE TABLE cycling.cyclist_count (
+#   pk1 uuid,
+#   pk2 date,
+#   cc1 boolean,
+#   c1 counter,
+#   PRIMARY KEY((pk1,pk2),cc1)
+# );
+# then, our counter table mapping would look like below,
+# spark.counterTable true
+# spark.counterTable.cql UPDATE cycling.cyclist_count SET c1 += ? WHERE pk1 = ? AND pk2 = ? AND cc1 = ?
+# spark.counterTable.cql.index 3,0,1,2
+#
+# Remember the above counter index order is based on the below column mapping ordering,
+# spark.query.origin pk1,pk2,cc1,c1
+#############################################################################################################
+
 # ENABLE ONLY IF YOU WANT TO FILTER BASED ON WRITE-TIME (values must be in microseconds)
 #spark.origin.writeTimeStampFilter false
 #spark.origin.minWriteTimeStampFilter 0
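In the counter-table example above, `spark.counterTable.cql.index` maps each bind marker (`?`) in the UPDATE statement, in order, to a position in the `spark.query.origin` column list. A minimal Python sketch of that reordering (the row values are illustrative, not produced by the tool):

```python
origin_columns = ["pk1", "pk2", "cc1", "c1"]  # spark.query.origin order
bind_index = [3, 0, 1, 2]                     # spark.counterTable.cql.index

# A sample origin row keyed by column name (values are made up for illustration).
row = {"pk1": "uuid-1", "pk2": "2023-01-01", "cc1": True, "c1": 42}

# Reorder the row values into the UPDATE's bind order:
#   SET c1 += ?  WHERE pk1 = ? AND pk2 = ? AND cc1 = ?
bind_values = [row[origin_columns[i]] for i in bind_index]
print(bind_values)  # counter delta first, then the primary-key columns
```

Index 3 comes first because the counter column `c1` backs the first `?` (in the SET clause), while 0, 1, 2 supply the WHERE-clause key columns.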

src/resources/runCommands.txt

Lines changed: 5 additions & 5 deletions
@@ -4,16 +4,16 @@ curl -OL https://downloads.datastax.com/enterprise/cqlsh-astra.tar.gz
 wget https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
 
 // Migrate
-spark-submit --properties-file /<path>/cdm.properties --conf spark.origin.keyspaceTable="keyspace.table" --master "local[*]" --class datastax.astra.migrate.Migrate /<path>/cassandra-data-migrator-3.4.*.jar
-spark-submit --properties-file /<path>/cdm.properties --conf spark.origin.keyspaceTable="keyspace.table" --master "local[*]" --driver-memory 25G --executor-memory 25G --class datastax.astra.migrate.Migrate /<path>/cassandra-data-migrator-3.4.*.jar &> table_out.log
+spark-submit --properties-file /<path>/cdm.properties --conf spark.origin.keyspaceTable="keyspace.table" --master "local[*]" --class datastax.astra.migrate.Migrate /<path>/cassandra-data-migrator-3.4.*.jar &> log_name_$(date +%Y%m%d_%H_%M).log
+spark-submit --properties-file /<path>/cdm.properties --conf spark.origin.keyspaceTable="keyspace.table" --master "local[*]" --driver-memory 25G --executor-memory 25G --class datastax.astra.migrate.Migrate /<path>/cassandra-data-migrator-3.4.*.jar &> table_out_$(date +%Y%m%d_%H_%M).log
 
 // If target keyspace and/or table name is different than origin, then add --conf spark.target.keyspaceTable="keyspace2.table2"
 // Add option --verbose for verbose output
 
 // Random Partitioner Run Command
-spark-submit --properties-file /<path>/cdm.properties --conf spark.origin.keyspaceTable="keyspace.table" --master "local[*]" --conf spark.origin.minPartition=-1 --conf spark.origin.maxPartition=170141183460469231731687303715884105728 --class datastax.astra.migrate.Migrate /<path>/cassandra-data-migrator-3.4.*.jar
+spark-submit --properties-file /<path>/cdm.properties --conf spark.origin.keyspaceTable="keyspace.table" --master "local[*]" --conf spark.origin.minPartition=-1 --conf spark.origin.maxPartition=170141183460469231731687303715884105728 --class datastax.astra.migrate.Migrate /<path>/cassandra-data-migrator-3.4.*.jar &> log_name_$(date +%Y%m%d_%H_%M).log
 
 // Validate
-spark-submit --properties-file /<path>/cdm.properties --conf spark.origin.keyspaceTable="keyspace.table" --master "local[*]" --driver-memory 25G --executor-memory 25G --class datastax.astra.migrate.DiffData /<path>/cassandra-data-migrator-3.4.*.jar &> table_out.log
+spark-submit --properties-file /<path>/cdm.properties --conf spark.origin.keyspaceTable="keyspace.table" --master "local[*]" --driver-memory 25G --executor-memory 25G --class datastax.astra.migrate.DiffData /<path>/cassandra-data-migrator-3.4.*.jar &> table_out_$(date +%Y%m%d_%H_%M).log
 
 // Guardrail check (identify large fields)
-spark-submit --properties-file /<path>/cdmGuardrail.properties --conf spark.origin.keyspaceTable="keyspace.table" --master "local[*]" --driver-memory 25G --executor-memory 25G --class datastax.astra.migrate.Guardrail /<path>/cassandra-data-migrator-3.4.*.jar &> table_out.log
+spark-submit --properties-file /<path>/cdmGuardrail.properties --conf spark.origin.keyspaceTable="keyspace.table" --master "local[*]" --driver-memory 25G --executor-memory 25G --class datastax.astra.migrate.Guardrail /<path>/cassandra-data-migrator-3.4.*.jar &> table_out_$(date +%Y%m%d_%H_%M).log
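The changed commands above timestamp their log files with `$(date +%Y%m%d_%H_%M)`, and the Random Partitioner run sets `maxPartition` to 170141183460469231731687303715884105728, which is 2^127 (the top of that partitioner's token range). A small Python sketch of both details (the log-name pattern mirrors the shell commands; nothing here is tool-specific):

```python
from datetime import datetime

# Same suffix the shell's $(date +%Y%m%d_%H_%M) produces, e.g. 20230115_09_30,
# so each run writes to a distinct log file instead of overwriting table_out.log.
suffix = datetime.now().strftime("%Y%m%d_%H_%M")
log_name = f"table_out_{suffix}.log"
print(log_name)

# The maxPartition literal in the Random Partitioner command is exactly 2**127.
max_partition = 2 ** 127
print(max_partition)  # 170141183460469231731687303715884105728
```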
