Validation (diff) helper script - automate diff of a user-set percent of data volume (typically after a data migration).

pravinbhat · pravinbhat · commit dfc18b6d04d9 · 2022-06-29T11:18:54.000-04:00
diff --git a/src/resources/diff_data.sh b/src/resources/diff_data.sh
@@ -0,0 +1,61 @@
+#! /bin/bash
+
+########################################################################################################################
+#
+# This script can be used to find differences in a certain percent of data (typically after a data migration) between
+# two Cassandra Clusters (including Astra). This script divides the Cassandra token range into around 200 units and
+# diffs a small slice at the start of each unit, so that the slices add up to the user-defined percent.
+#   e.g. If you decide to perform a 2% diff on a table with 1 billion rows, this script will perform 200 smaller diffs
+#   of 100K slice each. Total diff volume will be 100K * 200 i.e. 20 million i.e. 2% of 1 billion.
+#
+# Before running the script, update the below params
+#        SPARK_SUBMIT - Path to the spark-submit command
+#        PROPS_FILE - Path to the spark configuration for the table
+#        VALIDATE_PERCENT - Int value between 1 and 20 - Percent of data to be validated
+#
+# Run this script using nohup in background using a logfile and tail the logfile to monitor progress
+# e.g.  nohup ./diff_data.sh > logs/spark/diff_data.out &
+#
+# To summarise the results of the diff run, you could use the below command
+# egrep "Running DiffData for Partition Range|Job Final Read Record Count|Job Final Read Valid Count" logs/spark/diff_data.out
+#
+# To validate results, run below command. Result should be 0 or close to 0, if not, find the cause of diff
+# grep "ERROR DiffJobSession" logs/spark/diff_data.out | wc -l
+#
+########################################################################################################################
+
+# Path to spark-submit
+SPARK_SUBMIT=/home/ubuntu/spark-2.4.8-bin-hadoop2.6/bin/spark-submit
+
+# Path to spark configuration for the table
+PROPS_FILE=/home/ubuntu/sparkConf.properties
+
+# Set the percent (between 1 and 20) of data to be validated - value over 20 is not advised
+VALIDATE_PERCENT=1
+
+# ** DO NOT CHANGE ANYTHING BELOW THIS **
+
+# Full partition token range (min-long to max-long); the diff starts at the minimum token
+MIN_TOKEN=-9223372036854775808
+MAX_TOKEN=9223372036854775807
+S_IDX=$MIN_TOKEN
+
+# Slice unit - this will create around 200 units within Cassandra token range (between min-long and max-long)
+SLICE_UNIT=$(( $MAX_TOKEN / 100 ))
+
+# Slice to be validated/diff at the start of each slice unit, i.e. VALIDATE_PERCENT percent of one unit
+SLICE_DIFF=$(( ( $SLICE_UNIT / 100 ) * $VALIDATE_PERCENT ))
+
+echo "Starting DiffData using $PROPS_FILE !!"
+
+# Validate a percent of partition-token-ranges in progressive slices (CEIL keeps the S_IDX step below within max-long)
+CEIL=$(( $MAX_TOKEN - $SLICE_UNIT ))
+while [ "$S_IDX" -lt "$CEIL" ]
+do
+  E_IDX=$(( $S_IDX + $SLICE_DIFF ))
+  echo "Running DiffData for Partition Range $S_IDX to $E_IDX .."
+  "$SPARK_SUBMIT" --properties-file "$PROPS_FILE" --master "local[*]" --conf spark.migrate.source.minPartition="$S_IDX" --conf spark.migrate.source.maxPartition="$E_IDX" --class datastax.astra.migrate.DiffData migrate-*.jar
+  S_IDX=$(( $S_IDX + $SLICE_UNIT + 1 ))
+done
+
+echo "Completed DiffData using $PROPS_FILE !!"
diff --git a/src/resources/migrate_data.sh b/src/resources/migrate_data.sh
@@ -2,9 +2,14 @@
 
 ###########################################################################################################################
 #
-# This script can be used to Migrate data between Cassandra Clusters (including Astra) in chunks. It migrates data by
-# partition token ranges sequentially in progressive slices. It also helps to restart migration from a point where the
-# previous run might have stopped/failed for whatever reasons.
+# This script can be used to Migrate data between two Cassandra Clusters (including Astra) in chunks. It migrates data
+# sequentially in progressive token-range slices. It also helps to restart migration from a point where the previous
+# run might have stopped/failed for whatever reasons.
+#
+# Before running the script, update the below params
+#        SPARK_SUBMIT - Path to the spark-submit command
+#        PROPS_FILE - Path to the spark configuration for the table
+#        S_IDX - Change this value only if you want to set a custom starting point (e.g. after a previous incomplete run)
 #
 # *** IMP Note: Run this script using nohup in background using a logfile and tail the logfile to monitor progress ***
 # e.g.  nohup ./migrate_data.sh > logs/spark/migrate_data.out &
@@ -32,24 +37,24 @@ echo "Starting Migration using $PROPS_FILE !!"
 # Migrate initial partition tokens from min-long to -9000000000000000000
 if [ $S_IDX -lt -9000000000000000000 ]
 then
-        E_IDX=-9000000000000000001
-        echo "Running Migrate for Partition Range $S_IDX to $E_IDX .."
-        $SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.migrate.source.minPartition=$S_IDX --conf spark.migrate.source.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate migrate-*.jar
-        S_IDX=-9000000000000000000
+  E_IDX=-9000000000000000001
+  echo "Running Migrate for Partition Range $S_IDX to $E_IDX .."
+  $SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.migrate.source.minPartition=$S_IDX --conf spark.migrate.source.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate migrate-*.jar
+  S_IDX=-9000000000000000000
 fi
 
 # Migrate partition tokens from -9000000000000000000 to 8999999999999999999 in slices of 1000000000000000000
 while [ $S_IDX -lt 9000000000000000000 ]
 do
-        if [ $S_IDX -gt 8223372036854775807 ]
-        then
-                E_IDX=8999999999999999999
-        else
-                E_IDX=$(( $S_IDX + $SLICE ))
-        fi
-        echo "Running Migrate for Partition Range $S_IDX to $E_IDX .."
-        $SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.migrate.source.minPartition=$S_IDX --conf spark.migrate.source.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate migrate-*.jar
-        S_IDX=$(( $E_IDX + 1 ))
+  if [ $S_IDX -gt 8223372036854775807 ]
+  then
+    E_IDX=8999999999999999999
+  else
+    E_IDX=$(( $S_IDX + $SLICE ))
+  fi
+  echo "Running Migrate for Partition Range $S_IDX to $E_IDX .."
+  $SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.migrate.source.minPartition=$S_IDX --conf spark.migrate.source.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate migrate-*.jar
+  S_IDX=$(( $E_IDX + 1 ))
 done
 
 # Migrate final partition tokens from 9000000000000000000 to max-long