Migration helper script (migrate data sequentially in progressive token-range slices)

pravinbhat · pravinbhat · commit 2ceec35cd0fc · 2022-06-28T09:51:37.000-04:00
diff --git a/src/resources/migrate_data.sh b/src/resources/migrate_data.sh
@@ -0,0 +1,107 @@
+#! /bin/bash
+
+###########################################################################################################################
+#
+# This script can be used to migrate data between Cassandra clusters (including Astra) in chunks. It migrates data by
+# partition token ranges sequentially in progressive slices, running one spark-submit job per slice.
+#
+# The script stops at the first slice whose spark-submit job exits with a non-zero status and reports the start token
+# of that slice. To resume, rerun it with S_IDX set to that token (edit the default below or set S_IDX in the
+# environment).
+#
+# The migrate-*.jar file is looked up in the directory the script is started from; exactly one must be present.
+#
+# *** IMP Note: Run this script using nohup in background using a logfile and tail the logfile to monitor progress ***
+# e.g.  nohup ./migrate_data.sh > logs/spark/migrate_data.out 2>&1 &
+#
+# To monitor migration progress, you could use the below command
+# grep "Running Migrate for Partition Range" logs/spark/migrate_data.out
+#
+###########################################################################################################################
+
+# Path to spark-submit (a SPARK_SUBMIT value set in the environment takes precedence)
+SPARK_SUBMIT=${SPARK_SUBMIT:-/home/ubuntu/spark-2.4.8-bin-hadoop2.6/bin/spark-submit}
+
+# Path to spark configuration for the table (a PROPS_FILE value set in the environment takes precedence)
+PROPS_FILE=${PROPS_FILE:-/home/ubuntu/sparkConf.properties}
+
+# Starting partition token (Default is Min possible value of a Cassandra token - min long value in Java).
+# Change this value only if you want to start from a custom partition token (e.g. when a migrate job failed midway)
+S_IDX=${S_IDX:--9223372036854775808}
+
+# ** DO NOT CHANGE ANYTHING BELOW THIS **
+SLICE=999999999999999999
+# First and last token of the middle range that is migrated in regular slices
+FIRST_SLICED_TOKEN=-9000000000000000000
+LAST_SLICED_TOKEN=8999999999999999999
+# Largest token (max long value in Java); the last slice always ends here
+MAX_TOKEN=9223372036854775807
+
+# Reject anything that is not a plain decimal integer (leading zeros would be read as octal by the arithmetic below)
+if ! [[ "$S_IDX" =~ ^-?(0|[1-9][0-9]*)$ ]]
+then
+        echo "Invalid starting partition token '$S_IDX' (S_IDX must be a decimal integer)" >&2
+        exit 1
+fi
+
+# Resolve the application jar once so that every slice runs the same file
+JAR=
+JAR_COUNT=0
+for CANDIDATE in migrate-*.jar
+do
+        if [ -f "$CANDIDATE" ]
+        then
+                JAR=$CANDIDATE
+                JAR_COUNT=$(( JAR_COUNT + 1 ))
+        fi
+done
+if [ "$JAR_COUNT" -ne 1 ]
+then
+        echo "Expected exactly one migrate-*.jar in $PWD but found $JAR_COUNT" >&2
+        exit 1
+fi
+
+echo "Starting Migration using $PROPS_FILE !!"
+
+# Migrate one slice per iteration until the slice that ends at max-long has been processed
+while true
+do
+        if [ "$S_IDX" -lt "$FIRST_SLICED_TOKEN" ]
+        then
+                # Initial partition tokens from the starting token to -9000000000000000001
+                E_IDX=$(( FIRST_SLICED_TOKEN - 1 ))
+        elif [ "$S_IDX" -gt "$LAST_SLICED_TOKEN" ]
+        then
+                # Final partition tokens from 9000000000000000000 (or a later starting token) to max-long
+                E_IDX=$MAX_TOKEN
+        elif [ "$S_IDX" -ge "$(( MAX_TOKEN - SLICE ))" ]
+        then
+                # Close to max-long: cap the slice so that adding SLICE cannot overflow
+                E_IDX=$LAST_SLICED_TOKEN
+        else
+                E_IDX=$(( S_IDX + SLICE ))
+        fi
+
+        echo "Running Migrate for Partition Range $S_IDX to $E_IDX .."
+        "$SPARK_SUBMIT" --properties-file "$PROPS_FILE" --master "local[*]" \
+                --conf spark.migrate.source.minPartition="$S_IDX" \
+                --conf spark.migrate.source.maxPartition="$E_IDX" \
+                --class datastax.astra.migrate.Migrate "$JAR"
+        RC=$?
+        if [ "$RC" -ne 0 ]
+        then
+                # Do not move on: later slices would hide the gap and the run would report completion
+                echo "Migrate FAILED for Partition Range $S_IDX to $E_IDX (spark-submit exit status $RC)" >&2
+                echo "Rerun with S_IDX=$S_IDX to resume from this range" >&2
+                exit "$RC"
+        fi
+
+        # max-long has no successor, so stop here instead of computing E_IDX + 1
+        if [ "$E_IDX" -eq "$MAX_TOKEN" ]
+        then
+                break
+        fi
+        S_IDX=$(( E_IDX + 1 ))
+done
+
+echo "Completed Migration using $PROPS_FILE !!"