Skip to content

Commit dfc18b6

Browse files
committed
Validation (diff) helper script - automate diff of a user-set percent of data volume (typically after a data migration).
1 parent 2ceec35 commit dfc18b6

File tree

2 files changed

+82
-16
lines changed

2 files changed

+82
-16
lines changed

src/resources/diff_data.sh

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#!/bin/bash

########################################################################################################################
#
# This script can be used to find differences in a certain percent of data (typically after a data migration) between
# two Cassandra Clusters (including Astra). It walks the Cassandra token range in 200 slice units and diffs the leading
# VALIDATE_PERCENT of every unit, so the sampled slices add up to the user-defined percent and are spread over the
# whole token range.
# e.g. If you decide to perform a 2% diff on a table with 1 billion rows, this script will perform 200 smaller diffs
#      of 100K slice each. Total diff volume will be 100K * 200 i.e. 20 million i.e. 2% of 1 billion.
#
# Before running the script, update the below params
#      SPARK_SUBMIT - Path to the spark-submit command
#      PROPS_FILE - Path to the spark configuration for the table
#      VALIDATE_PERCENT - Int value between 1 and 20 - Percent of data to be validated
#
# Run this script using nohup in background using a logfile and tail the logfile to monitor progress.
# Redirect stderr as well, since this script reports slices that failed to run on stderr.
# e.g.  nohup ./diff_data.sh > logs/spark/diff_data.out 2>&1 &
#
# To summarise the results after migration, you could use the below command
# grep -E "Running DiffData for Partition Range|Job Final Read Record Count|Job Final Read Valid Count" logs/spark/diff_data.out
#
# To validate results, run below command. Result should be 0 or close to 0, if not, find the cause of diff
# grep -c "ERROR DiffJobSession" logs/spark/diff_data.out
#
# A count of 0 is only meaningful if every slice actually ran, so also check for slices that could not be run
# grep -c "ERROR: spark-submit failed" logs/spark/diff_data.out
#
# Exit status: 0 when every slice ran successfully, non-zero when VALIDATE_PERCENT is invalid or any slice failed.
#
########################################################################################################################

# Path to spark-submit
SPARK_SUBMIT=/home/ubuntu/spark-2.4.8-bin-hadoop2.6/bin/spark-submit

# Path to spark configuration for the table
PROPS_FILE=/home/ubuntu/sparkConf.properties

# Set the percent (between 1 to 20) of data to be validated - value over 20 is not advised
VALIDATE_PERCENT=1

# ** DO NOT CHANGE ANYTHING BELOW THIS **

# Fail fast on a bad percent: a non-numeric or out-of-range value would otherwise silently produce empty or
# overlapping slices.
if ! [[ "$VALIDATE_PERCENT" =~ ^[0-9]{1,3}$ ]] \
    || (( 10#$VALIDATE_PERCENT < 1 || 10#$VALIDATE_PERCENT > 100 )); then
  echo "ERROR: VALIDATE_PERCENT must be a whole number between 1 and 100 (got '$VALIDATE_PERCENT')" >&2
  exit 1
fi
# Normalise to base 10 so a value such as 08 is not parsed as (invalid) octal in the arithmetic below
VALIDATE_PERCENT=$(( 10#$VALIDATE_PERCENT ))

# Starting partition token
MIN_TOKEN=-9223372036854775808
MAX_TOKEN=9223372036854775807
S_IDX=$MIN_TOKEN

# Number of slice units the token range (min-long to max-long) is walked in
SLICE_COUNT=200

# Slice unit - consecutive slice starts are SLICE_UNIT + 1 tokens apart, which fits 200 units in the token range
SLICE_UNIT=$(( MAX_TOKEN / 100 ))

# Slice to be validated/diff within each of the 200 slice units based on input VALIDATE_PERCENT
SLICE_DIFF=$(( ( SLICE_UNIT / 100 ) * VALIDATE_PERCENT ))

echo "Starting DiffData using $PROPS_FILE !!"

# Validate a percent of partition-token-ranges in progressive slices.
# The loop is driven by a counter rather than by comparing S_IDX with a ceiling: the start of the 200th unit lies
# above MAX_TOKEN - SLICE_UNIT, so a ceiling test skips the last unit and leaves the top of the range unsampled.
RUN_COUNT=0
FAILED_SLICES=0
for (( slice = 1; slice <= SLICE_COUNT; slice++ )); do
  # Clamp the end token so a large VALIDATE_PERCENT cannot overflow past max-long in the last unit
  if (( S_IDX > MAX_TOKEN - SLICE_DIFF )); then
    E_IDX=$MAX_TOKEN
  else
    E_IDX=$(( S_IDX + SLICE_DIFF ))
  fi

  echo "Running DiffData for Partition Range $S_IDX to $E_IDX .."
  # migrate-*.jar is intentionally left unquoted so the shell expands it to the jar in the current directory
  if ! "$SPARK_SUBMIT" --properties-file "$PROPS_FILE" --master "local[*]" \
      --conf "spark.migrate.source.minPartition=$S_IDX" \
      --conf "spark.migrate.source.maxPartition=$E_IDX" \
      --class datastax.astra.migrate.DiffData migrate-*.jar; then
    # Keep going so the remaining slices are still sampled, but record the failure: a slice that never ran
    # produces no diff errors in the log and would otherwise look like a clean validation.
    echo "ERROR: spark-submit failed for Partition Range $S_IDX to $E_IDX" >&2
    FAILED_SLICES=$(( FAILED_SLICES + 1 ))
  fi
  RUN_COUNT=$(( RUN_COUNT + 1 ))

  # Advance only between slices - stepping past the last unit would overflow max-long
  if (( slice < SLICE_COUNT )); then
    S_IDX=$(( S_IDX + SLICE_UNIT + 1 ))
  fi
done

if (( FAILED_SLICES > 0 )); then
  echo "Completed DiffData using $PROPS_FILE - $FAILED_SLICES of $RUN_COUNT slices failed to run !!" >&2
else
  echo "Completed DiffData using $PROPS_FILE !!"
fi

# Last command: the script's exit status is non-zero when at least one slice failed to run
(( FAILED_SLICES == 0 ))

src/resources/migrate_data.sh

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,14 @@
22

33
###########################################################################################################################
44
#
5-
# This script can be used to Migrate data between Cassandra Clusters (including Astra) in chunks. It migrates data by
6-
# partition token ranges sequentially in progressive slices. It also helps to restart migration from a point where the
7-
# previous run might have stopped/failed for whatever reasons.
5+
# This script can be used to Migrate data between two Cassandra Clusters (including Astra) in chunks. It migrates data
6+
# sequentially in progressive token-range slices. It also helps to restart migration from a point where the previous
7+
# run might have stopped/failed for whatever reasons.
8+
#
9+
# Before running the script, update the below params
10+
# SPARK_SUBMIT - Path to the spark-submit command
11+
# PROPS_FILE - Path to the spark configuration for the table
12+
# S_IDX - Change this value only if you want to set a custom starting point (e.g. after a previous incomplete run)
813
#
914
# *** IMP Note: Run this script using nohup in background using a logfile and tail the logfile to monitor progress ***
1015
# e.g. nohup ./migrate_data.sh > logs/spark/migrate_data.out &
@@ -32,24 +37,24 @@ echo "Starting Migration using $PROPS_FILE !!"
3237
# Migrate initial partition tokens from min-long to -9000000000000000000
3338
if [ $S_IDX -lt -9000000000000000000 ]
3439
then
35-
E_IDX=-9000000000000000001
36-
echo "Running Migrate for Partition Range $S_IDX to $E_IDX .."
37-
$SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.migrate.source.minPartition=$S_IDX --conf spark.migrate.source.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate migrate-*.jar
38-
S_IDX=-9000000000000000000
40+
E_IDX=-9000000000000000001
41+
echo "Running Migrate for Partition Range $S_IDX to $E_IDX .."
42+
$SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.migrate.source.minPartition=$S_IDX --conf spark.migrate.source.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate migrate-*.jar
43+
S_IDX=-9000000000000000000
3944
fi
4045

4146
# Migrate partition tokens from -9000000000000000000 to 8999999999999999999 in slices of 1000000000000000000
4247
while [ $S_IDX -lt 9000000000000000000 ]
4348
do
44-
if [ $S_IDX -gt 8223372036854775807 ]
45-
then
46-
E_IDX=8999999999999999999
47-
else
48-
E_IDX=$(( $S_IDX + $SLICE ))
49-
fi
50-
echo "Running Migrate for Partition Range $S_IDX to $E_IDX .."
51-
$SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.migrate.source.minPartition=$S_IDX --conf spark.migrate.source.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate migrate-*.jar
52-
S_IDX=$(( $E_IDX + 1 ))
49+
if [ $S_IDX -gt 8223372036854775807 ]
50+
then
51+
E_IDX=8999999999999999999
52+
else
53+
E_IDX=$(( $S_IDX + $SLICE ))
54+
fi
55+
echo "Running Migrate for Partition Range $S_IDX to $E_IDX .."
56+
$SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.migrate.source.minPartition=$S_IDX --conf spark.migrate.source.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate migrate-*.jar
57+
S_IDX=$(( $E_IDX + 1 ))
5358
done
5459

5560
# Migrate final partition tokens from 9000000000000000000 to max-long

0 commit comments

Comments
 (0)