
Commit f8d3eb9

pravinbhat authored and msmygit committed
Improved docs, logging & set default max-retry to 0
1 parent 0154c50 commit f8d3eb9


6 files changed, +91 / -79 lines changed


pom.xml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 
     <groupId>datastax.astra.migrate</groupId>
     <artifactId>cassandra-data-migrator</artifactId>
-    <version>3.0.4</version>
+    <version>3.0.5</version>
     <packaging>jar</packaging>
 
     <properties>

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 3 additions & 2 deletions
@@ -33,7 +33,7 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
         this.sourceSession = sourceSession;
         this.astraSession = astraSession;
 
-        batchSize = new Integer(Util.getSparkPropOr(sc, "spark.batchSize", "1"));
+        batchSize = new Integer(Util.getSparkPropOr(sc, "spark.batchSize", "5"));
         fetchSizeInRows = new Integer(Util.getSparkPropOr(sc, "spark.read.fetch.sizeInRows", "1000"));
         printStatsAfter = new Integer(Util.getSparkPropOr(sc, "spark.printStatsAfter", "100000"));
         if (printStatsAfter < 1) {
@@ -42,7 +42,7 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
 
         readLimiter = RateLimiter.create(new Integer(Util.getSparkPropOr(sc, "spark.readRateLimit", "20000")));
         writeLimiter = RateLimiter.create(new Integer(Util.getSparkPropOr(sc, "spark.writeRateLimit", "40000")));
-        maxRetries = Integer.parseInt(sc.get("spark.maxRetries", "10"));
+        maxRetries = Integer.parseInt(sc.get("spark.maxRetries", "0"));
 
         sourceKeyspaceTable = Util.getSparkProp(sc, "spark.origin.keyspaceTable");
         astraKeyspaceTable = Util.getSparkProp(sc, "spark.target.keyspaceTable");
@@ -88,6 +88,7 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
         logger.info("PARAM -- Read Consistency: {}", readConsistencyLevel);
         logger.info("PARAM -- Write Consistency: {}", writeConsistencyLevel);
         logger.info("PARAM -- Write Batch Size: {}", batchSize);
+        logger.info("PARAM -- Max Retries: {}", maxRetries);
         logger.info("PARAM -- Read Fetch Size: {}", fetchSizeInRows);
         logger.info("PARAM -- Source Keyspace Table: {}", sourceKeyspaceTable);
         logger.info("PARAM -- Destination Keyspace Table: {}", astraKeyspaceTable);

src/main/java/datastax/astra/migrate/CopyJobSession.java

Lines changed: 7 additions & 8 deletions
@@ -44,10 +44,9 @@ public static CopyJobSession getInstance(CqlSession sourceSession, CqlSession as
 
     public void getDataAndInsert(BigInteger min, BigInteger max) {
         logger.info("ThreadID: {} Processing min: {} max: {}", Thread.currentThread().getId(), min, max);
-        int maxAttempts = maxRetries;
         boolean done = false;
-
-        for (int retryCount = 1; retryCount <= maxAttempts && !done; retryCount++) {
+        int maxAttempts = maxRetries + 1;
+        for (int attempts = 1; attempts <= maxAttempts && !done; attempts++) {
             long readCnt = 0;
             long writeCnt = 0;
             long skipCnt = 0;
@@ -156,15 +155,15 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
                 skippedCounter.addAndGet(skipCnt);
                 done = true;
             } catch (Exception e) {
-                if (retryCount == maxAttempts) {
+                if (attempts == maxAttempts) {
                     readCounter.addAndGet(readCnt);
                     writeCounter.addAndGet(writeCnt);
                     skippedCounter.addAndGet(skipCnt);
                     errorCounter.addAndGet(readCnt - writeCnt - skipCnt);
                 }
-                logger.error("Error occurred retry#: {}", retryCount, e);
-                logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Retry# {}",
-                        Thread.currentThread().getId(), min, max, retryCount);
+                logger.error("Error occurred during Attempt#: {}", attempts, e);
+                logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Attempt# {}",
+                        Thread.currentThread().getId(), min, max, attempts);
                 logger.error("Error stats Read#: {}, Wrote#: {}, Skipped#: {}, Error#: {}", readCnt, writeCnt, skipCnt, (readCnt - writeCnt - skipCnt));
             }
         }
@@ -188,7 +187,7 @@ public synchronized void printCounts(boolean isFinal) {
     private int iterateAndClearWriteResults(Collection<CompletionStage<AsyncResultSet>> writeResults, int incrementBy) throws Exception {
         int cnt = 0;
         for (CompletionStage<AsyncResultSet> writeResult : writeResults) {
-            //wait for the writes to complete for the batch. The Retry policy, if defined, should retry the write on timeouts.
+            //wait for the writes to complete for the batch. The Retry policy, if defined, should retry the write on timeouts.
             writeResult.toCompletableFuture().get().one();
             cnt += incrementBy;
         }
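
The net effect of the reworked loop, shown as a minimal standalone sketch (processSlice is a hypothetical placeholder for the body of getDataAndInsert): with the new default spark.maxRetries=0, maxAttempts is 1, so each token-range slice is processed exactly once and only re-attempted when retries are explicitly configured:

public class AttemptLoopSketch {
    public static void main(String[] args) {
        int maxRetries = 0;                // new default from this commit
        int maxAttempts = maxRetries + 1;  // retries are counted on top of the first attempt
        boolean done = false;

        for (int attempts = 1; attempts <= maxAttempts && !done; attempts++) {
            try {
                processSlice();            // placeholder for the real read/write work on a slice
                done = true;
            } catch (Exception e) {
                System.err.println("Error occurred during Attempt#: " + attempts);
            }
        }
    }

    // Hypothetical stand-in for the per-slice migration work
    static void processSlice() {
    }
}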

src/main/java/datastax/astra/migrate/DiffJobSession.java

Lines changed: 7 additions & 7 deletions
@@ -55,9 +55,9 @@ public static DiffJobSession getInstance(CqlSession sourceSession, CqlSession as
 
     public void getDataAndDiff(BigInteger min, BigInteger max) {
         logger.info("ThreadID: {} Processing min: {} max: {}", Thread.currentThread().getId(), min, max);
-        int maxAttempts = maxRetries;
-        for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) {
-
+        boolean done = false;
+        int maxAttempts = maxRetries + 1;
+        for (int attempts = 1; attempts <= maxAttempts && !done; attempts++) {
             try {
                 // cannot do batching if the writeFilter is greater than 0
                 ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ?
@@ -86,11 +86,11 @@ public void getDataAndDiff(BigInteger min, BigInteger max) {
                     }
                 });
                 diffAndClear(srcToTargetRowMap);
-                retryCount = maxAttempts;
+                done = true;
             } catch (Exception e) {
-                logger.error("Error occurred retry#: {}", retryCount, e);
-                logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Retry# {}",
-                        Thread.currentThread().getId(), min, max, retryCount);
+                logger.error("Error occurred during Attempt#: {}", attempts, e);
+                logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Attempt# {}",
+                        Thread.currentThread().getId(), min, max, attempts);
             }
         }
 

src/main/java/datastax/astra/migrate/OriginCountJobSession.java

Lines changed: 7 additions & 7 deletions
@@ -77,9 +77,9 @@ public static OriginCountJobSession getInstance(CqlSession sourceSession, SparkC
 
     public void getData(BigInteger min, BigInteger max) {
         logger.info("ThreadID: {} Processing min: {} max: {}", Thread.currentThread().getId(), min, max);
-        int maxAttempts = maxRetries;
-        for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) {
-
+        boolean done = false;
+        int maxAttempts = maxRetries + 1;
+        for (int attempts = 1; attempts <= maxAttempts && !done; attempts++) {
             try {
                 ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ?
                         min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact())
@@ -138,11 +138,11 @@ public void getData(BigInteger min, BigInteger max) {
                 }
 
                 logger.info("ThreadID: {} Final Read Record Count: {}", Thread.currentThread().getId(), readCounter.get());
-                retryCount = maxAttempts;
+                done = true;
             } catch (Exception e) {
-                logger.error("Error occurred retry#: {}", retryCount, e);
-                logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Retry# {}",
-                        Thread.currentThread().getId(), min, max, retryCount);
+                logger.error("Error occurred during Attempt#: {}", attempts, e);
+                logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Attempt# {}",
+                        Thread.currentThread().getId(), min, max, attempts);
             }
         }
     }

src/resources/sparkConf.properties

Lines changed: 66 additions & 54 deletions
@@ -1,119 +1,131 @@
+# Origin cluster credentials
 spark.origin.host localhost
 spark.origin.username some-username
 spark.origin.password some-secret-password
 spark.origin.keyspaceTable test.a1
 
+# Target cluster credentials
 spark.target.scb file:///aaa/bbb/secure-connect-enterprise.zip
 spark.target.username client-id
 spark.target.password client-secret
 spark.target.keyspaceTable test.a2
+
+# Add 'missing' rows (during 'Validation') in 'Target' from 'Origin'. N/A for 'Migration'
 spark.target.autocorrect.missing false
+# Update 'mismatched' rows (during 'Validation') in 'Target' to match 'Origin'. N/A for 'Migration'
 spark.target.autocorrect.mismatch false
 
-spark.maxRetries 3
+# Read & Write rate-limits(rows/second). Higher value will improve performance and put more load on cluster
 spark.readRateLimit 20000
 spark.writeRateLimit 20000
+
+# Used to split Cassandra token-range into slices and migrate random slices one at a time
+# 10K splits usually works for tables up to 100GB (uncompressed) with balanced token distribution
+# For larger tables, increase the splits relatively i.e. use 100K for a 1TB table
 spark.splitSize 10000
-spark.batchSize 5
 
+# Use a value of 1 (disable batching) when primary-key and partition-key are same
+# For tables with high avg count of rows/partition, use higher value to improve performance
+spark.batchSize 10
+
+# Below 'query' properties are set based on table schema
 spark.query.origin partition-key,clustering-key,order-date,amount
 spark.query.origin.partitionKey partition-key
 spark.query.target.id partition-key,clustering-key
 spark.query.types 9,1,4,3
-spark.query.ttl.cols 2,3
-spark.query.writetime.cols 2,3
+#############################################################################################################
+# Following are the supported data types and their corresponding [Cassandra data-types]
+# 0: ascii, text, varchar
+# 1: int
+# 2: bigint, counter
+# 3: double
+# 4: timestamp
+# 5: map (separate type by %) - Example: 5%1%0 for map<int, text>
+# 6: list (separate type by %) - Example: 6%0 for list<text>
+# 7: blob
+# 8: set (separate type by %) - Example: 8%0 for set<text>
+# 9: uuid, timeuuid
+# 10: boolean
+# 11: tuple
+# 12: float
+# 13: tinyint
+# 14: decimal
+# 15: date
+# 16: UDT [any user-defined-type created using 'CREATE TYPE']
+# 17: varint
+# 18: time
+# 19: smallint
+# Note: Ignore "Frozen" while mapping Collections (Map/List/Set) - Example: 5%1%0 for frozen<map<int, text>>
+#############################################################################################################
 
-##### ENABLE ONLY IF COLUMN NAMES ON TARGET IS DIFFERENT FROM ORIGIN (SCHEMA & DATA-TYPES MUST BE SAME) #####
+# ENABLE ONLY IF COLUMN NAMES ON TARGET IS DIFFERENT FROM ORIGIN (SCHEMA & DATA-TYPES MUST BE SAME)
 #spark.query.target partition-key,clustering-key,order-date,amount
 
-################# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE SOME DATA BASED ON CQL FILTER #################
-#spark.query.condition
-
-################# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE SOME % (NOT 100%) DATA ######################
-#spark.coveragePercent 10
+# The tool adds TTL & Writetime at row-level (not field-level).
+# The largest TTL & Writetime values are used if multiple indexes are listed (comma separated)
+# Comma separated column indexes from "spark.query.origin" used to find largest TTL or Writetime
+spark.query.ttl.cols 2,3
+spark.query.writetime.cols 2,3
 
-#################### ENABLE ONLY IF WANT LOG STATS MORE OR LESS FREQUENTLY THAN DEFAULT #####################
-#spark.printStatsAfter 100000
+# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE ROWS BASED ON CQL FILTER
+#spark.query.condition
 
-################################# ENABLE ONLY IF IT IS A COUNTER TABLE ######################################
+# ENABLE ONLY IF IT IS A COUNTER TABLE
 #spark.counterTable false
 #spark.counterTable.cql
 #spark.counterTable.cql.index 0
 
-######## ENABLE ONLY IF YOU WANT TO FILTER BASED ON WRITE-TIME (values must be in microseconds) #############
+# ENABLE ONLY IF YOU WANT TO FILTER BASED ON WRITE-TIME (values must be in microseconds)
#spark.origin.writeTimeStampFilter false
 #spark.origin.minWriteTimeStampFilter 0
 #spark.origin.maxWriteTimeStampFilter 4102444800000000
 
-######## ENABLE ONLY IF YOU WANT TO USE READ AND/OR WRITE CONSISTENCY OTHER THAN LOCAL_QUORUM ##############
+# ENABLE ONLY IF retries needed (Retry a slice of token-range if an exception occurs)
+#spark.maxRetries 0
+
+# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE SOME % OF ROWS (NOT 100%)
+#spark.coveragePercent 100
+
+# ENABLE ONLY IF WANT LOG STATS MORE OR LESS FREQUENTLY THAN DEFAULT
+#spark.printStatsAfter 100000
+
+# ENABLE ONLY IF YOU WANT TO USE READ AND/OR WRITE CONSISTENCY OTHER THAN LOCAL_QUORUM
 #spark.consistency.read LOCAL_QUORUM
 #spark.consistency.write LOCAL_QUORUM
 
-############# ENABLE ONLY IF YOU WANT TO REDUCE FETCH-SIZE TO AVOID FrameTooLongException ##################
+# ENABLE ONLY IF YOU WANT TO REDUCE FETCH-SIZE TO AVOID FrameTooLongException
 #spark.read.fetch.sizeInRows 1000
 
-############### ENABLE ONLY IF YOU WANT TO USE CUSTOM FIXED WRITETIME VALUE ON TARGET ######################
+# ENABLE ONLY IF YOU WANT TO USE CUSTOM FIXED WRITETIME VALUE ON TARGET
 #spark.target.custom.writeTime 0
 
-#################### ONLY USE if SKIPPING recs greater than 10MB from Origin needed #########################
+# ENABLE ONLY TO SKIP recs greater than 10MB from Origin (to avoid Astra Guardrail error)
 #spark.fieldGuardraillimitMB 10
 
-#################### ONLY USE if count of recs greater than 10MB from Origin needed #########################
+# ENABLE ONLY TO count of recs greater than 10MB from Origin needed
 #spark.origin.checkTableforColSize false
 #spark.origin.checkTableforColSize.cols partition-key,clustering-key
 #spark.origin.checkTableforColSize.cols.types 9,1
 
-############################ ONLY USE if needing to filter data from Origin #################################
+# ENABLE ONLY TO filter data from Origin
 #spark.origin.FilterData false
 #spark.origin.FilterColumn test
 #spark.origin.FilterColumnIndex 2
 #spark.origin.FilterColumnType 6%16
 #spark.origin.FilterColumnValue test
 
-########################## ONLY USE if SSL clientAuth is enabled on origin Cassandra/DSE ####################
+# ONLY USE if SSL clientAuth is enabled on origin Cassandra/DSE
 #spark.origin.trustStore.path
 #spark.origin.trustStore.password
 #spark.origin.trustStore.type JKS
 #spark.origin.keyStore.path
 #spark.origin.keyStore.password
 #spark.origin.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA
 
-####################### ONLY USE if SSL clientAuth is enabled on target Cassandra/DSE #######################
+# ONLY USE if SSL clientAuth is enabled on target Cassandra/DSE
 #spark.target.trustStore.path
 #spark.target.trustStore.password
 #spark.target.trustStore.type JKS
 #spark.target.keyStore.path
 #spark.target.keyStore.password
 #spark.target.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA
-
-#############################################################################################################
-# Following are the supported data types and their corresponding [Cassandra data-types]
-# 0: ascii, text, varchar
-# 1: int
-# 2: bigint, counter
-# 3: double
-# 4: timestamp
-# 5: map (separate type by %) - Example: 5%1%0 for map<int, text>
-# 6: list (separate type by %) - Example: 6%0 for list<text>
-# 7: blob
-# 8: set (separate type by %) - Example: 8%0 for set<text>
-# 9: uuid, timeuuid
-# 10: boolean
-# 11: tuple
-# 12: float
-# 13: tinyint
-# 14: decimal
-# 15: date
-# 16: UDT [any user-defined-type created using 'CREATE TYPE']
-# 17: varint
-# 18: time
-# 19: smallint
-
-# Note: Ignore "Frozen" while mapping Collections (Map/List/Set) - Example: 5%1%0 for frozen<map<int, text>>
-#
-# "spark.query.ttl.cols" - Comma separated column indexes from "spark.query.origin" used to find largest TTL.
-# "spark.query.writetime.cols" - Comma separated column indexes from "spark.query.origin" used to find largest writetime.
-# Note: The tool migrates TTL & Writetimes at row-level and not field-level.
-# Migration will use the largest TTL & Writetimes value per row.
-#
-#############################################################################################################
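
To illustrate how the positional conventions documented above fit together for the example table, here is a minimal sketch. It is not the tool's parser, and writetimeOf is a hypothetical stand-in for reading WRITETIME(column) of an origin row: spark.query.types describes each column listed in spark.query.origin, and spark.query.ttl.cols / spark.query.writetime.cols are indexes back into that same list, with the largest value applied per row:

import java.util.Arrays;
import java.util.List;

public class ConfConventionSketch {
    public static void main(String[] args) {
        List<String> originCols = Arrays.asList("partition-key,clustering-key,order-date,amount".split(","));
        List<String> typeCodes = Arrays.asList("9,1,4,3".split(","));  // uuid, int, timestamp, double
        int[] writetimeCols = {2, 3};                                  // spark.query.writetime.cols 2,3

        // Each type code positionally describes the matching column in spark.query.origin
        for (int i = 0; i < originCols.size(); i++) {
            System.out.println(originCols.get(i) + " -> type code " + typeCodes.get(i));
        }

        // TTL/Writetime indexes point back into spark.query.origin; per the comments above,
        // the largest value across the listed columns is applied at row-level.
        long rowWritetime = Long.MIN_VALUE;
        for (int idx : writetimeCols) {
            rowWritetime = Math.max(rowWritetime, writetimeOf(idx));
        }
        System.out.println("row-level writetime used: " + rowWritetime);
    }

    // Hypothetical stand-in for reading WRITETIME(<column>) of the origin row
    static long writetimeOf(int colIndex) {
        return 1_600_000_000_000_000L + colIndex;
    }
}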
