
Commit ecba107

CDM-67 simplifying rate limit config
1 parent 0a6b78a commit ecba107

File tree

8 files changed (+31, -43 lines)

src/main/java/com/datastax/cdm/job/AbstractJobSession.java

Lines changed: 4 additions & 8 deletions
@@ -42,17 +42,13 @@ protected AbstractJobSession(CqlSession originSession, CqlSession targetSession,
             printStatsAfter = propertyHelper.getInteger(KnownProperties.PRINT_STATS_AFTER);
         }

-        readLimiter = RateLimiter.create(propertyHelper.getInteger(KnownProperties.PERF_LIMIT_READ));
-        writeLimiter = RateLimiter.create(propertyHelper.getInteger(KnownProperties.PERF_LIMIT_WRITE));
-        Integer readLimitTarget = propertyHelper.getInteger(KnownProperties.PERF_LIMIT_READ_TARGET);
-        if (readLimitTarget == null || readLimitTarget < 0) { readLimitTarget = propertyHelper.getInteger(KnownProperties.PERF_LIMIT_READ);}
-        readLimiterTarget = RateLimiter.create(readLimitTarget);
+        originLimiter = RateLimiter.create(propertyHelper.getInteger(KnownProperties.PERF_RATELIMIT_ORIGIN));
+        targetLimiter = RateLimiter.create(propertyHelper.getInteger(KnownProperties.PERF_RATELIMIT_TARGET));
         maxRetries = propertyHelper.getInteger(KnownProperties.MAX_RETRIES);

         logger.info("PARAM -- Max Retries: {}", maxRetries);
-        logger.info("PARAM -- ReadRateLimit: {}", readLimiter.getRate());
-        logger.info("PARAM -- WriteRateLimit: {}", writeLimiter.getRate());
-        logger.info("PARAM -- TargetReadRateLimit: {}", readLimiterTarget.getRate());
+        logger.info("PARAM -- Origin Rate Limit: {}", originLimiter.getRate());
+        logger.info("PARAM -- Target Rate Limit: {}", targetLimiter.getRate());

         this.originSession = new EnhancedSession(propertyHelper, originSession, true);
         this.targetSession = new EnhancedSession(propertyHelper, targetSession, false);

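For readers unfamiliar with the limiter API, here is a minimal, self-contained sketch of the pattern the new code follows, assuming RateLimiter is Guava's com.google.common.util.concurrent.RateLimiter and using the commit's default rates (20000 origin, 40000 target) in place of the propertyHelper lookups; apart from the property names and defaults taken from the diff, everything here is illustrative.

    import com.google.common.util.concurrent.RateLimiter;

    public class RateLimitSketch {
        public static void main(String[] args) {
            // Defaults from this commit: spark.cdm.perfops.ratelimit.origin = 20000,
            // spark.cdm.perfops.ratelimit.target = 40000 (permits per second, per executor JVM).
            RateLimiter originLimiter = RateLimiter.create(20000);
            RateLimiter targetLimiter = RateLimiter.create(40000);

            // One permit per row read from Origin (blocks until a permit is free)...
            originLimiter.acquire(1);
            // ...and one permit per operation against Target (reads for counters/diffs, writes for upserts).
            targetLimiter.acquire(1);

            System.out.println("origin rate: " + originLimiter.getRate()
                    + ", target rate: " + targetLimiter.getRate());
        }
    }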
src/main/java/com/datastax/cdm/job/BaseJobSession.java

Lines changed: 2 additions & 3 deletions
@@ -24,9 +24,8 @@ public abstract class BaseJobSession {
     // then do the following to set the values as they are only applicable per JVM
     // (hence spark Executor)...
     // Rate = Total Throughput (write/read per sec) / Total Executors
-    protected RateLimiter readLimiter;
-    protected RateLimiter readLimiterTarget;
-    protected RateLimiter writeLimiter;
+    protected RateLimiter originLimiter;
+    protected RateLimiter targetLimiter;
     protected Integer maxRetries = 10;

     protected Integer printStatsAfter = 100000;

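Per the "Rate = Total Throughput / Total Executors" comment retained above, the configured rates apply per JVM (per Spark executor). A quick worked example: to hold total target-side throughput to 40000 ops/sec across 4 executors, set spark.cdm.perfops.ratelimit.target to 40000 / 4 = 10000 on each executor; likewise a 20000 ops/sec total budget against Origin with 4 executors means ratelimit.origin = 5000 per executor. The executor count here is illustrative.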
src/main/java/com/datastax/cdm/job/CopyJobSession.java

Lines changed: 3 additions & 4 deletions
@@ -76,7 +76,7 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
         Collection<CompletionStage<AsyncResultSet>> writeResults = new ArrayList<>();

         for (Row originRow : resultSet) {
-            originLimiter.acquire(1);
+            originLimiter.acquire(1);
             readCnt++;
             if (readCnt % printStatsAfter == 0) {
                 printCounts(false);
@@ -98,14 +98,13 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
             }
         }

-        writeLimiter.acquire(1);
-
         BoundStatement boundUpsert = bind(r);
         if (null == boundUpsert) {
             skipCnt++; // TODO: this previously skipped, why not errCnt?
             continue;
         }

+        targetLimiter.acquire(1);
         writeAsync(writeResults, boundUpsert);
         unflushedWrites++;

@@ -169,7 +168,7 @@ private void flushAndClearWrites(Collection<CompletionStage<AsyncResultSet>> wri

     private BoundStatement bind(Record r) {
         if (isCounterTable) {
-            readLimiterTarget.acquire(1);
+            targetLimiter.acquire(1);
             Record targetRecord = targetSelectByPKStatement.getRecord(r.getPk());
             if (null != targetRecord) {
                 r.setTargetRow(targetRecord.getTargetRow());

src/main/java/com/datastax/cdm/job/CopyPKJobSession.java

Lines changed: 2 additions & 1 deletion
@@ -56,6 +56,7 @@ public void getRowAndInsert(SplitPartitions.PKRows rowsList) {
                 return;
             }

+            originLimiter.acquire(1);
             Record recordFromOrigin = originSelectByPKStatement.getRecord(pk);
             if (null == recordFromOrigin) {
                 missingCounter.incrementAndGet();
@@ -79,7 +80,7 @@ public void getRowAndInsert(SplitPartitions.PKRows rowsList) {
             }
         }

-        writeLimiter.acquire(1);
+        targetLimiter.acquire(1);
         targetSession.getTargetUpsertStatement().putRecord(record);
         writeCounter.incrementAndGet();

src/main/java/com/datastax/cdm/job/DiffJobSession.java

Lines changed: 4 additions & 4 deletions
@@ -112,7 +112,7 @@ public void getDataAndDiff(BigInteger min, BigInteger max) {

         List<Record> recordsToDiff = new ArrayList<>(fetchSizeInRows);
         StreamSupport.stream(resultSet.spliterator(), false).forEach(originRow -> {
-            readLimiter.acquire(1);
+            originLimiter.acquire(1);
             Record record = new Record(pkFactory.getTargetPK(originRow), originRow, null);

             if (originSelectByPartitionRangeStatement.shouldFilterRecord(record)) {
@@ -132,7 +132,7 @@ public void getDataAndDiff(BigInteger min, BigInteger max) {
                 }
             }

-            readLimiterTarget.acquire(1);
+            targetLimiter.acquire(1);
             CompletionStage<AsyncResultSet> targetResult = targetSelectByPKStatement.getAsyncResult(r.getPk());

             if (null==targetResult) {
@@ -203,7 +203,7 @@ private void diff(Record record) {

         //correct data
         if (autoCorrectMissing) {
-            writeLimiter.acquire(1);
+            targetLimiter.acquire(1);
             targetSession.getTargetUpsertStatement().putRecord(record);
             correctedMissingCounter.incrementAndGet();
             logger.error("Inserted missing row in target: {}", record.getPk());
@@ -217,7 +217,7 @@ private void diff(Record record) {
             logger.error("Mismatch row found for key: {} Mismatch: {}", record.getPk(), diffData);

             if (autoCorrectMismatch) {
-                writeLimiter.acquire(1);
+                targetLimiter.acquire(1);
                 targetSession.getTargetUpsertStatement().putRecord(record);
                 correctedMismatchCounter.incrementAndGet();
                 logger.error("Corrected mismatch row in target: {}", record.getPk());

src/main/java/com/datastax/cdm/job/GuardrailCheckJobSession.java

Lines changed: 1 addition & 3 deletions
@@ -3,8 +3,6 @@
 import com.datastax.cdm.cql.statement.OriginSelectByPartitionRangeStatement;
 import com.datastax.cdm.data.PKFactory;
 import com.datastax.cdm.data.Record;
-import com.datastax.cdm.feature.Featureset;
-import com.datastax.cdm.feature.Guardrail;
 import com.datastax.oss.driver.api.core.CqlSession;
 import com.datastax.oss.driver.api.core.cql.*;
 import org.apache.logging.log4j.ThreadContext;
@@ -52,7 +50,7 @@ public void guardrailCheck(BigInteger min, BigInteger max) {
         ResultSet resultSet = originSelectByPartitionRangeStatement.execute(originSelectByPartitionRangeStatement.bind(min, max));
         String checkString;
         for (Row originRow : resultSet) {
-            readLimiter.acquire(1);
+            originLimiter.acquire(1);
             readCounter.addAndGet(1);

             if (readCounter.get() % printStatsAfter == 0) {

src/main/java/com/datastax/cdm/properties/KnownProperties.java

Lines changed: 6 additions & 8 deletions
@@ -99,9 +99,8 @@ public enum PropertyType {

     public static final String PERF_NUM_PARTS = "spark.cdm.perfops.numParts"; // 10000, was spark.splitSize
     public static final String PERF_BATCH_SIZE = "spark.cdm.perfops.batchSize"; // 5
-    public static final String PERF_LIMIT_READ = "spark.cdm.perfops.readRateLimit"; // 20000
-    public static final String PERF_LIMIT_READ_TARGET = "spark.cdm.perfops.readRateLimitTarget"; // readRateLimit
-    public static final String PERF_LIMIT_WRITE = "spark.cdm.perfops.writeRateLimit"; // 40000
+    public static final String PERF_RATELIMIT_ORIGIN = "spark.cdm.perfops.ratelimit.origin"; // 20000
+    public static final String PERF_RATELIMIT_TARGET = "spark.cdm.perfops.ratelimit.target"; // 40000

     public static final String READ_CL = "spark.cdm.perfops.consistency.read";
     public static final String WRITE_CL = "spark.cdm.perfops.consistency.write";
@@ -121,11 +120,10 @@ public enum PropertyType {
         defaults.put(PERF_NUM_PARTS, "10000");
         types.put(PERF_BATCH_SIZE, PropertyType.NUMBER);
         defaults.put(PERF_BATCH_SIZE, "5");
-        types.put(PERF_LIMIT_READ, PropertyType.NUMBER);
-        defaults.put(PERF_LIMIT_READ, "20000");
-        types.put(PERF_LIMIT_READ_TARGET, PropertyType.NUMBER);
-        types.put(PERF_LIMIT_WRITE, PropertyType.NUMBER);
-        defaults.put(PERF_LIMIT_WRITE, "40000");
+        types.put(PERF_RATELIMIT_ORIGIN, PropertyType.NUMBER);
+        defaults.put(PERF_RATELIMIT_ORIGIN, "20000");
+        types.put(PERF_RATELIMIT_TARGET, PropertyType.NUMBER);
+        defaults.put(PERF_RATELIMIT_TARGET, "40000");

         types.put(READ_CL, PropertyType.STRING);
         defaults.put(READ_CL, "LOCAL_QUORUM");

src/resources/sparkConf.properties

Lines changed: 9 additions & 12 deletions
@@ -147,15 +147,13 @@ spark.cdm.autocorrect.mismatch false
 #                      at a time so if your partition sizes are larger, this number may be increased.
 #                      If .batchSize would mean that more than 1 partition is often contained in a batch,
 #                      the figure should be reduced. Ideally < 1% of batches have more than 1 partition.
-#  .readRateLimit     : Defaults to 20000. Concurrent number of records that may will be read across
-#                      all parallel threads from Origin. This may be adjusted up (or down), depending on
-#                      the amount of data and the processing capacity of the Origin cluster.
-#  .readRateLimitTarget : Defaults to readRateLimit. Concurrent number of records that may will be read across
-#                      all parallel threads from Target. This may be adjusted up (or down), depending on
-#                      the amount of data and the processing capacity of the Target cluster.
-#  .writeRateLimit    : Defaults to 40000. Concurrent number of records that may will be written across
-#                      all parallel threads. This may be adjusted up (or down), depending on the amount
-#                      of data and the processing capacity of the Target cluster.
+#  .ratelimit
+#     .origin         : Defaults to 20000. Concurrent number of operations across all parallel threads
+#                      from Origin. This may be adjusted up (or down), depending on the amount of data
+#                      and the processing capacity of the Origin cluster.
+#     .target         : Defaults to 40000. Concurrent number of operations across all parallel threads
+#                      from Target. This may be adjusted up (or down), depending on the amount of data
+#                      and the processing capacity of the Target cluster.
 #
 # Other Parameters:
 # spark.cdm.perfops
@@ -175,9 +173,8 @@ spark.cdm.autocorrect.mismatch false
 #-----------------------------------------------------------------------------------------------------------
 spark.cdm.perfops.numParts 10000
 spark.cdm.perfops.batchSize 5
-spark.cdm.perfops.readRateLimit 20000
-#spark.cdm.perfops.readRateLimitTarget 20000
-spark.cdm.perfops.writeRateLimit 40000
+spark.cdm.perfops.ratelimit.origin 20000
+spark.cdm.perfops.ratelimit.target 40000
 #spark.cdm.perfops.consistency.read LOCAL_QUORUM
 #spark.cdm.perfops.consistency.write LOCAL_QUORUM
 #spark.cdm.perfops.printStatsAfter 100000

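To migrate an existing sparkConf.properties, the three old keys (removed from KnownProperties in this commit) collapse into the two new ones, with the single target limiter now covering both reads from and writes to the Target cluster; the values below are just the shipped defaults:

    # Before this commit
    # spark.cdm.perfops.readRateLimit        20000
    # spark.cdm.perfops.readRateLimitTarget  20000
    # spark.cdm.perfops.writeRateLimit       40000

    # After this commit
    spark.cdm.perfops.ratelimit.origin 20000
    spark.cdm.perfops.ratelimit.target 40000

The same keys can also be supplied at submit time (e.g. spark-submit --conf spark.cdm.perfops.ratelimit.origin=20000), assuming the job is launched through spark-submit as is typical for Spark jobs.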