Skip to content

Commit 9fd1e22

Browse files
authored
Merge pull request #19 from datastax/feature/async-diffdata
Data Validation performance improvements
2 parents: 13d93f2 + ce9e289 · commit 9fd1e22

File tree

3 files changed

+47
-33
lines changed

3 files changed

+47
-33
lines changed

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
<groupId>datastax.astra.migrate</groupId>
55
<artifactId>cassandra-data-migrator</artifactId>
6-
<version>1.9</version>
6+
<version>2.0</version>
77
<packaging>jar</packaging>
88

99
<properties>

src/main/java/datastax/astra/migrate/DiffJobSession.java

Lines changed: 45 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,17 @@
22

33
import com.datastax.oss.driver.api.core.ConsistencyLevel;
44
import com.datastax.oss.driver.api.core.CqlSession;
5+
import com.datastax.oss.driver.api.core.cql.AsyncResultSet;
56
import com.datastax.oss.driver.api.core.cql.ResultSet;
67
import com.datastax.oss.driver.api.core.cql.Row;
78
import org.apache.spark.SparkConf;
89
import org.slf4j.Logger;
910
import org.slf4j.LoggerFactory;
1011

1112
import java.math.BigInteger;
12-
import java.util.concurrent.ForkJoinPool;
13+
import java.util.HashMap;
14+
import java.util.Map;
15+
import java.util.concurrent.CompletionStage;
1316
import java.util.concurrent.atomic.AtomicLong;
1417
import java.util.stream.IntStream;
1518
import java.util.stream.StreamSupport;
@@ -51,7 +54,6 @@ public static DiffJobSession getInstance(CqlSession sourceSession, CqlSession as
5154
}
5255

5356
public void getDataAndDiff(BigInteger min, BigInteger max) {
54-
ForkJoinPool customThreadPool = new ForkJoinPool();
5557
logger.info("TreadID: " + Thread.currentThread().getId() + " Processing min: " + min + " max:" + max);
5658
int maxAttempts = maxRetries;
5759
for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) {
@@ -61,27 +63,30 @@ public void getDataAndDiff(BigInteger min, BigInteger max) {
6163
ResultSet resultSet = sourceSession.execute(
6264
sourceSelectStatement.bind(hasRandomPartitioner ? min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact()).setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM));
6365

64-
customThreadPool.submit(() -> {
65-
StreamSupport.stream(resultSet.spliterator(), true).forEach(sRow -> {
66-
readLimiter.acquire(1);
67-
// do not process rows less than writeTimeStampFilter
68-
if (!(writeTimeStampFilter && (getLargestWriteTimeStamp(sRow) < minWriteTimeStampFilter
69-
|| getLargestWriteTimeStamp(sRow) > maxWriteTimeStampFilter))) {
70-
if (readCounter.incrementAndGet() % printStatsAfter == 0) {
71-
printCounts("Current");
72-
}
73-
74-
Row astraRow = astraSession
75-
.execute(selectFromAstra(astraSelectStatement, sRow)).one();
76-
diff(sRow, astraRow);
77-
} else {
78-
readCounter.incrementAndGet();
79-
skippedCounter.incrementAndGet();
66+
Map<Row, CompletionStage<AsyncResultSet>> srcToTargetRowMap = new HashMap<Row, CompletionStage<AsyncResultSet>>();
67+
StreamSupport.stream(resultSet.spliterator(), false).forEach(srcRow -> {
68+
readLimiter.acquire(1);
69+
// do not process rows less than writeTimeStampFilter
70+
if (!(writeTimeStampFilter && (getLargestWriteTimeStamp(srcRow) < minWriteTimeStampFilter
71+
|| getLargestWriteTimeStamp(srcRow) > maxWriteTimeStampFilter))) {
72+
if (readCounter.incrementAndGet() % printStatsAfter == 0) {
73+
printCounts("Current");
8074
}
81-
});
8275

83-
printCounts("Final");
84-
}).get();
76+
CompletionStage<AsyncResultSet> targetRowFuture = astraSession
77+
.executeAsync(selectFromAstra(astraSelectStatement, srcRow));
78+
srcToTargetRowMap.put(srcRow, targetRowFuture);
79+
if (srcToTargetRowMap.size() > 1000) {
80+
diffAndClear(srcToTargetRowMap);
81+
}
82+
} else {
83+
readCounter.incrementAndGet();
84+
skippedCounter.incrementAndGet();
85+
}
86+
});
87+
diffAndClear(srcToTargetRowMap);
88+
89+
printCounts("Final");
8590

8691
retryCount = maxAttempts;
8792
} catch (Exception e) {
@@ -91,7 +96,18 @@ public void getDataAndDiff(BigInteger min, BigInteger max) {
9196
}
9297
}
9398

94-
customThreadPool.shutdownNow();
99+
}
100+
101+
private void diffAndClear(Map<Row, CompletionStage<AsyncResultSet>> srcToTargetRowMap) {
102+
for (Row srcRow : srcToTargetRowMap.keySet()) {
103+
try {
104+
Row targetRow = srcToTargetRowMap.get(srcRow).toCompletableFuture().get().one();
105+
diff(srcRow, targetRow);
106+
} catch (Exception e) {
107+
logger.error("Could not perform diff for Key: " + getKey(srcRow), e);
108+
}
109+
}
110+
srcToTargetRowMap.clear();
95111
}
96112

97113
public void printCounts(String finalStr) {
@@ -150,15 +166,13 @@ private void diff(Row sourceRow, Row astraRow) {
150166
private String isDifferent(Row sourceRow, Row astraRow) {
151167
StringBuffer diffData = new StringBuffer();
152168
IntStream.range(0, selectColTypes.size()).parallel().forEach(index -> {
153-
if (!writeTimeStampCols.contains(index)) {
154-
MigrateDataType dataType = selectColTypes.get(index);
155-
Object source = getData(dataType, index, sourceRow);
156-
Object astra = getData(dataType, index, astraRow);
157-
158-
boolean isDiff = dataType.diff(source, astra);
159-
if (isDiff) {
160-
diffData.append(" (Index: " + index + " Source: " + source + " Astra: " + astra + " ) ");
161-
}
169+
MigrateDataType dataType = selectColTypes.get(index);
170+
Object source = getData(dataType, index, sourceRow);
171+
Object astra = getData(dataType, index, astraRow);
172+
173+
boolean isDiff = dataType.diff(source, astra);
174+
if (isDiff) {
175+
diffData.append(" (Index: " + index + " Source: " + source + " Astra: " + astra + " ) ");
162176
}
163177
});
164178

src/resources/sparkConf.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ spark.destination.read.consistency.level LOCAL_QUORUM
1313
spark.destination.keyspaceTable test.a2
1414
spark.destination.autocorrect.missing false
1515
spark.destination.autocorrect.mismatch false
16-
spark.destination.custom.writeTime 0
16+
spark.destination.custom.writeTime 0
1717

1818
spark.maxRetries 10
1919
spark.readRateLimit 20000

0 commit comments

Comments (0)