
Commit befca05

Merge pull request #12 from datastax/feature/ttl-writetime-fix

Feature/ttl writetime fix

2 parents 307c670 + 9667f17, commit befca05

4 files changed: 34 additions & 20 deletions

README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -2,7 +2,7 @@
 
 Spark jobs in this repo can be used for data migration and data validation.
 
-> :warning: Please note this job has been tested with spark version [2.4.8](https://downloads.apache.org/spark/spark-2.4.8/)
+> :warning: Please note this job has been tested with spark version [2.4.8](https://archive.apache.org/dist/spark/spark-2.4.8/)
 
 ## Prerequisite
 
```

pom.xml

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@
 
     <groupId>datastax.astra.migrate</groupId>
     <artifactId>cassandra-data-migrator</artifactId>
-    <version>1.5</version>
+    <version>1.6</version>
     <packaging>jar</packaging>
 
     <properties>
```

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 29 additions & 15 deletions

```diff
@@ -17,7 +17,7 @@
 public class AbstractJobSession extends BaseJobSession {
 
     public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
-
+
     protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
         this.sourceSession = sourceSession;
         this.astraSession = astraSession;
@@ -37,25 +37,26 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
 
         isPreserveTTLWritetime = Boolean.parseBoolean(sparkConf.get("spark.preserveTTLWriteTime", "false"));
         if (isPreserveTTLWritetime) {
-            String ttlColsStr = sparkConf.get("spark.source.ttl.cols");
+            String ttlColsStr = sparkConf.get("spark.preserveTTLWriteTime.ttl.cols");
             if (null != ttlColsStr && ttlColsStr.trim().length() > 0) {
                 for (String ttlCol : ttlColsStr.split(",")) {
                     ttlCols.add(Integer.parseInt(ttlCol));
                 }
             }
+
+            String writeTimestampColsStr = sparkConf.get("spark.preserveTTLWriteTime.writetime.cols");
+            if (null != writeTimestampColsStr && writeTimestampColsStr.trim().length() > 0) {
+                for (String writeTimeStampCol : writeTimestampColsStr.split(",")) {
+                    writeTimeStampCols.add(Integer.parseInt(writeTimeStampCol));
+                }
+            }
         }
 
         writeTimeStampFilter = Boolean
                 .parseBoolean(sparkConf.get("spark.source.writeTimeStampFilter", "false"));
         // batchsize set to 1 if there is a writeFilter
         if (writeTimeStampFilter) {
             batchSize = 1;
-            String writeTimestampColsStr = sparkConf.get("spark.source.writeTimeStampFilter.cols");
-            if (null != writeTimestampColsStr && writeTimestampColsStr.trim().length() > 0) {
-                for (String writeTimeStampCol : writeTimestampColsStr.split(",")) {
-                    writeTimeStampCols.add(Integer.parseInt(writeTimeStampCol));
-                }
-            }
         }
 
         String minWriteTimeStampFilterStr =
@@ -82,14 +83,27 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
 
         String selectCols = sparkConf.get("spark.query.source");
         String partionKey = sparkConf.get("spark.query.source.partitionKey");
+        String sourceSelectCondition = sparkConf.get("spark.query.condition", "");
+
+        final StringBuilder selectTTLWriteTimeCols = new StringBuilder();
+        if (isPreserveTTLWritetime) {
+            String[] allCols = selectCols.split(",");
+            ttlCols.forEach(col -> {
+                selectTTLWriteTimeCols.append(",ttl(" + allCols[col] + ")");
+            });
+            writeTimeStampCols.forEach(col -> {
+                selectTTLWriteTimeCols.append(",writetime(" + allCols[col] + ")");
+            });
+        }
+        String fullSelectQuery = "select " + selectCols + selectTTLWriteTimeCols.toString() + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
+                + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING";
+        sourceSelectStatement = sourceSession.prepare(fullSelectQuery);
+        logger.info("PARAM -- Query used: " + fullSelectQuery);
+
         selectColTypes = getTypes(sparkConf.get("spark.query.types"));
         String idCols = sparkConf.get("spark.query.destination.id", "");
         idColTypes = selectColTypes.subList(0, idCols.split(",").length);
-        String sourceSelectCondition = sparkConf.get("spark.query.condition", "");
-        sourceSelectStatement = sourceSession.prepare(
-                "select " + selectCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
-                        + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING");
-
+
         String insertCols = sparkConf.get("spark.query.destination", "");
         if (null == insertCols || insertCols.trim().isEmpty()) {
             insertCols = selectCols;
@@ -146,15 +160,15 @@ public List<MigrateDataType> getTypes(String types) {
     public int getLargestTTL(Row sourceRow) {
         int ttl = 0;
         for (Integer ttlCol : ttlCols) {
-            ttl = Math.max(ttl, sourceRow.getInt(ttlCol));
+            ttl = Math.max(ttl, sourceRow.getInt(selectColTypes.size() + ttlCol - 1));
         }
         return ttl;
     }
 
     public long getLargestWriteTimeStamp(Row sourceRow) {
         long writeTimestamp = 0;
         for (Integer writeTimeStampCol : writeTimeStampCols) {
-            writeTimestamp = Math.max(writeTimestamp, sourceRow.getLong(writeTimeStampCol));
+            writeTimestamp = Math.max(writeTimestamp, sourceRow.getLong(selectColTypes.size() + ttlCols.size() + writeTimeStampCol - 1));
         }
         return writeTimestamp;
     }
```
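With the query now assembled inside `AbstractJobSession`, the `ttl(...)` and `writetime(...)` terms are appended after the `spark.query.source` columns (all TTL terms first, then all writetime terms), which is what the new offsets in `getLargestTTL` and `getLargestWriteTimeStamp` index into. A standalone sketch of that layout, using the example values from `sparkConf.properties` (the class name is hypothetical, and `n` stands in for `selectColTypes.size()` on the assumption that there is one type entry per selected column):

```java
import java.util.Arrays;
import java.util.List;

public class TtlWritetimeLayout {
    public static void main(String[] args) {
        // Mirrors spark.query.source and the new config keys in sparkConf.properties
        List<String> selectCols = Arrays.asList("partition-key", "clustering-key", "order-date", "amount");
        List<Integer> ttlCols = Arrays.asList(2, 3);        // spark.preserveTTLWriteTime.ttl.cols
        List<Integer> writeTimeCols = Arrays.asList(2, 3);  // spark.preserveTTLWriteTime.writetime.cols

        // The job appends ttl(...) first, then writetime(...), to the select list
        StringBuilder extra = new StringBuilder();
        ttlCols.forEach(c -> extra.append(",ttl(").append(selectCols.get(c)).append(")"));
        writeTimeCols.forEach(c -> extra.append(",writetime(").append(selectCols.get(c)).append(")"));
        System.out.println("select " + String.join(",", selectCols) + extra + " from ...");

        // Row offsets exactly as computed in getLargestTTL / getLargestWriteTimeStamp;
        // n is an assumed stand-in for selectColTypes.size()
        int n = selectCols.size();
        for (int ttlCol : ttlCols) {
            System.out.println("ttl(" + selectCols.get(ttlCol) + ") read at row index " + (n + ttlCol - 1));
        }
        for (int wtCol : writeTimeCols) {
            System.out.println("writetime(" + selectCols.get(wtCol) + ") read at row index " + (n + ttlCols.size() + wtCol - 1));
        }
    }
}
```

Running it prints the generated select list and the row index each configured column is read from, matching the offsets the commit uses.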

src/resources/sparkConf.properties

Lines changed: 3 additions & 3 deletions

```diff
@@ -22,7 +22,7 @@ spark.batchSize 5
 spark.coveragePercent 100
 spark.printStatsAfter 100000
 
-spark.query.source partition-key,clustering-key,order-date,amount,writetime(order-date),writetime(amount),ttl(order-date),ttl(amount)
+spark.query.source partition-key,clustering-key,order-date,amount
 spark.query.source.partitionKey partition-key
 spark.query.destination partition-key,clustering-key,order-date,amount
 spark.query.destination.id partition-key,clustering-key
@@ -33,10 +33,10 @@ spark.counterTable.cql
 spark.counterTable.cql.index 0
 
 spark.preserveTTLWriteTime true
-spark.source.ttl.cols 6,7
+spark.preserveTTLWriteTime.ttl.cols 2,3
+spark.preserveTTLWriteTime.writetime.cols 2,3
 
 spark.source.writeTimeStampFilter false
-spark.source.writeTimeStampFilter.cols 4,5
 spark.source.minWriteTimeStampFilter 0
 spark.source.maxWriteTimeStampFilter 9223372036854775807
 
```
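Since `AbstractJobSession` uses these values directly as offsets into the split `spark.query.source` list (`allCols[col]`), `2,3` here refer to `order-date` and `amount`, counting from zero, rather than to positions of hand-written `ttl(...)`/`writetime(...)` columns as in the old `spark.source.ttl.cols 6,7`. A hypothetical variant that preserves TTL and writetime only for `amount` would look like:

```properties
# Hypothetical example: preserve TTL/writetime for 'amount' only,
# which sits at zero-based offset 3 of spark.query.source above
spark.preserveTTLWriteTime true
spark.preserveTTLWriteTime.ttl.cols 3
spark.preserveTTLWriteTime.writetime.cols 3
```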
