Skip to content

Commit c1c75cc

Browse files
Merge branch 'main' into feature/custom_writetime

# Conflicts:
#	src/main/java/datastax/astra/migrate/AbstractJobSession.java

2 parents caaadaf + befca05 — commit c1c75cc

15 files changed: 150 additions (+) and 176 deletions (−)

.idea/libraries/Maven__log4j_apache_log4j_extras_1_2_17.xml

Lines changed: 0 additions & 13 deletions
This file was deleted.

.idea/libraries/Maven__log4j_log4j_1_2_17.xml

Lines changed: 0 additions & 13 deletions
This file was deleted.

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
Spark jobs in this repo can be used for data migration and data validation.
44

5-
> :warning: Please note this job has been tested with spark version [2.4.8](https://downloads.apache.org/spark/spark-2.4.8/)
5+
> :warning: Please note this job has been tested with spark version [2.4.8](https://archive.apache.org/dist/spark/spark-2.4.8/)
66
77
## Prerequisite
88

cassandra-data-migrator.iml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,6 @@
8383
<orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.16" level="project" />
8484
<orderEntry type="library" name="Maven: org.slf4j:jul-to-slf4j:1.7.16" level="project" />
8585
<orderEntry type="library" name="Maven: org.slf4j:jcl-over-slf4j:1.7.16" level="project" />
86-
<orderEntry type="library" name="Maven: log4j:log4j:1.2.17" level="project" />
8786
<orderEntry type="library" name="Maven: org.slf4j:slf4j-log4j12:1.7.16" level="project" />
8887
<orderEntry type="library" name="Maven: com.ning:compress-lzf:1.0.3" level="project" />
8988
<orderEntry type="library" name="Maven: org.xerial.snappy:snappy-java:1.1.8.2" level="project" />
@@ -160,7 +159,6 @@
160159
<orderEntry type="library" name="Maven: org.spark-project.hive:hive-exec:1.2.1.spark2" level="project" />
161160
<orderEntry type="library" name="Maven: commons-io:commons-io:2.4" level="project" />
162161
<orderEntry type="library" name="Maven: javolution:javolution:5.5.1" level="project" />
163-
<orderEntry type="library" name="Maven: log4j:apache-log4j-extras:1.2.17" level="project" />
164162
<orderEntry type="library" name="Maven: org.antlr:ST4:4.0.4" level="project" />
165163
<orderEntry type="library" name="Maven: com.googlecode.javaewah:JavaEWAH:0.3.2" level="project" />
166164
<orderEntry type="library" name="Maven: org.iq80.snappy:snappy:0.2" level="project" />
@@ -202,6 +200,9 @@
202200
<orderEntry type="library" name="Maven: com.github.spotbugs:spotbugs-annotations:3.1.12" level="project" />
203201
<orderEntry type="library" name="Maven: com.datastax.oss:java-driver-mapper-runtime:4.10.0" level="project" />
204202
<orderEntry type="library" name="Maven: com.datastax.oss:java-driver-query-builder:4.10.0" level="project" />
203+
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.19.0" level="project" />
204+
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-core:2.19.0" level="project" />
205+
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-to-slf4j:2.19.0" level="project" />
205206
<orderEntry type="library" scope="TEST" name="Maven: org.scalatest:scalatest_2.11:3.2.12" level="project" />
206207
<orderEntry type="library" scope="TEST" name="Maven: org.scalatest:scalatest-core_2.11:3.2.12" level="project" />
207208
<orderEntry type="library" scope="TEST" name="Maven: org.scalatest:scalatest-compatible:3.2.12" level="project" />

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
<groupId>datastax.astra.migrate</groupId>
55
<artifactId>cassandra-data-migrator</artifactId>
6-
<version>1.3</version>
6+
<version>1.6</version>
77
<packaging>jar</packaging>
88

99
<properties>

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 79 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
public class AbstractJobSession extends BaseJobSession {
1818

19+
public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
1920

2021
protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) {
2122
this.sourceSession = sourceSession;
@@ -36,25 +37,26 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
3637

3738
isPreserveTTLWritetime = Boolean.parseBoolean(sparkConf.get("spark.preserveTTLWriteTime", "false"));
3839
if (isPreserveTTLWritetime) {
39-
String ttlColsStr = sparkConf.get("spark.source.ttl.cols");
40+
String ttlColsStr = sparkConf.get("spark.preserveTTLWriteTime.ttl.cols");
4041
if (null != ttlColsStr && ttlColsStr.trim().length() > 0) {
4142
for (String ttlCol : ttlColsStr.split(",")) {
4243
ttlCols.add(Integer.parseInt(ttlCol));
4344
}
4445
}
46+
47+
String writeTimestampColsStr = sparkConf.get("spark.preserveTTLWriteTime.writetime.cols");
48+
if (null != writeTimestampColsStr && writeTimestampColsStr.trim().length() > 0) {
49+
for (String writeTimeStampCol : writeTimestampColsStr.split(",")) {
50+
writeTimeStampCols.add(Integer.parseInt(writeTimeStampCol));
51+
}
52+
}
4553
}
4654

4755
writeTimeStampFilter = Boolean
4856
.parseBoolean(sparkConf.get("spark.source.writeTimeStampFilter", "false"));
4957
// batchsize set to 1 if there is a writeFilter
5058
if (writeTimeStampFilter) {
5159
batchSize = 1;
52-
String writeTimestampColsStr = sparkConf.get("spark.source.writeTimeStampFilter.cols");
53-
if (null != writeTimestampColsStr && writeTimestampColsStr.trim().length() > 0) {
54-
for (String writeTimeStampCol : writeTimestampColsStr.split(",")) {
55-
writeTimeStampCols.add(Integer.parseInt(writeTimeStampCol));
56-
}
57-
}
5860
}
5961

6062
String minWriteTimeStampFilterStr =
@@ -74,45 +76,82 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
7476
customWritetime = Long.parseLong(customWriteTimeStr);
7577
}
7678

77-
logger.info(" DEFAULT -- Write Batch Size: " + batchSize);
78-
logger.info(" DEFAULT -- Source Keyspace Table: " + sourceKeyspaceTable);
79-
logger.info(" DEFAULT -- Destination Keyspace Table: " + astraKeyspaceTable);
80-
logger.info(" DEFAULT -- ReadRateLimit: " + readLimiter.getRate());
81-
logger.info(" DEFAULT -- WriteRateLimit: " + writeLimiter.getRate());
82-
logger.info(" DEFAULT -- WriteTimestampFilter: " + writeTimeStampFilter);
83-
logger.info(" DEFAULT -- WriteTimestampFilterCols: " + writeTimeStampCols);
84-
logger.info(" DEFAULT -- isPreserveTTLWritetime: " + isPreserveTTLWritetime);
85-
logger.info(" DEFAULT -- TTLCols: " + ttlCols);
79+
logger.info("PARAM -- Write Batch Size: " + batchSize);
80+
logger.info("PARAM -- Source Keyspace Table: " + sourceKeyspaceTable);
81+
logger.info("PARAM -- Destination Keyspace Table: " + astraKeyspaceTable);
82+
logger.info("PARAM -- ReadRateLimit: " + readLimiter.getRate());
83+
logger.info("PARAM -- WriteRateLimit: " + writeLimiter.getRate());
84+
logger.info("PARAM -- WriteTimestampFilter: " + writeTimeStampFilter);
85+
logger.info("PARAM -- WriteTimestampFilterCols: " + writeTimeStampCols);
86+
logger.info("PARAM -- isPreserveTTLWritetime: " + isPreserveTTLWritetime);
87+
logger.info("PARAM -- isPreserveTTLWritetime: " + isPreserveTTLWritetime);
88+
logger.info("PARAM -- TTLCols: " + ttlCols);
89+
90+
String selectCols = sparkConf.get("spark.query.source");
91+
String partionKey = sparkConf.get("spark.query.source.partitionKey");
92+
String sourceSelectCondition = sparkConf.get("spark.query.condition", "");
93+
94+
final StringBuilder selectTTLWriteTimeCols = new StringBuilder();
95+
if (isPreserveTTLWritetime) {
96+
String[] allCols = selectCols.split(",");
97+
ttlCols.forEach(col -> {
98+
selectTTLWriteTimeCols.append(",ttl(" + allCols[col] + ")");
99+
});
100+
writeTimeStampCols.forEach(col -> {
101+
selectTTLWriteTimeCols.append(",writetime(" + allCols[col] + ")");
102+
});
103+
}
104+
String fullSelectQuery = "select " + selectCols + selectTTLWriteTimeCols.toString() + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
105+
+ ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING";
106+
sourceSelectStatement = sourceSession.prepare(fullSelectQuery);
107+
logger.info("PARAM -- Query used: " + fullSelectQuery);
108+
109+
selectColTypes = getTypes(sparkConf.get("spark.query.types"));
110+
String idCols = sparkConf.get("spark.query.destination.id", "");
111+
idColTypes = selectColTypes.subList(0, idCols.split(",").length);
112+
113+
String insertCols = sparkConf.get("spark.query.destination", "");
114+
if (null == insertCols || insertCols.trim().isEmpty()) {
115+
insertCols = selectCols;
116+
}
117+
String insertBinds = "";
118+
for (String str : idCols.split(",")) {
119+
if (insertBinds.isEmpty()) {
120+
insertBinds = str + "= ?";
121+
} else {
122+
insertBinds += " and " + str + "= ?";
123+
}
124+
}
125+
astraSelectStatement = astraSession.prepare(
126+
"select " + insertCols + " from " + astraKeyspaceTable
127+
+ " where " + insertBinds);
86128

87129
hasRandomPartitioner = Boolean.parseBoolean(sparkConf.get("spark.source.hasRandomPartitioner", "false"));
88-
89130
isCounterTable = Boolean.parseBoolean(sparkConf.get("spark.counterTable", "false"));
90-
selectColTypes = getTypes(sparkConf.get("spark.diff.select.types"));
91-
String partionKey = sparkConf.get("spark.query.cols.partitionKey");
92-
String idCols = sparkConf.get("spark.query.cols.id");
93-
idColTypes = getTypes(sparkConf.get("spark.query.cols.id.types"));
131+
if (isCounterTable) {
132+
String updateSelectMappingStr = sparkConf.get("spark.counterTable.cql.index", "0");
133+
for (String updateSelectIndex : updateSelectMappingStr.split(",")) {
134+
updateSelectMapping.add(Integer.parseInt(updateSelectIndex));
135+
}
94136

95-
String selectCols = sparkConf.get("spark.query.cols.select");
137+
String counterTableUpdate = sparkConf.get("spark.counterTable.cql");
138+
astraInsertStatement = astraSession.prepare(counterTableUpdate);
139+
} else {
140+
insertBinds = "";
141+
for (String str : insertCols.split(",")) {
142+
if (insertBinds.isEmpty()) {
143+
insertBinds += "?";
144+
} else {
145+
insertBinds += ", ?";
146+
}
147+
}
96148

97-
String idBinds = "";
98-
int count = 1;
99-
for (String str : idCols.split(",")) {
100-
if (count > 1) {
101-
idBinds = idBinds + " and " + str + "= ?";
149+
if (isPreserveTTLWritetime) {
150+
astraInsertStatement = astraSession.prepare("insert into " + astraKeyspaceTable + " (" + insertCols + ") VALUES (" + insertBinds + ") using TTL ? and TIMESTAMP ?");
102151
} else {
103-
idBinds = str + "= ?";
152+
astraInsertStatement = astraSession.prepare("insert into " + astraKeyspaceTable + " (" + insertCols + ") VALUES (" + insertBinds + ")");
104153
}
105-
count++;
106154
}
107-
108-
sourceSelectCondition = sparkConf.get("spark.query.cols.select.condition", "");
109-
sourceSelectStatement = sourceSession.prepare(
110-
"select " + selectCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
111-
+ ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING");
112-
113-
astraSelectStatement = astraSession.prepare(
114-
"select " + selectCols + " from " + astraKeyspaceTable
115-
+ " where " + idBinds);
116155
}
117156

118157
public List<MigrateDataType> getTypes(String types) {
@@ -127,15 +166,15 @@ public List<MigrateDataType> getTypes(String types) {
127166
public int getLargestTTL(Row sourceRow) {
128167
int ttl = 0;
129168
for (Integer ttlCol : ttlCols) {
130-
ttl = Math.max(ttl, sourceRow.getInt(ttlCol));
169+
ttl = Math.max(ttl, sourceRow.getInt(selectColTypes.size() + ttlCol - 1));
131170
}
132171
return ttl;
133172
}
134173

135174
public long getLargestWriteTimeStamp(Row sourceRow) {
136175
long writeTimestamp = 0;
137176
for (Integer writeTimeStampCol : writeTimeStampCols) {
138-
writeTimestamp = Math.max(writeTimestamp, sourceRow.getLong(writeTimeStampCol));
177+
writeTimestamp = Math.max(writeTimestamp, sourceRow.getLong(selectColTypes.size() + ttlCols.size() + writeTimeStampCol - 1));
139178
}
140179
return writeTimestamp;
141180
}

src/main/java/datastax/astra/migrate/BaseJobSession.java

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,15 @@
33
import com.datastax.oss.driver.api.core.CqlSession;
44
import com.datastax.oss.driver.api.core.cql.PreparedStatement;
55
import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter;
6-
import org.slf4j.Logger;
7-
import org.slf4j.LoggerFactory;
86

97
import java.util.ArrayList;
108
import java.util.List;
119

12-
public abstract class BaseJobSession {
13-
14-
public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
10+
public abstract class BaseJobSession {
1511

1612
protected PreparedStatement sourceSelectStatement;
17-
protected String sourceSelectCondition;
18-
1913
protected PreparedStatement astraSelectStatement;
14+
protected PreparedStatement astraInsertStatement;
2015

2116
// Read/Write Rate limiter
2217
// Determine the total throughput for the entire cluster in terms of wries/sec,
@@ -32,6 +27,7 @@ public abstract class BaseJobSession {
3227
protected CqlSession astraSession;
3328
protected List<MigrateDataType> selectColTypes = new ArrayList<MigrateDataType>();
3429
protected List<MigrateDataType> idColTypes = new ArrayList<MigrateDataType>();
30+
protected List<Integer> updateSelectMapping = new ArrayList<Integer>();
3531

3632
protected Integer batchSize = 1;
3733
protected Integer printStatsAfter = 100000;

0 commit comments

Comments (0)