
Commit 0ce40e3

feat!: remove partitionFile and use input output
1 parent a387b33 commit 0ce40e3

6 files changed: +45, -26 lines

README.md

Lines changed: 23 additions & 3 deletions
@@ -96,14 +96,34 @@ Each line above represents a partition-range (`min,max`). Alternatively, you can
 ```
 ./spark-submit --properties-file cdm.properties \
 --conf spark.cdm.schema.origin.keyspaceTable="<keyspacename>.<tablename>" \
---conf spark.cdm.tokenrange.partitionFile="/<path-to-file>/<csv-input-filename>" \
+--conf spark.cdm.tokenrange.partitionFile.input="/<path-to-file>/<csv-input-filename>" \
 --master "local[*]" --driver-memory 25G --executor-memory 25G \
 --class com.datastax.cdm.job.<Migrate|DiffData> cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
 ```
 This mode is specifically useful to process a subset of partition-ranges that may have failed during a previous run.
 
-> **Note:**
-> A file named `./<keyspacename>.<tablename>_partitions.csv` is auto generated by the Migration & Validation jobs in the above format containing any failed partition ranges. No file is created if there are no failed partitions. You can use this file as an input to process any failed partition in a following run.
+A file named `./<keyspacename>.<tablename>_partitions.csv` is auto-generated by the Migration & Validation jobs in the above format, containing any failed partition ranges. No file is created if there are no failed partitions. This file can be used as an input to process any failed partition ranges in a subsequent run. You can also specify a different output file using the `spark.cdm.tokenrange.partitionFile.output` option.
+```
+./spark-submit --properties-file cdm.properties \
+--conf spark.cdm.schema.origin.keyspaceTable="<keyspacename>.<tablename>" \
+--conf spark.cdm.tokenrange.partitionFile.input="/<path-to-file>/<csv-input-filename>" \
+--conf spark.cdm.tokenrange.partitionFile.output="/<path-to-file>/<csv-output-filename>" \
+--master "local[*]" --driver-memory 25G --executor-memory 25G \
+--class com.datastax.cdm.job.<Migrate|DiffData> cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
+```
+
+For the Data-Validation step, use the conf option `--conf spark.cdm.tokenrange.partitionFile.appendOnDiff` as shown below. This causes a partition range to be written to the output file whenever differences are found, not just on failures.
+```
+./spark-submit --properties-file cdm.properties \
+--conf spark.cdm.schema.origin.keyspaceTable="<keyspacename>.<tablename>" \
+--conf spark.cdm.tokenrange.partitionFile.input="/<path-to-file>/<csv-input-filename>" \
+--conf spark.cdm.tokenrange.partitionFile.output="/<path-to-file>/<csv-output-filename>" \
+--conf spark.cdm.tokenrange.partitionFile.appendOnDiff=true \
+--master "local[*]" --driver-memory 25G --executor-memory 25G \
+--class com.datastax.cdm.job.<Migrate|DiffData> cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
+```
+
+If `spark.cdm.tokenrange.partitionFile.input` or `spark.cdm.tokenrange.partitionFile.output` is not specified, the system will use `./<keyspacename>.<tablename>_partitions.csv` as the default file.
 
 # Perform large-field Guardrail violation checks
 - The tool can be used to identify large fields from a table that may break your cluster guardrails (e.g. AstraDB has a 10MB limit for a single large field) `--class com.datastax.cdm.job.GuardrailCheck` as shown below
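For context, here is a minimal self-contained sketch (illustrative, not from this commit) of consuming the `min,max` partition-range CSV format that the README describes above. The file path and class name are hypothetical; CDM's actual parsing lives in `SplitPartitions`.

```java
// Illustrative sketch only: reads a partitions CSV where each line is "min,max",
// as described in the README section above. Path and class name are hypothetical.
import java.io.IOException;
import java.math.BigInteger;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

public class PartitionRangeFileSketch {
    public static void main(String[] args) throws IOException {
        // In CDM terms this would be spark.cdm.tokenrange.partitionFile.input,
        // defaulting to ./<keyspacename>.<tablename>_partitions.csv
        List<String> lines = Files.readAllLines(Paths.get("./my_keyspace.my_table_partitions.csv"));
        for (String line : lines) {
            if (line.isBlank()) {
                continue; // ignore empty lines
            }
            String[] minMax = line.split(",", 2); // each line is one partition range: min,max
            BigInteger min = new BigInteger(minMax[0].trim());
            BigInteger max = new BigInteger(minMax[1].trim());
            System.out.println("partition range " + min + " .. " + max);
        }
    }
}
```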

SIT/features/06_partition_range/migrate_with_partitionfile.properties

Lines changed: 3 additions & 1 deletion
@@ -18,5 +18,7 @@ spark.cdm.perfops.numParts 1
 spark.cdm.autocorrect.missing true
 spark.cdm.autocorrect.mismatch true
 
-spark.cdm.tokenrange.partitionFile ./partitions.csv
+spark.cdm.tokenrange.partitionFile.input ./partitions_input.csv
+spark.cdm.tokenrange.partitionFile.output ./partitions_output.csv
+spark.cdm.tokenrange.partitionFile.appendOnDiff true
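The renamed keys above are plain Spark properties. As a rough, self-contained sketch (not CDM code), they could be read back with `java.util.Properties`, which also accepts the whitespace-separated `key value` form used in this file:

```java
// Rough sketch, not CDM code: load the SIT properties file above and read the three
// renamed keys. java.util.Properties accepts "key value" (whitespace-separated) entries.
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;

public class PartitionFilePropsSketch {
    public static void main(String[] args) throws IOException {
        Properties props = new Properties();
        try (FileInputStream in = new FileInputStream("migrate_with_partitionfile.properties")) {
            props.load(in);
        }
        String input = props.getProperty("spark.cdm.tokenrange.partitionFile.input");   // ./partitions_input.csv
        String output = props.getProperty("spark.cdm.tokenrange.partitionFile.output"); // ./partitions_output.csv
        boolean appendOnDiff = Boolean.parseBoolean(
                props.getProperty("spark.cdm.tokenrange.partitionFile.appendOnDiff", "false")); // true here
        System.out.println(input + " " + output + " " + appendOnDiff);
    }
}
```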

src/main/java/com/datastax/cdm/job/SplitPartitions.java

Lines changed: 0 additions & 8 deletions
@@ -177,10 +177,6 @@ public static String getPartitionFileInput(PropertyHelper propertyHelper) {
         return propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE_INPUT);
     }
 
-    if (!StringUtils.isAllBlank(propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE))) {
-        return propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE);
-    }
-
     return "./" + propertyHelper.getString(KnownProperties.ORIGIN_KEYSPACE_TABLE) + "_partitions.csv";
 }
 
@@ -189,10 +185,6 @@ public static String getPartitionFileOutput(PropertyHelper propertyHelper) {
         return propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE_OUTPUT);
     }
 
-    if (!StringUtils.isAllBlank(propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE))) {
-        return propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE);
-    }
-
     return "./" + propertyHelper.getString(KnownProperties.ORIGIN_KEYSPACE_TABLE) + "_partitions.csv";
 }
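The removal above means the legacy `spark.cdm.tokenrange.partitionFile` value is no longer consulted. A self-contained sketch of the resulting resolution order (a plain `Map` stands in for CDM's `PropertyHelper`; class and method names here are illustrative):

```java
// Illustrative sketch of the post-commit resolution order: the scoped .input/.output
// property wins when set; otherwise the default ./<keyspace.table>_partitions.csv is used.
// A Map stands in for CDM's PropertyHelper; class and method names are hypothetical.
import java.util.Map;

public class PartitionFileResolutionSketch {
    static String resolve(Map<String, String> props, String key, String keyspaceTable) {
        String value = props.get(key);
        if (value != null && !value.isBlank()) {
            return value; // explicit spark.cdm.tokenrange.partitionFile.input / .output wins
        }
        return "./" + keyspaceTable + "_partitions.csv"; // shared default for input and output
    }

    public static void main(String[] args) {
        Map<String, String> props = Map.of("spark.cdm.tokenrange.partitionFile.input", "./partitions_input.csv");
        System.out.println(resolve(props, "spark.cdm.tokenrange.partitionFile.input", "ks.tbl"));  // ./partitions_input.csv
        System.out.println(resolve(props, "spark.cdm.tokenrange.partitionFile.output", "ks.tbl")); // ./ks.tbl_partitions.csv
    }
}
```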

src/main/java/com/datastax/cdm/properties/KnownProperties.java

Lines changed: 0 additions & 2 deletions
@@ -154,12 +154,10 @@ public enum PropertyType {
     //==========================================================================
     // Partition File
     //==========================================================================
-    public static final String TOKEN_RANGE_PARTITION_FILE = "spark.cdm.tokenrange.partitionFile";
     public static final String TOKEN_RANGE_PARTITION_FILE_APPEND_ON_DIFF = "spark.cdm.tokenrange.partitionFile.appendOnDiff";
     public static final String TOKEN_RANGE_PARTITION_FILE_INPUT = "spark.cdm.tokenrange.partitionFile.input";
     public static final String TOKEN_RANGE_PARTITION_FILE_OUTPUT = "spark.cdm.tokenrange.partitionFile.output";
     static {
-        types.put(TOKEN_RANGE_PARTITION_FILE, PropertyType.STRING);
         types.put(TOKEN_RANGE_PARTITION_FILE_APPEND_ON_DIFF, PropertyType.BOOLEAN);
         types.put(TOKEN_RANGE_PARTITION_FILE_INPUT, PropertyType.STRING);
         types.put(TOKEN_RANGE_PARTITION_FILE_OUTPUT, PropertyType.STRING);
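For reference, a compact self-contained sketch of the key-to-type registry pattern visible in this hunk, with only the keys that remain after the commit. The constants and `types.put` calls mirror the diff; the wrapper class is invented for illustration.

```java
// Illustrative standalone sketch of the registry shown above, after the legacy
// spark.cdm.tokenrange.partitionFile key was dropped. Only the wrapper class is invented.
import java.util.HashMap;
import java.util.Map;

public class PartitionFileKeysSketch {
    enum PropertyType { STRING, BOOLEAN }

    public static final String TOKEN_RANGE_PARTITION_FILE_APPEND_ON_DIFF = "spark.cdm.tokenrange.partitionFile.appendOnDiff";
    public static final String TOKEN_RANGE_PARTITION_FILE_INPUT = "spark.cdm.tokenrange.partitionFile.input";
    public static final String TOKEN_RANGE_PARTITION_FILE_OUTPUT = "spark.cdm.tokenrange.partitionFile.output";

    static final Map<String, PropertyType> types = new HashMap<>();
    static {
        types.put(TOKEN_RANGE_PARTITION_FILE_APPEND_ON_DIFF, PropertyType.BOOLEAN);
        types.put(TOKEN_RANGE_PARTITION_FILE_INPUT, PropertyType.STRING);
        types.put(TOKEN_RANGE_PARTITION_FILE_OUTPUT, PropertyType.STRING);
    }
}
```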

src/resources/cdm-detailed.properties

Lines changed: 19 additions & 6 deletions
@@ -151,17 +151,30 @@ spark.cdm.schema.origin.keyspaceTable keyspace_name.table_name
 #       5323. The corresponding counter in Origin is also 5323. At some point, the Target
 #       counter gets DELETEd. Should the .missing record be re-inserted before
 #       the DELETE gets tombstoned, the counter will zombie back to life, and the
-#       counter will become 5323+5323 = 10646.
+#       counter will become 5323+5323 = 10646.
+
+# spark.cdm.tokenrange
+#   .partitionFile
+#     .input        : Default is "./<keyspace>.<tablename>_partitions.csv". Note, this file is used as
+#                     input when applicable. If the file exists, only the partition ranges
+#                     in this file will be Migrated or Validated.
+# spark.cdm.tokenrange
+#   .partitionFile
+#     .output       : Default is "./<keyspace>.<tablename>_partitions.csv". Note, this file is used as
+#                     output when applicable. If exceptions occur during Migrating or Validation,
+#                     or if `spark.cdm.tokenrange.partitionFile.appendOnDiff` is set to true,
+#                     partition ranges with exceptions will be logged to this file.
 # spark.cdm.tokenrange
-#   .partitionFile    : Default is "./<keyspace>.<tablename>_partitions.csv". Note, this file is used as
-#                       input as well as output when applicable. If the file exists, only the partition ranges
-#                       in this file will be Migrated or Validated. Similarly, if exceptions occur during
-#                       Migrating or Validation, partition ranges with exceptions will be logged to this file.
+#   .partitionFile
+#     .appendOnDiff : Default is false. If it is set to true, the partition range would be outputted
+#                     to `spark.cdm.tokenrange.partitionFile.output` if there are any differences.
 #-----------------------------------------------------------------------------------------------------------
 spark.cdm.autocorrect.missing                     false
 spark.cdm.autocorrect.mismatch                    false
 #spark.cdm.autocorrect.missing.counter            false
-#spark.cdm.tokenrange.partitionFile               /tokenrange/exception/path/keyspace.tablename_partitions.csv
+#spark.cdm.tokenrange.partitionFile.input         /tokenrange/path/input/keyspace.tablename_partitions.csv
+#spark.cdm.tokenrange.partitionFile.output        /tokenrange/path/output/keyspace.tablename_partitions.csv
+#spark.cdm.tokenrange.partitionFile.appendOnDiff  false
 
 #===========================================================================================================
 # Performance and Operations Parameters affecting throughput, error handling, and similar concerns.
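Putting the two property descriptions together, the output file receives a partition range either when an exception occurred or, when `appendOnDiff` is true, when a data difference was found. A self-contained sketch of that rule (names here are illustrative, not CDM's):

```java
// Illustrative sketch of when a partition range is written to
// spark.cdm.tokenrange.partitionFile.output, per the property descriptions above.
public class PartitionOutputRuleSketch {
    static boolean shouldRecord(boolean hadException, boolean hadDiff, boolean appendOnDiff) {
        // Exceptions are always recorded; differences only when appendOnDiff is enabled.
        return hadException || (appendOnDiff && hadDiff);
    }

    public static void main(String[] args) {
        System.out.println(shouldRecord(true, false, false));  // true: failed range is logged
        System.out.println(shouldRecord(false, true, false));  // false: diff alone is not logged by default
        System.out.println(shouldRecord(false, true, true));   // true: appendOnDiff=true also logs diffs
    }
}
```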

src/test/java/com/datastax/cdm/job/SplitPartitionsTest.java

Lines changed: 0 additions & 6 deletions
@@ -113,9 +113,6 @@ void getPartitionFileInput() {
         helper.setProperty("spark.cdm.schema.origin.keyspaceTable", "tb");
         assertEquals("./tb_partitions.csv", SplitPartitions.getPartitionFileInput(helper));
 
-        helper.setProperty("spark.cdm.tokenrange.partitionFile", "./file.csv");
-        assertEquals("./file.csv", SplitPartitions.getPartitionFileInput(helper));
-
         helper.setProperty("spark.cdm.tokenrange.partitionFile.input", "./file_input.csv");
         assertEquals("./file_input.csv", SplitPartitions.getPartitionFileInput(helper));
     }
@@ -126,9 +123,6 @@ void getPartitionFileOutput() {
         helper.setProperty("spark.cdm.schema.origin.keyspaceTable", "tb");
         assertEquals("./tb_partitions.csv", SplitPartitions.getPartitionFileOutput(helper));
 
-        helper.setProperty("spark.cdm.tokenrange.partitionFile", "./file.csv");
-        assertEquals("./file.csv", SplitPartitions.getPartitionFileOutput(helper));
-
         helper.setProperty("spark.cdm.tokenrange.partitionFile.output", "./file_output.csv");
         assertEquals("./file_output.csv", SplitPartitions.getPartitionFileOutput(helper));
     }

0 commit comments
