
Commit 0ce40e3

feat!: remove partitionFile and use input output
1 parent a387b33 commit 0ce40e3

6 files changed: +45, -26 lines

README.md

Lines changed: 23 additions & 3 deletions
@@ -96,14 +96,34 @@ Each line above represents a partition-range (`min,max`). Alternatively, you can
 ```
 ./spark-submit --properties-file cdm.properties \
 --conf spark.cdm.schema.origin.keyspaceTable="<keyspacename>.<tablename>" \
---conf spark.cdm.tokenrange.partitionFile="/<path-to-file>/<csv-input-filename>" \
+--conf spark.cdm.tokenrange.partitionFile.input="/<path-to-file>/<csv-input-filename>" \
 --master "local[*]" --driver-memory 25G --executor-memory 25G \
 --class com.datastax.cdm.job.<Migrate|DiffData> cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
 ```
 This mode is specifically useful to process a subset of partition-ranges that may have failed during a previous run.
 
-> **Note:**
-> A file named `./<keyspacename>.<tablename>_partitions.csv` is auto generated by the Migration & Validation jobs in the above format containing any failed partition ranges. No file is created if there are no failed partitions. You can use this file as an input to process any failed partition in a following run.
+A file named `./<keyspacename>.<tablename>_partitions.csv` is auto-generated by the Migration & Validation jobs in the above format, containing any failed partition ranges. No file is created if there are no failed partitions. This file can be used as an input to process any failed partition ranges in a subsequent run. You can also specify a different output file using the `spark.cdm.tokenrange.partitionFile.output` option.
+```
+./spark-submit --properties-file cdm.properties \
+--conf spark.cdm.schema.origin.keyspaceTable="<keyspacename>.<tablename>" \
+--conf spark.cdm.tokenrange.partitionFile.input="/<path-to-file>/<csv-input-filename>" \
+--conf spark.cdm.tokenrange.partitionFile.output="/<path-to-file>/<csv-output-filename>" \
+--master "local[*]" --driver-memory 25G --executor-memory 25G \
+--class com.datastax.cdm.job.<Migrate|DiffData> cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
+```
+
+For the Data-Validation step, use the conf option `--conf spark.cdm.tokenrange.partitionFile.appendOnDiff` as shown below. This causes a partition range to be written to the output file whenever differences are found, not just on failures.
+```
+./spark-submit --properties-file cdm.properties \
+--conf spark.cdm.schema.origin.keyspaceTable="<keyspacename>.<tablename>" \
+--conf spark.cdm.tokenrange.partitionFile.input="/<path-to-file>/<csv-input-filename>" \
+--conf spark.cdm.tokenrange.partitionFile.output="/<path-to-file>/<csv-output-filename>" \
+--conf spark.cdm.tokenrange.partitionFile.appendOnDiff=true \
+--master "local[*]" --driver-memory 25G --executor-memory 25G \
+--class com.datastax.cdm.job.<Migrate|DiffData> cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
+```
+
+If `spark.cdm.tokenrange.partitionFile.input` or `spark.cdm.tokenrange.partitionFile.output` is not specified, the system will use `./<keyspacename>.<tablename>_partitions.csv` as the default file.
 
 # Perform large-field Guardrail violation checks
 - The tool can be used to identify large fields from a table that may break your cluster guardrails (e.g. AstraDB has a 10MB limit for a single large field) `--class com.datastax.cdm.job.GuardrailCheck` as shown below
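For context, here is a minimal self-contained sketch (illustrative, not from this commit) of consuming the `min,max` partition-range CSV format that the README describes above. The file path and class name are hypothetical; CDM's actual parsing lives in `SplitPartitions`.

```java
// Illustrative sketch only: reads a partitions CSV where each line is "min,max",
// as described in the README section above. Path and class name are hypothetical.
import java.io.IOException;
import java.math.BigInteger;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

public class PartitionRangeFileSketch {
    public static void main(String[] args) throws IOException {
        // In CDM terms this would be spark.cdm.tokenrange.partitionFile.input,
        // defaulting to ./<keyspacename>.<tablename>_partitions.csv
        List<String> lines = Files.readAllLines(Paths.get("./my_keyspace.my_table_partitions.csv"));
        for (String line : lines) {
            if (line.isBlank()) {
                continue; // ignore empty lines
            }
            String[] minMax = line.split(",", 2); // each line is one partition range: min,max
            BigInteger min = new BigInteger(minMax[0].trim());
            BigInteger max = new BigInteger(minMax[1].trim());
            System.out.println("partition range " + min + " .. " + max);
        }
    }
}
```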

SIT/features/06_partition_range/migrate_with_partitionfile.properties

Lines changed: 3 additions & 1 deletion
@@ -18,5 +18,7 @@ spark.cdm.perfops.numParts 1
 spark.cdm.autocorrect.missing true
 spark.cdm.autocorrect.mismatch true
 
-spark.cdm.tokenrange.partitionFile ./partitions.csv
+spark.cdm.tokenrange.partitionFile.input ./partitions_input.csv
+spark.cdm.tokenrange.partitionFile.output ./partitions_output.csv
+spark.cdm.tokenrange.partitionFile.appendOnDiff true
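The renamed keys above are plain Spark properties. As a rough, self-contained sketch (not CDM code), they could be read back with `java.util.Properties`, which also accepts the whitespace-separated `key value` form used in this file:

```java
// Rough sketch, not CDM code: load the SIT properties file above and read the three
// renamed keys. java.util.Properties accepts "key value" (whitespace-separated) entries.
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;

public class PartitionFilePropsSketch {
    public static void main(String[] args) throws IOException {
        Properties props = new Properties();
        try (FileInputStream in = new FileInputStream("migrate_with_partitionfile.properties")) {
            props.load(in);
        }
        String input = props.getProperty("spark.cdm.tokenrange.partitionFile.input");   // ./partitions_input.csv
        String output = props.getProperty("spark.cdm.tokenrange.partitionFile.output"); // ./partitions_output.csv
        boolean appendOnDiff = Boolean.parseBoolean(
                props.getProperty("spark.cdm.tokenrange.partitionFile.appendOnDiff", "false")); // true here
        System.out.println(input + " " + output + " " + appendOnDiff);
    }
}
```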

src/main/java/com/datastax/cdm/job/SplitPartitions.java

Lines changed: 0 additions & 8 deletions
@@ -177,10 +177,6 @@ public static String getPartitionFileInput(PropertyHelper propertyHelper) {
         return propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE_INPUT);
     }
 
-    if (!StringUtils.isAllBlank(propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE))) {
-        return propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE);
-    }
-
     return "./" + propertyHelper.getString(KnownProperties.ORIGIN_KEYSPACE_TABLE) + "_partitions.csv";
 }
 
@@ -189,10 +185,6 @@ public static String getPartitionFileOutput(PropertyHelper propertyHelper) {
         return propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE_OUTPUT);
     }
 
-    if (!StringUtils.isAllBlank(propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE))) {
-        return propertyHelper.getString(KnownProperties.TOKEN_RANGE_PARTITION_FILE);
-    }
-
     return "./" + propertyHelper.getString(KnownProperties.ORIGIN_KEYSPACE_TABLE) + "_partitions.csv";
 }
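The removal above means the legacy `spark.cdm.tokenrange.partitionFile` value is no longer consulted. A self-contained sketch of the resulting resolution order (a plain `Map` stands in for CDM's `PropertyHelper`; class and method names here are illustrative):

```java
// Illustrative sketch of the post-commit resolution order: the scoped .input/.output
// property wins when set; otherwise the default ./<keyspace.table>_partitions.csv is used.
// A Map stands in for CDM's PropertyHelper; class and method names are hypothetical.
import java.util.Map;

public class PartitionFileResolutionSketch {
    static String resolve(Map<String, String> props, String key, String keyspaceTable) {
        String value = props.get(key);
        if (value != null && !value.isBlank()) {
            return value; // explicit spark.cdm.tokenrange.partitionFile.input / .output wins
        }
        return "./" + keyspaceTable + "_partitions.csv"; // shared default for input and output
    }

    public static void main(String[] args) {
        Map<String, String> props = Map.of("spark.cdm.tokenrange.partitionFile.input", "./partitions_input.csv");
        System.out.println(resolve(props, "spark.cdm.tokenrange.partitionFile.input", "ks.tbl"));  // ./partitions_input.csv
        System.out.println(resolve(props, "spark.cdm.tokenrange.partitionFile.output", "ks.tbl")); // ./ks.tbl_partitions.csv
    }
}
```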

src/main/java/com/datastax/cdm/properties/KnownProperties.java

Lines changed: 0 additions & 2 deletions
@@ -154,12 +154,10 @@ public enum PropertyType {
     //==========================================================================
     // Partition File
     //==========================================================================
-    public static final String TOKEN_RANGE_PARTITION_FILE = "spark.cdm.tokenrange.partitionFile";
     public static final String TOKEN_RANGE_PARTITION_FILE_APPEND_ON_DIFF = "spark.cdm.tokenrange.partitionFile.appendOnDiff";
     public static final String TOKEN_RANGE_PARTITION_FILE_INPUT = "spark.cdm.tokenrange.partitionFile.input";
     public static final String TOKEN_RANGE_PARTITION_FILE_OUTPUT = "spark.cdm.tokenrange.partitionFile.output";
     static {
-        types.put(TOKEN_RANGE_PARTITION_FILE, PropertyType.STRING);
         types.put(TOKEN_RANGE_PARTITION_FILE_APPEND_ON_DIFF, PropertyType.BOOLEAN);
         types.put(TOKEN_RANGE_PARTITION_FILE_INPUT, PropertyType.STRING);
         types.put(TOKEN_RANGE_PARTITION_FILE_OUTPUT, PropertyType.STRING);
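For reference, a compact self-contained sketch of the key-to-type registry pattern visible in this hunk, with only the keys that remain after the commit. The constants and `types.put` calls mirror the diff; the wrapper class is invented for illustration.

```java
// Illustrative standalone sketch of the registry shown above, after the legacy
// spark.cdm.tokenrange.partitionFile key was dropped. Only the wrapper class is invented.
import java.util.HashMap;
import java.util.Map;

public class PartitionFileKeysSketch {
    enum PropertyType { STRING, BOOLEAN }

    public static final String TOKEN_RANGE_PARTITION_FILE_APPEND_ON_DIFF = "spark.cdm.tokenrange.partitionFile.appendOnDiff";
    public static final String TOKEN_RANGE_PARTITION_FILE_INPUT = "spark.cdm.tokenrange.partitionFile.input";
    public static final String TOKEN_RANGE_PARTITION_FILE_OUTPUT = "spark.cdm.tokenrange.partitionFile.output";

    static final Map<String, PropertyType> types = new HashMap<>();
    static {
        types.put(TOKEN_RANGE_PARTITION_FILE_APPEND_ON_DIFF, PropertyType.BOOLEAN);
        types.put(TOKEN_RANGE_PARTITION_FILE_INPUT, PropertyType.STRING);
        types.put(TOKEN_RANGE_PARTITION_FILE_OUTPUT, PropertyType.STRING);
    }
}
```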

src/resources/cdm-detailed.properties

Lines changed: 19 additions & 6 deletions
@@ -151,17 +151,30 @@ spark.cdm.schema.origin.keyspaceTable keyspace_name.table_name
 #       5323. The corresponding counter in Origin is also 5323. At some point, the Target
 #       counter gets DELETEd. Should the .missing record be re-inserted before
 #       the DELETE gets tombstoned, the counter will zombie back to life, and the
-#       counter will become 5323+5323 = 10646.
+#       counter will become 5323+5323 = 10646.
+
+# spark.cdm.tokenrange
+#   .partitionFile
+#     .input        : Default is "./<keyspace>.<tablename>_partitions.csv". Note, this file is used as
+#                     input when applicable. If the file exists, only the partition ranges
+#                     in this file will be Migrated or Validated.
+# spark.cdm.tokenrange
+#   .partitionFile
+#     .output       : Default is "./<keyspace>.<tablename>_partitions.csv". Note, this file is used as
+#                     output when applicable. If exceptions occur during Migrating or Validation,
+#                     or if `spark.cdm.tokenrange.partitionFile.appendOnDiff` is set to true,
+#                     partition ranges with exceptions will be logged to this file.
 # spark.cdm.tokenrange
-#   .partitionFile    : Default is "./<keyspace>.<tablename>_partitions.csv". Note, this file is used as
-#                       input as well as output when applicable. If the file exists, only the partition ranges
-#                       in this file will be Migrated or Validated. Similarly, if exceptions occur during
-#                       Migrating or Validation, partition ranges with exceptions will be logged to this file.
+#   .partitionFile
+#     .appendOnDiff : Default is false. If it is set to true, the partition range would be outputted
+#                     to `spark.cdm.tokenrange.partitionFile.output` if there are any differences.
 #-----------------------------------------------------------------------------------------------------------
 spark.cdm.autocorrect.missing                     false
 spark.cdm.autocorrect.mismatch                    false
 #spark.cdm.autocorrect.missing.counter            false
-#spark.cdm.tokenrange.partitionFile               /tokenrange/exception/path/keyspace.tablename_partitions.csv
+#spark.cdm.tokenrange.partitionFile.input         /tokenrange/path/input/keyspace.tablename_partitions.csv
+#spark.cdm.tokenrange.partitionFile.output        /tokenrange/path/output/keyspace.tablename_partitions.csv
+#spark.cdm.tokenrange.partitionFile.appendOnDiff  false
 
 #===========================================================================================================
 # Performance and Operations Parameters affecting throughput, error handling, and similar concerns.
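Putting the two property descriptions together, the output file receives a partition range either when an exception occurred or, when `appendOnDiff` is true, when a data difference was found. A self-contained sketch of that rule (names here are illustrative, not CDM's):

```java
// Illustrative sketch of when a partition range is written to
// spark.cdm.tokenrange.partitionFile.output, per the property descriptions above.
public class PartitionOutputRuleSketch {
    static boolean shouldRecord(boolean hadException, boolean hadDiff, boolean appendOnDiff) {
        // Exceptions are always recorded; differences only when appendOnDiff is enabled.
        return hadException || (appendOnDiff && hadDiff);
    }

    public static void main(String[] args) {
        System.out.println(shouldRecord(true, false, false));  // true: failed range is logged
        System.out.println(shouldRecord(false, true, false));  // false: diff alone is not logged by default
        System.out.println(shouldRecord(false, true, true));   // true: appendOnDiff=true also logs diffs
    }
}
```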

src/test/java/com/datastax/cdm/job/SplitPartitionsTest.java

Lines changed: 0 additions & 6 deletions
@@ -113,9 +113,6 @@ void getPartitionFileInput() {
         helper.setProperty("spark.cdm.schema.origin.keyspaceTable", "tb");
         assertEquals("./tb_partitions.csv", SplitPartitions.getPartitionFileInput(helper));
 
-        helper.setProperty("spark.cdm.tokenrange.partitionFile", "./file.csv");
-        assertEquals("./file.csv", SplitPartitions.getPartitionFileInput(helper));
-
         helper.setProperty("spark.cdm.tokenrange.partitionFile.input", "./file_input.csv");
         assertEquals("./file_input.csv", SplitPartitions.getPartitionFileInput(helper));
     }
@@ -126,9 +123,6 @@ void getPartitionFileOutput() {
         helper.setProperty("spark.cdm.schema.origin.keyspaceTable", "tb");
         assertEquals("./tb_partitions.csv", SplitPartitions.getPartitionFileOutput(helper));
 
-        helper.setProperty("spark.cdm.tokenrange.partitionFile", "./file.csv");
-        assertEquals("./file.csv", SplitPartitions.getPartitionFileOutput(helper));
-
         helper.setProperty("spark.cdm.tokenrange.partitionFile.output", "./file_output.csv");
         assertEquals("./file_output.csv", SplitPartitions.getPartitionFileOutput(helper));
     }

0 commit comments
