
Commit 98751ef (parent: 9d9c23d)

#795 Improve the syntax of generating raw record RDDs.
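In short: the raw record RDD is no longer produced straight from the builder; load(path) now returns a loader that exposes toRDD and the new getParsedCopybook. A before/after sketch distilled from the diffs below (copybook and inputPath are placeholders):

// Before this commit: the builder created the RDD directly from a path.
val rddBefore = SparkCobolProcessor.builder
  .withCopybookContents(copybook)
  .toRDD(inputPath)

// After this commit: load(...) returns a SparkCobolProcessorLoader,
// and the RDD (or the parsed copybook) is requested from it.
val rddAfter = SparkCobolProcessor.builder
  .withCopybookContents(copybook)
  .load(inputPath)
  .toRDD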

3 files changed: +42, -38 lines

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala

Lines changed: 4 additions & 4 deletions

@@ -222,7 +222,7 @@ object CobolParametersParser extends Logging {
     policy
   }
 
-  def parse(params: Parameters): CobolParameters = {
+  def parse(params: Parameters, validateRedundantOptions: Boolean = true): CobolParameters = {
     val schemaRetentionPolicy = getSchemaRetentionPolicy(params)
     val stringTrimmingPolicy = getStringTrimmingPolicy(params)
     val ebcdicCodePageName = params.getOrElse(PARAM_EBCDIC_CODE_PAGE, "common")
@@ -306,7 +306,7 @@
       MetadataPolicy(params.getOrElse(PARAM_METADATA, "basic")),
       params.getMap
     )
-    validateSparkCobolOptions(params, recordFormat)
+    validateSparkCobolOptions(params, recordFormat, validateRedundantOptions)
     cobolParameters
   }
 
@@ -753,7 +753,7 @@ object CobolParametersParser extends Logging {
    *
    * @param params Parameters provided by spark.read.option(...)
    */
-  private def validateSparkCobolOptions(params: Parameters, recordFormat: RecordFormat): Unit = {
+  private def validateSparkCobolOptions(params: Parameters, recordFormat: RecordFormat, validateRedundantOptions: Boolean): Unit = {
     val isRecordSequence = params.getOrElse(PARAM_IS_XCOM, "false").toBoolean ||
       params.getOrElse(PARAM_IS_RECORD_SEQUENCE, "false").toBoolean ||
       params.contains(PARAM_FILE_START_OFFSET) ||
@@ -946,7 +946,7 @@
       params.contains(PARAM_STRICT_INTEGRAL_PRECISION) && params(PARAM_STRICT_INTEGRAL_PRECISION).toBoolean)
       throw new IllegalArgumentException(s"Options '$PARAM_DISPLAY_PIC_ALWAYS_STRING' and '$PARAM_STRICT_INTEGRAL_PRECISION' cannot be used together.")
 
-    if (unusedKeys.nonEmpty) {
+    if (validateRedundantOptions && unusedKeys.nonEmpty) {
       val unusedKeyStr = unusedKeys.mkString(",")
       val msg = s"Redundant or unrecognized option(s) to 'spark-cobol': $unusedKeyStr."
       if (isPedantic) {
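The flag defaults to true, so existing parse callers keep the redundant-option check; passing false skips that check entirely, letting a caller reuse a full spark-cobol option map for a task that does not consume every key (this is what getCobolParameters in the next file does for copybook parsing). A minimal sketch, assuming a copybook string is in scope and using option keys that appear elsewhere in this commit:

import za.co.absa.cobrix.cobol.reader.parameters.{CobolParametersParser, Parameters}

// Parse the options without warning or failing on keys that are
// redundant for this particular use of the option map.
val params = new Parameters(Map("copybook_contents" -> copybook, "enable_indexes" -> "false"))
val cobolParameters = CobolParametersParser.parse(params, validateRedundantOptions = false)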

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/SparkCobolProcessor.scala

Lines changed: 29 additions & 31 deletions

@@ -20,12 +20,14 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.SparkSession
 import org.slf4j.LoggerFactory
+import za.co.absa.cobrix.cobol.parser.Copybook
 import za.co.absa.cobrix.cobol.processor.impl.CobolProcessorBase
 import za.co.absa.cobrix.cobol.processor.{CobolProcessingStrategy, CobolProcessor, SerializableRawRecordProcessor}
 import za.co.absa.cobrix.cobol.reader.common.Constants
 import za.co.absa.cobrix.cobol.reader.index.entry.SparseIndexEntry
 import za.co.absa.cobrix.cobol.reader.parameters.CobolParametersParser.PARAM_GENERATE_RECORD_ID
 import za.co.absa.cobrix.cobol.reader.parameters.{CobolParameters, CobolParametersParser, Parameters}
+import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
 import za.co.absa.cobrix.spark.cobol.reader.VarLenReader
 import za.co.absa.cobrix.spark.cobol.source.index.IndexBuilder
 import za.co.absa.cobrix.spark.cobol.source.parameters.LocalityParameters
@@ -73,10 +75,6 @@ object SparkCobolProcessor {
         throw new IllegalArgumentException("Copybook contents must be provided.")
       }
 
-      if (rawRecordProcessorOpt.isEmpty) {
-        throw new IllegalArgumentException("A RawRecordProcessor must be provided.")
-      }
-
       if (numberOfThreads < 1) {
         throw new IllegalArgumentException("Number of threads must be at least 1.")
       }
@@ -85,23 +83,7 @@
         throw new IllegalArgumentException("At least one input file must be provided.")
       }
 
-      new SparkCobolProcessorLoader(filePaths, copybookContentsOpt.get, rawRecordProcessorOpt.get, cobolProcessingStrategy, numberOfThreads, caseInsensitiveOptions.toMap)
-    }
-
-    def toRDD(path: String): RDD[Array[Byte]] = {
-      val filePaths = FileUtils
-        .getFiles(path, spark.sparkContext.hadoopConfiguration)
-
-      toRDD(filePaths)
-    }
-
-    def toRDD(filePaths: Seq[String]): RDD[Array[Byte]] = {
-      if (copybookContentsOpt.isEmpty) {
-        throw new IllegalArgumentException("Copybook contents must be provided.")
-      }
-
-      val sconf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
-      getRecordRdd(filePaths, copybookContentsOpt.get, caseInsensitiveOptions.toMap, sconf)
+      new SparkCobolProcessorLoader(filePaths, copybookContentsOpt.get, rawRecordProcessorOpt, cobolProcessingStrategy, numberOfThreads, caseInsensitiveOptions.toMap)
     }
 
     def withCopybookContents(copybookContents: String): SparkCobolProcessorBuilder = {
@@ -154,12 +136,16 @@
 
   class SparkCobolProcessorLoader(filesToRead: Seq[String],
                                   copybookContents: String,
-                                  rawRecordProcessor: SerializableRawRecordProcessor,
+                                  rawRecordProcessorOpt: Option[SerializableRawRecordProcessor],
                                   cobolProcessingStrategy: CobolProcessingStrategy,
                                   numberOfThreads: Int,
                                   options: Map[String, String])
                                  (implicit spark: SparkSession) {
     def save(outputPath: String): Long = {
+      if (rawRecordProcessorOpt.isEmpty) {
+        throw new IllegalArgumentException("A RawRecordProcessor must be provided.")
+      }
+
       val cobolProcessor = CobolProcessor.builder
         .withCopybookContents(copybookContents)
         .withProcessingStrategy(cobolProcessingStrategy)
@@ -170,14 +156,25 @@
         private val sconf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
 
         override def process(listOfFiles: Seq[String], outputPath: String): Long = {
-          getFileProcessorRdd(listOfFiles, outputPath, copybookContents, cobolProcessor, rawRecordProcessor, sconf, numberOfThreads)
+          getFileProcessorRdd(listOfFiles, outputPath, cobolProcessor, rawRecordProcessorOpt.get, sconf, numberOfThreads)
             .reduce(_ + _)
         }
       }
 
       log.info(s"Writing to $outputPath...")
       processor.process(filesToRead, outputPath)
     }
+
+    def getParsedCopybook: Copybook = {
+      val cobolParameters = getCobolParameters(filesToRead, copybookContents, options, ignoreRedundantOptions = true)
+      val readerParameters = CobolParametersParser.getReaderProperties(cobolParameters, None)
+      CobolSchema.fromReaderParameters(Seq(copybookContents), readerParameters).copybook
+    }
+
+    def toRDD: RDD[Array[Byte]] = {
+      val sconf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
+      getRecordRdd(filesToRead, copybookContents, options, sconf)
+    }
   }
 
   def builder(implicit spark: SparkSession): SparkCobolProcessorBuilder = {
@@ -186,7 +183,6 @@
 
   private def getFileProcessorRdd(listOfFiles: Seq[String],
                                   outputPath: String,
-                                  copybookContents: String,
                                   cobolProcessor: CobolProcessor,
                                   rawRecordProcessor: SerializableRawRecordProcessor,
                                   sconf: SerializableConfiguration,
@@ -195,19 +191,22 @@
     val groupedFiles = listOfFiles.grouped(numberOfThreads).toSeq
     val rdd = spark.sparkContext.parallelize(groupedFiles)
     rdd.map(group => {
-      processListOfFiles(group, outputPath, copybookContents, cobolProcessor, rawRecordProcessor, sconf, numberOfThreads)
+      processListOfFiles(group, outputPath, cobolProcessor, rawRecordProcessor, sconf, numberOfThreads)
     })
   }
 
+  private def getCobolParameters(listOfFiles: Seq[String], copybookContents: String, options: Map[String, String], ignoreRedundantOptions: Boolean): CobolParameters = {
+    val varLenOptions = options + (PARAM_GENERATE_RECORD_ID -> "true")
+
+    CobolParametersParser.parse(new Parameters(varLenOptions), !ignoreRedundantOptions)
+      .copy(sourcePaths = listOfFiles, copybookContent = Option(copybookContents))
+  }
+
   private def getRecordRdd(listOfFiles: Seq[String],
                            copybookContents: String,
                            options: Map[String, String],
                            sconf: SerializableConfiguration)(implicit spark: SparkSession): RDD[Array[Byte]] = {
-
-    val varLenOptions = options + (PARAM_GENERATE_RECORD_ID -> "true")
-
-    val cobolParameters: CobolParameters = CobolParametersParser.parse(new Parameters(varLenOptions))
-      .copy(sourcePaths = listOfFiles, copybookContent = Option(copybookContents))
+    val cobolParameters = getCobolParameters(listOfFiles, copybookContents, options, ignoreRedundantOptions = false)
 
     val readerParameters = CobolParametersParser.getReaderProperties(cobolParameters, None)
     val cobolReader = DefaultSource.createVariableLengthReader(cobolParameters, spark)
@@ -248,7 +247,6 @@
 
   private def processListOfFiles(listOfFiles: Seq[String],
                                  outputPath: String,
-                                 copybookContents: String,
                                  cobolProcessor: CobolProcessor,
                                  rawRecordProcessor: SerializableRawRecordProcessor,
                                  sconf: SerializableConfiguration,
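Taken together, one loader now serves both purposes, which the updated test below exercises; a usage sketch with placeholder copybook and inputPath:

val loader = SparkCobolProcessor.builder
  .withCopybookContents(copybook)
  .option("enable_indexes", "false")
  .load(inputPath)

// Copybook AST, parsed with the redundant-option check disabled:
val parsedCopybook: Copybook = loader.getParsedCopybook

// Raw record RDD built from the loaded files:
val rdd: RDD[Array[Byte]] = loader.toRDD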

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/SparkCobolProcessorSuite.scala

Lines changed: 9 additions & 3 deletions

@@ -47,6 +47,7 @@
       val exception = intercept[IllegalArgumentException] {
         SparkCobolProcessor.builder
           .withCopybookContents(copybook).load(".")
+          .save("ignored")
       }
 
       assert(exception.getMessage.contains("A RawRecordProcessor must be provided."))
@@ -159,10 +160,13 @@
 
       writeBinaryFile(inputPath, binData)
 
-      val rdd = SparkCobolProcessor.builder
+      val rddBuilder = SparkCobolProcessor.builder
         .withCopybookContents(copybook)
         .option("enable_indexes", "false")
-        .toRDD(inputPath)
+        .load(inputPath)
+
+      val parsedCopybook = rddBuilder.getParsedCopybook
+      val rdd = rddBuilder.toRDD
 
       val count = rdd.count()
 
@@ -174,6 +178,7 @@
         .sortBy(x => x)
         .collect().mkString(", ")
 
+      assert(parsedCopybook.ast.children.length == 1)
       assert(actual == expected)
     }
   }
@@ -192,7 +197,8 @@
         .withCopybookContents(copybook)
         .option("enable_indexes", "true")
         .option("input_split_records", "2")
-        .toRDD(inputPath)
+        .load(inputPath)
+        .toRDD
 
       val count = rdd.count()
 