
Commit c530c68

#788 Change the SparkCobolProcessor logic to be based on the builder pattern.
1 parent f2c8397

6 files changed: +88 -36 lines changed
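In short, this commit replaces the terminal build()/process() pair on SparkCobolProcessor with a fluent load(...)/save(...) pair and moves the per-record arguments (copybook, options, offset) into a new CobolProcessorContext. A hedged before/after sketch in Scala, assuming an implicit SparkSession in scope; all paths are placeholders:

// Before (sketch): build a processor, then call process() explicitly.
val processor = SparkCobolProcessor.builder
  .withCopybookContents(copybookContents)
  .withRecordProcessor(rawRecordProcessor)
  .build()
processor.process(Seq("/input/data1.dat"), "/output")

// After (sketch): load the input, then save the processed records.
SparkCobolProcessor.builder
  .withCopybookContents(copybookContents)
  .withRecordProcessor(rawRecordProcessor)
  .load("/input")   // a single path; an overload accepts Seq[String]
  .save("/output")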
cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/processor/CobolProcessorContext.scala

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.cobol.processor
+
+import za.co.absa.cobrix.cobol.parser.Copybook
+
+case class CobolProcessorContext(copybook: Copybook,
+                                 options: Map[String, String],
+                                 currentOffset: Long)

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/processor/RawRecordProcessor.scala

Lines changed: 1 addition & 7 deletions
@@ -16,16 +16,10 @@
 
 package za.co.absa.cobrix.cobol.processor
 
-import za.co.absa.cobrix.cobol.parser.Copybook
-
 /**
  * A trait that defines a processor for raw COBOL records.
  * It provides a method to process a single COBOL record based on the provided copybook and options.
  */
 trait RawRecordProcessor {
-  def processRecord(copybook: Copybook,
-                    options: Map[String, String],
-                    record: Array[Byte],
-                    offset: Long): Array[Byte]
+  def processRecord(record: Array[Byte], ctx: CobolProcessorContext): Array[Byte]
 }
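To illustrate the new signature, a minimal sketch of an implementation; the copybook, options, and current offset now arrive through the context instead of as separate parameters. The XOR transform and the "mask" option key are made up for this example:

import za.co.absa.cobrix.cobol.processor.{CobolProcessorContext, RawRecordProcessor}

val maskingProcessor = new RawRecordProcessor {
  override def processRecord(record: Array[Byte], ctx: CobolProcessorContext): Array[Byte] = {
    // Everything that used to be passed as arguments is available on ctx:
    // ctx.copybook, ctx.options, ctx.currentOffset.
    val mask = ctx.options.getOrElse("mask", "0").toByte // hypothetical option key
    record.map(b => (b ^ mask).toByte)
  }
}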

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/processor/impl/StreamProcessor.scala

Lines changed: 4 additions & 2 deletions
@@ -17,7 +17,7 @@
 package za.co.absa.cobrix.cobol.processor.impl
 
 import za.co.absa.cobrix.cobol.parser.Copybook
-import za.co.absa.cobrix.cobol.processor.RawRecordProcessor
+import za.co.absa.cobrix.cobol.processor.{CobolProcessorContext, RawRecordProcessor}
 import za.co.absa.cobrix.cobol.reader.extractors.raw.RawRecordExtractor
 import za.co.absa.cobrix.cobol.reader.stream.SimpleStream
 
@@ -48,7 +48,9 @@ object StreamProcessor {
       val record = recordExtractor.next()
       val recordSize = record.length
 
-      val updatedRecord = recordProcessor.processRecord(copybook, options, record, recordExtractor.offset)
+      val ctx = CobolProcessorContext(copybook, options, recordExtractor.offset)
+
+      val updatedRecord = recordProcessor.processRecord(record, ctx)
 
       val headerSize = recordExtractor.offset - recordSize - inputStream.offset
       if (headerSize > 0) {

cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/processor/CobolProcessorBuilderSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ class CobolProcessorBuilderSuite extends AnyWordSpec {
     val builder = CobolProcessor.builder(copybook)
 
     val processor = new RawRecordProcessor {
-      override def processRecord(copybook: Copybook, options: Map[String, String], record: Array[Byte], offset: Long): Array[Byte] = {
+      override def processRecord(record: Array[Byte], ctx: CobolProcessorContext): Array[Byte] = {
        record.map(v => (v - 1).toByte)
      }
    }
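For completeness, a hedged sketch of the core (non-Spark) builder chain, assembled only from the calls visible elsewhere in this commit (builder(...), options(...), build()); the option map is a placeholder:

val cobolProcessor = CobolProcessor.builder(copybookContents)
  .options(Map("record_format" -> "F")) // placeholder options
  .build()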

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/SparkCobolProcessor.scala

Lines changed: 36 additions & 13 deletions
@@ -23,6 +23,7 @@ import org.slf4j.LoggerFactory
 import za.co.absa.cobrix.cobol.processor.{CobolProcessor, SerializableRawRecordProcessor}
 import za.co.absa.cobrix.spark.cobol.source.SerializableConfiguration
 import za.co.absa.cobrix.spark.cobol.source.streaming.FileStreamer
+import za.co.absa.cobrix.spark.cobol.utils.FileUtils
 
 import java.io.BufferedOutputStream
 import java.util.concurrent.{ExecutorService, Executors}
@@ -50,7 +51,14 @@ object SparkCobolProcessor {
     private var rawRecordProcessorOpt: Option[SerializableRawRecordProcessor] = None
     private var numberOfThreads: Int = 1
 
-    def build(): SparkCobolProcessor = {
+    def load(path: String): SparkCobolProcessorLoader = {
+      val filePaths = FileUtils
+        .getFiles(path, spark.sparkContext.hadoopConfiguration)
+
+      load(filePaths)
+    }
+
+    def load(filePaths: Seq[String]): SparkCobolProcessorLoader = {
       if (copybookContentsOpt.isEmpty) {
         throw new IllegalArgumentException("Copybook contents must be provided.")
       }
@@ -63,18 +71,11 @@ object SparkCobolProcessor {
         throw new IllegalArgumentException("Number of threads must be at least 1.")
       }
 
-      val cobolProcessor = CobolProcessor.builder(copybookContentsOpt.get)
-        .options(caseInsensitiveOptions.toMap)
-        .build()
-
-      new SparkCobolProcessor {
-        private val sconf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
-
-        override def process(listOfFiles: Seq[String], outputPath: String): Long = {
-          getFileProcessorRdd(listOfFiles, outputPath, copybookContentsOpt.get, cobolProcessor, rawRecordProcessorOpt.get, sconf, numberOfThreads)
-            .reduce(_ + _)
-        }
+      if (filePaths.isEmpty) {
+        throw new IllegalArgumentException("At least one input file must be provided.")
       }
+
+      new SparkCobolProcessorLoader(filePaths, copybookContentsOpt.get, rawRecordProcessorOpt.get, numberOfThreads, caseInsensitiveOptions.toMap)
     }
 
     def withCopybookContents(copybookContents: String): SparkCobolProcessorBuilder = {
@@ -118,9 +119,31 @@ object SparkCobolProcessor {
       caseInsensitiveOptions ++= options.map(kv => (kv._1.toLowerCase(), kv._2))
       this
     }
-
   }
 
+  class SparkCobolProcessorLoader(filesToRead: Seq[String],
+                                  copybookContents: String,
+                                  rawRecordProcessor: SerializableRawRecordProcessor,
+                                  numberOfThreads: Int,
+                                  options: Map[String, String])
+                                 (implicit spark: SparkSession) {
+    def save(outputPath: String): Long = {
+      val cobolProcessor = CobolProcessor.builder(copybookContents)
+        .options(options)
+        .build()
+
+      val processor = new SparkCobolProcessor {
+        private val sconf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
+
+        override def process(listOfFiles: Seq[String], outputPath: String): Long = {
+          getFileProcessorRdd(listOfFiles, outputPath, copybookContents, cobolProcessor, rawRecordProcessor, sconf, numberOfThreads)
+            .reduce(_ + _)
+        }
+      }
+
+      processor.process(filesToRead, outputPath)
+    }
+  }
 
   def builder(implicit spark: SparkSession): SparkCobolProcessorBuilder = {
     new SparkCobolProcessorBuilder
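Note that save returns a Long: the RDD produced by getFileProcessorRdd is summed with reduce(_ + _), giving one total across all processed files. A hedged sketch of the Seq overload together with the thread setting; all values are placeholders:

val total: Long = SparkCobolProcessor.builder
  .withCopybookContents(copybookContents)
  .withRecordProcessor(rawRecordProcessor)
  .withMultithreaded(4)                      // must be at least 1
  .load(Seq("/input/a.dat", "/input/b.dat")) // fails fast if the Seq is empty
  .save("/output")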

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/SparkCobolProcessorSuite.scala

Lines changed: 23 additions & 13 deletions
@@ -18,8 +18,7 @@ package za.co.absa.cobrix.spark.cobol
 
 import org.apache.hadoop.fs.Path
 import org.scalatest.wordspec.AnyWordSpec
-import za.co.absa.cobrix.cobol.parser.Copybook
-import za.co.absa.cobrix.cobol.processor.SerializableRawRecordProcessor
+import za.co.absa.cobrix.cobol.processor.{CobolProcessorContext, SerializableRawRecordProcessor}
 import za.co.absa.cobrix.spark.cobol.source.base.SparkTestBase
 import za.co.absa.cobrix.spark.cobol.source.fixtures.{BinaryFileFixture, TextComparisonFixture}
 
@@ -30,15 +29,15 @@ class SparkCobolProcessorSuite extends AnyWordSpec with SparkTestBase with Binar
       |""".stripMargin
 
   private val rawRecordProcessor = new SerializableRawRecordProcessor {
-    override def processRecord(copybook: Copybook, options: Map[String, String], record: Array[Byte], offset: Long): Array[Byte] = {
+    override def processRecord(record: Array[Byte], ctx: CobolProcessorContext): Array[Byte] = {
       record.map(v => (v - 1).toByte)
     }
   }
 
   "SparkCobolProcessor" should {
     "fail to create when a copybook is not specified" in {
       val exception = intercept[IllegalArgumentException] {
-        SparkCobolProcessor.builder.build()
+        SparkCobolProcessor.builder.load(".")
       }
 
       assert(exception.getMessage.contains("Copybook contents must be provided."))
@@ -47,8 +46,7 @@ class SparkCobolProcessorSuite extends AnyWordSpec with SparkTestBase with Binar
     "fail to create when a record processor is not provided" in {
       val exception = intercept[IllegalArgumentException] {
         SparkCobolProcessor.builder
-          .withCopybookContents(copybook)
-          .build()
+          .withCopybookContents(copybook).load(".")
       }
 
       assert(exception.getMessage.contains("A RawRecordProcessor must be provided."))
@@ -60,13 +58,24 @@ class SparkCobolProcessorSuite extends AnyWordSpec with SparkTestBase with Binar
           .withCopybookContents(copybook)
           .withRecordProcessor(rawRecordProcessor)
          .withMultithreaded(0)
-          .build()
+          .load("")
      }

      assert(exception.getMessage.contains("Number of threads must be at least 1."))
    }

-    "create a processor that processes files via an RDD" in {
+    "fail when no files are provided" in {
+      val exception = intercept[IllegalArgumentException] {
+        SparkCobolProcessor.builder
+          .withCopybookContents(copybook)
+          .withRecordProcessor(rawRecordProcessor)
+          .load(Seq.empty)
+      }
+
+      assert(exception.getMessage.contains("At least one input file must be provided."))
+    }
+
+    "process files via an RDD" in {
      withTempDirectory("spark_cobol_processor") { tempDir =>
        val binData = Array(0xF1, 0xF2, 0xF3, 0xF4).map(_.toByte)
@@ -76,12 +85,13 @@ class SparkCobolProcessorSuite extends AnyWordSpec with SparkTestBase with Binar
 
       writeBinaryFile(inputPath, binData)
 
-      val processor = SparkCobolProcessor.builder
+      SparkCobolProcessor.builder
         .withCopybookContents(copybook)
-        .withRecordProcessor(rawRecordProcessor)
-        .build()
-
-      processor.process(Seq(inputPath), outputPath)
+        .withRecordProcessor { (record: Array[Byte], ctx: CobolProcessorContext) =>
+          record.map(v => (v - 1).toByte)
+        }
+        .load(inputPath)
+        .save(outputPath)
 
       val outputData = readBinaryFile(outputFile)