
Commit 89520b8

#795 Add a feature to get RDD of raw records for raw EBCDIC processing.
1 parent daef609 commit 89520b8

6 files changed: +74 −18 lines changed
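
For orientation, here is a minimal usage sketch of the feature this commit adds, inferred from the diff and the new test below. It assumes an implicit SparkSession and a copybook string named copybook are in scope; the input path is illustrative:

  import org.apache.spark.rdd.RDD
  import za.co.absa.cobrix.spark.cobol.SparkCobolProcessor

  // Get raw (still EBCDIC-encoded) records back as one Array[Byte] per record,
  // without decoding them into Spark rows.
  val rawRecords: RDD[Array[Byte]] =
    SparkCobolProcessor.builder
      .withCopybookContents(copybook)
      .toRDD("/path/to/ebcdic/data")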

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/processor/impl/CobolProcessorBase.scala

Lines changed: 4 additions & 2 deletions
@@ -27,8 +27,10 @@ import za.co.absa.cobrix.cobol.reader.stream.SimpleStream
   *
   * The processing can be done from inside an RDD so this is why it is serializable.
   */
-abstract class CobolProcessorBase extends CobolProcessor with Serializable {
-  private[processor] def getRecordExtractor(readerParameters: ReaderParameters, copybookContents: String, inputStream: SimpleStream): RawRecordExtractor = {
+abstract class CobolProcessorBase extends CobolProcessor with Serializable
+
+object CobolProcessorBase {
+  def getRecordExtractor(readerParameters: ReaderParameters, copybookContents: String, inputStream: SimpleStream): RawRecordExtractor = {
     val dataStream = inputStream.copyStream()
     val headerStream = inputStream.copyStream()

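Two things change here: getRecordExtractor moves from the abstract class into a CobolProcessorBase object, and it loses its private[processor] qualifier, which is what lets the spark-cobol module call it further down. The likely motivation, given the class comment about serializability, is that an object method is resolved statically on executors, while an instance method used inside an RDD closure drags the whole instance into the closure. A generic sketch of that distinction (illustrative names, not Cobrix code):

  class InstanceHelper extends Serializable {
    def unsign(b: Byte): Int = b & 0xFF // using this in rdd.map captures `this`
  }

  object StaticHelper {
    def unsign(b: Byte): Int = b & 0xFF // no enclosing instance to serialize
  }

  // rdd.map(helper.unsign)       // ships the InstanceHelper object with each task
  // rdd.map(StaticHelper.unsign) // ships only the function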

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/processor/impl/CobolProcessorInPlace.scala

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ class CobolProcessorInPlace(readerParameters: ReaderParameters,
   override def process(inputStream: SimpleStream,
                        outputStream: OutputStream)
                       (rawRecordProcessor: RawRecordProcessor): Long = {
-    val recordExtractor = getRecordExtractor(readerParameters, copybookContents, inputStream)
+    val recordExtractor = CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, inputStream)

     val dataStream = inputStream.copyStream()
     try {

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/processor/impl/CobolProcessorToRdw.scala

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ class CobolProcessorToRdw(readerParameters: ReaderParameters,
   override def process(inputStream: SimpleStream,
                        outputStream: OutputStream)
                       (rawRecordProcessor: RawRecordProcessor): Long = {
-    val recordExtractor = getRecordExtractor(readerParameters, copybookContents, inputStream)
+    val recordExtractor = CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, inputStream)

     StreamProcessor.processStreamToRdw(copybook,
       options,

cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/processor/impl/CobolProcessorBaseSuite.scala

Lines changed: 3 additions & 12 deletions
@@ -32,11 +32,8 @@ class CobolProcessorBaseSuite extends AnyWordSpec {
   "getRecordExtractor" should {
     "work for an fixed-record-length files" in {
       val stream = new ByteStreamMock(Array(0xF1, 0xF2, 0xF3, 0xF4).map(_.toByte))
-      val processor = CobolProcessor.builder
-        .withCopybookContents(copybook)
-        .build().asInstanceOf[CobolProcessorInPlace]

-      val ext = processor.getRecordExtractor(ReaderParameters(recordLength = Some(2), options = Map("test" -> "option")), copybook, stream)
+      val ext = CobolProcessorBase.getRecordExtractor(ReaderParameters(recordLength = Some(2), options = Map("test" -> "option")), copybook, stream)

       assert(ext.isInstanceOf[FixedRecordLengthRawRecordExtractor])

@@ -48,11 +45,8 @@ class CobolProcessorBaseSuite extends AnyWordSpec {

     "work for an variable-record-length files" in {
       val stream = new ByteStreamMock(Array(0xF1, 0xF2, 0xF3, 0xF4).map(_.toByte))
-      val processor = CobolProcessor.builder
-        .withCopybookContents(copybook)
-        .build().asInstanceOf[CobolProcessorInPlace]

-      val ext = processor.getRecordExtractor(ReaderParameters(
+      val ext = CobolProcessorBase.getRecordExtractor(ReaderParameters(
         recordFormat = RecordFormat.VariableLength,
         isText = true
       ), copybook, stream)

@@ -62,12 +56,9 @@ class CobolProcessorBaseSuite extends AnyWordSpec {

     "throw an exception on a non-supported record format for processing" in {
       val stream = new ByteStreamMock(Array(0xF1, 0xF2, 0xF3, 0xF4).map(_.toByte))
-      val processor = CobolProcessor.builder
-        .withCopybookContents(copybook)
-        .build().asInstanceOf[CobolProcessorInPlace]

       val ex = intercept[IllegalArgumentException] {
-        processor.getRecordExtractor(ReaderParameters(
+        CobolProcessorBase.getRecordExtractor(ReaderParameters(
           recordFormat = RecordFormat.VariableLength,
           isRecordSequence = true
         ), copybook, stream)
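
The fixed-length expectation in the first test can be checked by hand: four input bytes with recordLength = Some(2) should split into two 2-byte records. A plain-Scala sketch of that split (not the extractor implementation itself):

  val bytes = Array(0xF1, 0xF2, 0xF3, 0xF4).map(_.toByte)
  val records = bytes.grouped(2).toList // a fixed record length of 2
  assert(records.length == 2)
  assert(records.head.sameElements(Array(0xF1, 0xF2).map(_.toByte)))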

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/SparkCobolProcessor.scala

Lines changed: 35 additions & 0 deletions
@@ -20,7 +20,9 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.SparkSession
 import org.slf4j.LoggerFactory
+import za.co.absa.cobrix.cobol.processor.impl.CobolProcessorBase
 import za.co.absa.cobrix.cobol.processor.{CobolProcessingStrategy, CobolProcessor, SerializableRawRecordProcessor}
+import za.co.absa.cobrix.cobol.reader.parameters.{CobolParametersParser, Parameters}
 import za.co.absa.cobrix.spark.cobol.source.SerializableConfiguration
 import za.co.absa.cobrix.spark.cobol.source.streaming.FileStreamer
 import za.co.absa.cobrix.spark.cobol.utils.FileUtils

@@ -79,6 +81,22 @@ object SparkCobolProcessor {
       new SparkCobolProcessorLoader(filePaths, copybookContentsOpt.get, rawRecordProcessorOpt.get, cobolProcessingStrategy, numberOfThreads, caseInsensitiveOptions.toMap)
     }

+    def toRDD(path: String): RDD[Array[Byte]] = {
+      val filePaths = FileUtils
+        .getFiles(path, spark.sparkContext.hadoopConfiguration)
+
+      toRDD(filePaths)
+    }
+
+    def toRDD(filePaths: Seq[String]): RDD[Array[Byte]] = {
+      if (copybookContentsOpt.isEmpty) {
+        throw new IllegalArgumentException("Copybook contents must be provided.")
+      }
+
+      val sconf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
+      getRecordRdd(filePaths, copybookContentsOpt.get, caseInsensitiveOptions.toMap, sconf)
+    }
+
     def withCopybookContents(copybookContents: String): SparkCobolProcessorBuilder = {
       copybookContentsOpt = Option(copybookContents)
       this

@@ -174,6 +192,23 @@
     })
   }

+  private def getRecordRdd(listOfFiles: Seq[String],
+                           copybookContents: String,
+                           options: Map[String, String],
+                           sconf: SerializableConfiguration)(implicit spark: SparkSession): RDD[Array[Byte]] = {
+
+    val cobolParameters = CobolParametersParser.parse(new Parameters(options))
+    val readerParameters = CobolParametersParser.getReaderProperties(cobolParameters, None)
+
+    spark.sparkContext.parallelize(listOfFiles).flatMap { inputFile =>
+      val hadoopConfig = sconf.value
+      val inputFs = new Path(inputFile).getFileSystem(hadoopConfig)
+      val ifs = new FileStreamer(inputFile, inputFs)
+
+      CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, ifs)
+    }
+  }
+
   private def processListOfFiles(listOfFiles: Seq[String],
                                  outputPath: String,
                                  copybookContents: String,

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/SparkCobolProcessorSuite.scala

Lines changed: 30 additions & 2 deletions
@@ -87,7 +87,7 @@ class SparkCobolProcessorSuite extends AnyWordSpec with SparkTestBase with Binar

       SparkCobolProcessor.builder
         .withCopybookContents(copybook)
-        .withRecordProcessor (new SerializableRawRecordProcessor {
+        .withRecordProcessor(new SerializableRawRecordProcessor {
           override def processRecord(record: Array[Byte], ctx: CobolProcessorContext): Array[Byte] = {
             record.map(v => (v - 1).toByte)
           }

@@ -119,7 +119,7 @@ class SparkCobolProcessorSuite extends AnyWordSpec with SparkTestBase with Binar
       SparkCobolProcessor.builder
         .withCopybookContents(copybook)
         .withProcessingStrategy(CobolProcessingStrategy.ToVariableLength)
-        .withRecordProcessor (new SerializableRawRecordProcessor {
+        .withRecordProcessor(new SerializableRawRecordProcessor {
           override def processRecord(record: Array[Byte], ctx: CobolProcessorContext): Array[Byte] = {
             record.map(v => (v - 1).toByte)
           }

@@ -148,4 +148,32 @@ class SparkCobolProcessorSuite extends AnyWordSpec with SparkTestBase with Binar
       }
     }
   }
+
+  "convert input format into an RDD" in {
+    val expected = """-13, -14, -15"""
+    withTempDirectory("spark_cobol_processor") { tempDir =>
+      val binData = Array(0xF1, 0xF2, 0xF3, 0xF1).map(_.toByte)
+
+      val inputPath = new Path(tempDir, "input.dat").toString
+      val outputPath = new Path(tempDir, "output").toString
+
+      writeBinaryFile(inputPath, binData)
+
+      val rdd = SparkCobolProcessor.builder
+        .withCopybookContents(copybook)
+        .toRDD(inputPath)
+
+      val count = rdd.count()
+
+      assert(count == 4)
+
+      val actual = rdd
+        .map(row => row.mkString)
+        .distinct
+        .sortBy(x => x)
+        .collect().mkString(", ")
+
+      assert(actual == expected)
+    }
+  }
 }
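
The expected string in the new test follows from byte arithmetic. Each record here is a single byte (four input bytes yield a count of 4): 0xF1 through 0xF3 are the EBCDIC digits '1' to '3', and as signed JVM bytes they are -15, -14 and -13. mkString on a one-byte record renders that decimal value, distinct collapses the duplicate 0xF1, and lexicographic sorting puts "-13" first. A one-off check of the arithmetic:

  val printed = Array(0xF1, 0xF2, 0xF3).map(_.toByte).map(b => Array(b).mkString)
  assert(printed.sorted.mkString(", ") == "-13, -14, -15")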
