
Commit 9d9c23d

#795 Add support for indexes when processing raw records via RDDs.
1 parent 89520b8 commit 9d9c23d

9 files changed (+119 −40 lines)

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/processor/impl/CobolProcessorBase.scala

Lines changed: 5 additions & 2 deletions
@@ -30,9 +30,12 @@ import za.co.absa.cobrix.cobol.reader.stream.SimpleStream
 abstract class CobolProcessorBase extends CobolProcessor with Serializable

 object CobolProcessorBase {
-  def getRecordExtractor(readerParameters: ReaderParameters, copybookContents: String, inputStream: SimpleStream): RawRecordExtractor = {
+  def getRecordExtractor(readerParameters: ReaderParameters, copybookContents: String, inputStream: SimpleStream, headerStreamOpt: Option[SimpleStream]): RawRecordExtractor = {
     val dataStream = inputStream.copyStream()
-    val headerStream = inputStream.copyStream()
+    val headerStream = headerStreamOpt match {
+      case Some(stream) => stream
+      case None => inputStream.copyStream()
+    }

     val reader = new VarLenNestedReader[Array[Any]](Seq(copybookContents), readerParameters, new ArrayOfAnyHandler)
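
Note: the new headerStreamOpt parameter lets a caller whose data stream starts mid-file (for example, at a sparse index offset) supply a separate stream that begins at offset 0, so file headers can still be read. A minimal sketch, assuming the caller already has ReaderParameters and two SimpleStream instances (the helper name below is illustrative, not from this commit):

  // dataStream is positioned at the split's start offset;
  // headerStream always starts at the beginning of the file.
  def extractorForSplit(readerParameters: ReaderParameters,
                        copybookContents: String,
                        dataStream: SimpleStream,
                        headerStream: SimpleStream) =
    CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, dataStream, Some(headerStream))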

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/processor/impl/CobolProcessorInPlace.scala

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ class CobolProcessorInPlace(readerParameters: ReaderParameters,
   override def process(inputStream: SimpleStream,
                        outputStream: OutputStream)
                       (rawRecordProcessor: RawRecordProcessor): Long = {
-    val recordExtractor = CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, inputStream)
+    val recordExtractor = CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, inputStream, None)

     val dataStream = inputStream.copyStream()
     try {

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/processor/impl/CobolProcessorToRdw.scala

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ class CobolProcessorToRdw(readerParameters: ReaderParameters,
   override def process(inputStream: SimpleStream,
                        outputStream: OutputStream)
                       (rawRecordProcessor: RawRecordProcessor): Long = {
-    val recordExtractor = CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, inputStream)
+    val recordExtractor = CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, inputStream, None)

     StreamProcessor.processStreamToRdw(copybook,
       options,

cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/processor/impl/CobolProcessorBaseSuite.scala

Lines changed: 3 additions & 3 deletions
@@ -33,7 +33,7 @@ class CobolProcessorBaseSuite extends AnyWordSpec {
     "work for an fixed-record-length files" in {
       val stream = new ByteStreamMock(Array(0xF1, 0xF2, 0xF3, 0xF4).map(_.toByte))

-      val ext = CobolProcessorBase.getRecordExtractor(ReaderParameters(recordLength = Some(2), options = Map("test" -> "option")), copybook, stream)
+      val ext = CobolProcessorBase.getRecordExtractor(ReaderParameters(recordLength = Some(2), options = Map("test" -> "option")), copybook, stream, None)

       assert(ext.isInstanceOf[FixedRecordLengthRawRecordExtractor])

@@ -49,7 +49,7 @@ class CobolProcessorBaseSuite extends AnyWordSpec {
       val ext = CobolProcessorBase.getRecordExtractor(ReaderParameters(
         recordFormat = RecordFormat.VariableLength,
         isText = true
-      ), copybook, stream)
+      ), copybook, stream, None)

       assert(ext.isInstanceOf[TextFullRecordExtractor])
     }
@@ -61,7 +61,7 @@ class CobolProcessorBaseSuite extends AnyWordSpec {
       CobolProcessorBase.getRecordExtractor(ReaderParameters(
         recordFormat = RecordFormat.VariableLength,
         isRecordSequence = true
-      ), copybook, stream)
+      ), copybook, stream, None)
     }

     assert(ex.getMessage.contains("Cannot create a record extractor for the given reader parameters."))

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/SparkCobolProcessor.scala

Lines changed: 46 additions & 9 deletions
@@ -22,9 +22,15 @@ import org.apache.spark.sql.SparkSession
 import org.slf4j.LoggerFactory
 import za.co.absa.cobrix.cobol.processor.impl.CobolProcessorBase
 import za.co.absa.cobrix.cobol.processor.{CobolProcessingStrategy, CobolProcessor, SerializableRawRecordProcessor}
-import za.co.absa.cobrix.cobol.reader.parameters.{CobolParametersParser, Parameters}
-import za.co.absa.cobrix.spark.cobol.source.SerializableConfiguration
+import za.co.absa.cobrix.cobol.reader.common.Constants
+import za.co.absa.cobrix.cobol.reader.index.entry.SparseIndexEntry
+import za.co.absa.cobrix.cobol.reader.parameters.CobolParametersParser.PARAM_GENERATE_RECORD_ID
+import za.co.absa.cobrix.cobol.reader.parameters.{CobolParameters, CobolParametersParser, Parameters}
+import za.co.absa.cobrix.spark.cobol.reader.VarLenReader
+import za.co.absa.cobrix.spark.cobol.source.index.IndexBuilder
+import za.co.absa.cobrix.spark.cobol.source.parameters.LocalityParameters
 import za.co.absa.cobrix.spark.cobol.source.streaming.FileStreamer
+import za.co.absa.cobrix.spark.cobol.source.{CobolRelation, DefaultSource, SerializableConfiguration}
 import za.co.absa.cobrix.spark.cobol.utils.FileUtils

 import java.io.BufferedOutputStream
@@ -45,6 +51,7 @@ trait SparkCobolProcessor {
 }

 object SparkCobolProcessor {
+  @transient
   private val log = LoggerFactory.getLogger(this.getClass)

   class SparkCobolProcessorBuilder(implicit spark: SparkSession) {
@@ -197,15 +204,45 @@ object SparkCobolProcessor {
                            options: Map[String, String],
                            sconf: SerializableConfiguration)(implicit spark: SparkSession): RDD[Array[Byte]] = {

-    val cobolParameters = CobolParametersParser.parse(new Parameters(options))
-    val readerParameters = CobolParametersParser.getReaderProperties(cobolParameters, None)
+    val varLenOptions = options + (PARAM_GENERATE_RECORD_ID -> "true")

-    spark.sparkContext.parallelize(listOfFiles).flatMap { inputFile =>
-      val hadoopConfig = sconf.value
-      val inputFs = new Path(inputFile).getFileSystem(hadoopConfig)
-      val ifs = new FileStreamer(inputFile, inputFs)
+    val cobolParameters: CobolParameters = CobolParametersParser.parse(new Parameters(varLenOptions))
+      .copy(sourcePaths = listOfFiles, copybookContent = Option(copybookContents))

-      CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, ifs)
+    val readerParameters = CobolParametersParser.getReaderProperties(cobolParameters, None)
+    val cobolReader = DefaultSource.createVariableLengthReader(cobolParameters, spark)
+    val allowIndexes = readerParameters.isIndexGenerationNeeded
+
+    cobolReader match {
+      case reader: VarLenReader if reader.isIndexGenerationNeeded && allowIndexes =>
+        val orderedFiles = CobolRelation.getListFilesWithOrder(listOfFiles, spark.sqlContext, isRecursiveRetrieval = false)
+        val filesMap = orderedFiles.map(fileWithOrder => (fileWithOrder.order, fileWithOrder.filePath)).toMap
+        val indexes: RDD[SparseIndexEntry] = IndexBuilder.buildIndex(orderedFiles, cobolReader, spark.sqlContext)(LocalityParameters(improveLocality = false, optimizeAllocation = false))
+
+        indexes.flatMap(indexEntry => {
+          val filePathName = filesMap(indexEntry.fileId)
+          val path = new Path(filePathName)
+          val fileSystem = path.getFileSystem(sconf.value)
+          val fileName = path.getName
+          val numOfBytes = if (indexEntry.offsetTo > 0L) indexEntry.offsetTo - indexEntry.offsetFrom else 0L
+          val numOfBytesMsg = if (numOfBytes > 0) s"${numOfBytes / Constants.megabyte} MB" else "until the end"
+
+          log.info(s"Going to process offsets ${indexEntry.offsetFrom}...${indexEntry.offsetTo} ($numOfBytesMsg) of $fileName")
+          val dataStream = new FileStreamer(filePathName, fileSystem, indexEntry.offsetFrom, numOfBytes)
+          val headerStream = new FileStreamer(filePathName, fileSystem)
+
+          CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, dataStream, Some(headerStream))
+        })
+
+      case _ =>
+        spark.sparkContext.parallelize(listOfFiles).flatMap { inputFile =>
+          val hadoopConfig = sconf.value
+          log.info(s"Going to process data from $inputFile")
+          val inputFs = new Path(inputFile).getFileSystem(hadoopConfig)
+          val ifs = new FileStreamer(inputFile, inputFs)
+
+          CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, ifs, None)
+        }
     }
   }
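
How the new path is reached from user code, as a sketch based on the options exercised by the test suite further down (copybook and inputPath are placeholders): with indexes enabled, toRDD() builds a sparse index per file and turns each index entry into its own FileStreamer-backed extractor, so raw records are read in parallel per split rather than per file.

  val rdd = SparkCobolProcessor.builder
    .withCopybookContents(copybook)          // copybook text describing the record layout
    .option("enable_indexes", "true")        // allow the index-based path above
    .option("input_split_records", "2")      // roughly 2 records per index entry
    .toRDD(inputPath)                        // RDD[Array[Byte]] of raw records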

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/CobolRelation.scala

Lines changed: 20 additions & 18 deletions
@@ -71,7 +71,7 @@ class CobolRelation(sourceDirs: Seq[String],
   with Serializable
   with TableScan {

-  private val filesList = getListFilesWithOrder(sourceDirs)
+  private val filesList = CobolRelation.getListFilesWithOrder(sourceDirs, sqlContext, isRecursiveRetrieval)

   private lazy val indexes: RDD[SparseIndexEntry] = IndexBuilder.buildIndex(filesList, cobolReader, sqlContext)(localityParams)

@@ -94,23 +94,6 @@ class CobolRelation(sourceDirs: Seq[String],
     }
   }

-  /**
-    * Retrieves a list containing the files contained in the directory to be processed attached to numbers which serve
-    * as their order.
-    *
-    * The List contains [[za.co.absa.cobrix.spark.cobol.source.types.FileWithOrder]] instances.
-    */
-  private def getListFilesWithOrder(sourceDirs: Seq[String]): Array[FileWithOrder] = {
-    val allFiles = sourceDirs.flatMap(sourceDir => {
-      FileUtils
-        .getFiles(sourceDir, sqlContext.sparkContext.hadoopConfiguration, isRecursiveRetrieval)
-    }).toArray
-
-    allFiles
-      .zipWithIndex
-      .map(file => FileWithOrder(file._1, file._2))
-  }
-
   /**
    * Checks if the recursive file retrieval flag is set
    */
@@ -127,4 +110,23 @@ class CobolRelation(sourceDirs: Seq[String],
       }
     })
   }
+}
+
+object CobolRelation {
+  /**
+    * Retrieves a list containing the files contained in the directory to be processed attached to numbers which serve
+    * as their order.
+    *
+    * The List contains [[za.co.absa.cobrix.spark.cobol.source.types.FileWithOrder]] instances.
+    */
+  def getListFilesWithOrder(sourceDirs: Seq[String], sqlContext: SQLContext, isRecursiveRetrieval: Boolean): Array[FileWithOrder] = {
+    val allFiles = sourceDirs.flatMap(sourceDir => {
+      FileUtils
+        .getFiles(sourceDir, sqlContext.sparkContext.hadoopConfiguration, isRecursiveRetrieval)
+    }).toArray
+
+    allFiles
+      .zipWithIndex
+      .map(file => FileWithOrder(file._1, file._2))
+  }
 }
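
With getListFilesWithOrder moved to the companion object, callers outside the relation (such as SparkCobolProcessor above) can reuse it. A short sketch; the directory path is a placeholder:

  // Assigns a stable numeric order to every file under the given directories;
  // SparkCobolProcessor uses that order to map SparseIndexEntry.fileId back to a path.
  val orderedFiles: Array[FileWithOrder] =
    CobolRelation.getListFilesWithOrder(Seq("/data/ebcdic"), spark.sqlContext, isRecursiveRetrieval = false)
  val filesMap = orderedFiles.map(f => (f.order, f.filePath)).toMap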

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/DefaultSource.scala

Lines changed: 10 additions & 4 deletions
@@ -21,6 +21,7 @@ import org.apache.hadoop.io.{BytesWritable, NullWritable}
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}
+import org.slf4j.{Logger, LoggerFactory}
 import za.co.absa.cobrix.cobol.internal.Logging
 import za.co.absa.cobrix.cobol.reader.parameters.CobolParametersParser._
 import za.co.absa.cobrix.cobol.reader.parameters.{CobolParameters, CobolParametersParser, Parameters}
@@ -41,6 +42,7 @@ class DefaultSource
   with DataSourceRegister
   with ReaderFactory
   with Logging {
+  import DefaultSource._

   override def shortName(): String = SHORT_NAME

@@ -124,13 +126,17 @@ class DefaultSource

   //TODO fix with the correct implementation once the correct Reader hierarchy is put in place.
   override def buildReader(spark: SparkSession, parameters: Map[String, String]): FixedLenReader = null
+}
+
+object DefaultSource {
+  private val logger: Logger = LoggerFactory.getLogger(this.getClass)

   /**
    * Builds one of two Readers, depending on the parameters.
    *
    * This method will probably be removed once the correct hierarchy for [[FixedLenReader]] is put in place.
    */
-  private def buildEitherReader(spark: SparkSession, cobolParameters: CobolParameters): Reader = {
+  def buildEitherReader(spark: SparkSession, cobolParameters: CobolParameters): Reader = {
     val reader = if (cobolParameters.isText && cobolParameters.variableLengthParams.isEmpty) {
       createTextReader(cobolParameters, spark)
     } else if (cobolParameters.variableLengthParams.isEmpty) {
@@ -148,7 +154,7 @@ class DefaultSource
   /**
    * Creates a Reader that knows how to consume text Cobol records.
    */
-  private def createTextReader(parameters: CobolParameters, spark: SparkSession): FixedLenReader = {
+  def createTextReader(parameters: CobolParameters, spark: SparkSession): FixedLenReader = {
     val copybookContent = CopybookContentLoader.load(parameters, spark.sparkContext.hadoopConfiguration)
     val defaultHdfsBlockSize = SparkUtils.getDefaultHdfsBlockSize(spark, parameters.sourcePaths.headOption)
     new FixedLenTextReader(copybookContent, getReaderProperties(parameters, defaultHdfsBlockSize)
@@ -158,7 +164,7 @@ class DefaultSource
   /**
    * Creates a Reader that knows how to consume fixed-length Cobol records.
    */
-  private def createFixedLengthReader(parameters: CobolParameters, spark: SparkSession): FixedLenReader = {
+  def createFixedLengthReader(parameters: CobolParameters, spark: SparkSession): FixedLenReader = {

     val copybookContent = CopybookContentLoader.load(parameters, spark.sparkContext.hadoopConfiguration)
     val defaultHdfsBlockSize = SparkUtils.getDefaultHdfsBlockSize(spark, parameters.sourcePaths.headOption)
@@ -171,7 +177,7 @@ class DefaultSource
    *
    * The variable-length reading process is approached as if reading from a stream.
    */
-  private def createVariableLengthReader(parameters: CobolParameters, spark: SparkSession): VarLenReader = {
+  def createVariableLengthReader(parameters: CobolParameters, spark: SparkSession): VarLenReader = {


     val copybookContent = CopybookContentLoader.load(parameters, spark.sparkContext.hadoopConfiguration)
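
Moving the reader factory methods into a companion object makes them callable without a DefaultSource instance. A sketch of the call SparkCobolProcessor now relies on (cobolParameters is assumed to come from CobolParametersParser.parse):

  // Builds a variable-length reader; its isIndexGenerationNeeded flag decides
  // whether the index-based RDD path can be taken for the given options.
  val varLenReader: VarLenReader = DefaultSource.createVariableLengthReader(cobolParameters, spark)
  val useIndexes = varLenReader.isIndexGenerationNeeded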

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ import scala.collection.mutable.ArrayBuffer
   *
   * In a nutshell, ideally, there will be as many partitions as are there are indexes.
   */
-private[source] object IndexBuilder extends Logging {
+private[cobol] object IndexBuilder extends Logging {
  def buildIndex(filesList: Array[FileWithOrder],
                 cobolReader: Reader,
                 sqlContext: SQLContext)
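
Widening the visibility from private[source] to private[cobol] is what lets SparkCobolProcessor, which lives one package up, build indexes directly, as in this sketch taken from the usage above:

  val indexes: RDD[SparseIndexEntry] =
    IndexBuilder.buildIndex(orderedFiles, cobolReader, spark.sqlContext)(
      LocalityParameters(improveLocality = false, optimizeAllocation = false))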

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/SparkCobolProcessorSuite.scala

Lines changed: 32 additions & 1 deletion
@@ -149,7 +149,7 @@ class SparkCobolProcessorSuite extends AnyWordSpec with SparkTestBase with Binar
       }
     }

-    "convert input format into an RDD" in {
+    "convert input format into an RDD without indexes" in {
       val expected = """-13, -14, -15"""
       withTempDirectory("spark_cobol_processor") { tempDir =>
         val binData = Array(0xF1, 0xF2, 0xF3, 0xF1).map(_.toByte)
@@ -161,6 +161,7 @@ class SparkCobolProcessorSuite extends AnyWordSpec with SparkTestBase with Binar

         val rdd = SparkCobolProcessor.builder
           .withCopybookContents(copybook)
+          .option("enable_indexes", "false")
           .toRDD(inputPath)

         val count = rdd.count()
@@ -176,4 +177,34 @@ class SparkCobolProcessorSuite extends AnyWordSpec with SparkTestBase with Binar
         assert(actual == expected)
       }
     }
+
+    "convert input format into an RDD with index" in {
+      val expected = """-10, -11, -12, -13, -14, -15"""
+      withTempDirectory("spark_cobol_processor") { tempDir =>
+        val binData = Array(0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF1).map(_.toByte)
+
+        val inputPath = new Path(tempDir, "input.dat").toString
+        val outputPath = new Path(tempDir, "output").toString
+
+        writeBinaryFile(inputPath, binData)
+
+        val rdd = SparkCobolProcessor.builder
+          .withCopybookContents(copybook)
+          .option("enable_indexes", "true")
+          .option("input_split_records", "2")
+          .toRDD(inputPath)
+
+        val count = rdd.count()
+
+        assert(count == 7)
+
+        val actual = rdd
+          .map(row => row.mkString)
+          .distinct
+          .sortBy(x => x)
+          .collect().mkString(", ")
+
+        assert(actual == expected)
+      }
+    }
   }
