
Commit de556c2

#805 Implement index caching for VRL files for faster processing when the same files need to be processed multiple times.
1 parent 5b83b1a commit de556c2

File tree

10 files changed: +94 −27 lines
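
The new `enable_index_cache` option (see the CobolParametersParser diff below) defaults to `true` and sits alongside the existing `enable_indexes` option. As a rough illustration of how a user could toggle it from the Spark API, here is a minimal sketch; the `cobol` format name, `record_format` value, and the copybook/data paths are assumptions for the example, not part of this commit:

import org.apache.spark.sql.SparkSession

// Hypothetical session and paths, for illustration only.
val spark = SparkSession.builder()
  .appName("cobrix-index-cache-example")
  .master("local[*]")
  .getOrCreate()

val df = spark.read
  .format("cobol")                              // spark-cobol data source
  .option("copybook", "/path/to/copybook.cob")  // copybook describing the record layout
  .option("record_format", "V")                 // variable record length (VRL) input
  .option("enable_indexes", "true")             // existing option: build sparse indexes for input files
  .option("enable_index_cache", "true")         // new option from this commit (default: true)
  .load("/path/to/vrl/data")

// A second read of the same files within the same application can reuse the cached
// indexes instead of re-scanning the files to rebuild them.
val dfAgain = spark.read
  .format("cobol")
  .option("copybook", "/path/to/copybook.cob")
  .option("record_format", "V")
  .load("/path/to/vrl/data")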

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala

Lines changed: 4 additions & 0 deletions

@@ -119,6 +119,7 @@ object CobolParametersParser extends Logging {
 
   // Indexed multisegment file processing
   val PARAM_ENABLE_INDEXES = "enable_indexes"
+  val PARAM_ENABLE_INDEX_CACHE = "enable_index_cache"
   val PARAM_INPUT_SPLIT_RECORDS = "input_split_records"
   val PARAM_INPUT_SPLIT_SIZE_MB = "input_split_size_mb"
   val PARAM_SEGMENT_ID_PREFIX = "segment_id_prefix"

@@ -381,6 +382,7 @@
       fileEndOffset = 0,
       generateRecordId = false,
       isUsingIndex = false,
+      isIndexCachingAllowed = false,
       inputSplitRecords = None,
       inputSplitSizeMB = None,
       improveLocality = false,

@@ -416,6 +418,7 @@
       isRdwPartRecLength = varLenParams.isRdwPartRecLength,
       rdwAdjustment = varLenParams.rdwAdjustment,
       isIndexGenerationNeeded = varLenParams.isUsingIndex,
+      isIndexCachingAllowed = varLenParams.isIndexCachingAllowed,
       inputSplitRecords = varLenParams.inputSplitRecords,
       inputSplitSizeMB = varLenParams.inputSplitSizeMB,
       hdfsDefaultBlockSize = defaultBlockSize,

@@ -502,6 +505,7 @@
       fileEndOffset,
       isRecordIdGenerationEnabled,
       params.getOrElse(PARAM_ENABLE_INDEXES, "true").toBoolean,
+      params.getOrElse(PARAM_ENABLE_INDEX_CACHE, "true").toBoolean,
       params.get(PARAM_INPUT_SPLIT_RECORDS).map(v => v.toInt),
       params.get(PARAM_INPUT_SPLIT_SIZE_MB).map(v => v.toInt),
       params.getOrElse(PARAM_IMPROVE_LOCALITY, "true").toBoolean,

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala

Lines changed: 1 addition & 0 deletions

@@ -99,6 +99,7 @@ case class ReaderParameters(
     isRdwPartRecLength: Boolean = false,
     rdwAdjustment: Int = 0,
     isIndexGenerationNeeded: Boolean = false,
+    isIndexCachingAllowed: Boolean = false,
     inputSplitRecords: Option[Int] = None,
     inputSplitSizeMB: Option[Int] = None,
     hdfsDefaultBlockSize: Option[Int] = None,

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/VariableLengthParameters.scala

Lines changed: 2 additions & 0 deletions

@@ -34,6 +34,7 @@ package za.co.absa.cobrix.cobol.reader.parameters
  * @param fileEndOffset A number of bytes to skip at the end of each file
  * @param generateRecordId Generate a sequential record number for each record to be able to retain the order of the original data
  * @param isUsingIndex Is indexing input file before processing is requested
+ * @param isIndexCachingAllowed Is caching of generated index allowed
  * @param inputSplitSizeMB A partition size to target. In certain circumstances this size may not be exactly that, but the library will do the best effort to target that size
  * @param inputSplitRecords The number of records to include in each partition. Notice mainframe records may have variable size, inputSplitMB is the recommended option
  * @param improveLocality Tries to improve locality by extracting preferred locations for variable-length records

@@ -56,6 +57,7 @@ case class VariableLengthParameters(
     fileEndOffset: Int,
     generateRecordId: Boolean,
     isUsingIndex: Boolean,
+    isIndexCachingAllowed: Boolean,
     inputSplitRecords: Option[Int],
     inputSplitSizeMB: Option[Int],
     improveLocality: Boolean,

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/SparkCobolProcessor.scala

Lines changed: 1 addition & 1 deletion

@@ -218,7 +218,7 @@ object SparkCobolProcessor {
       case reader: VarLenReader if reader.isIndexGenerationNeeded && allowIndexes =>
         val orderedFiles = CobolRelation.getListFilesWithOrder(listOfFiles, spark.sqlContext, isRecursiveRetrieval = false)
         val filesMap = orderedFiles.map(fileWithOrder => (fileWithOrder.order, fileWithOrder.filePath)).toMap
-        val indexes: RDD[SparseIndexEntry] = IndexBuilder.buildIndex(orderedFiles, cobolReader, spark.sqlContext)(LocalityParameters(improveLocality = false, optimizeAllocation = false))
+        val indexes: RDD[SparseIndexEntry] = IndexBuilder.buildIndex(orderedFiles, cobolReader, spark.sqlContext, readerParameters.isIndexCachingAllowed)(LocalityParameters(improveLocality = false, optimizeAllocation = false))
 
         indexes.flatMap(indexEntry => {
           val filePathName = filesMap(indexEntry.fileId)

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/CobolRelation.scala

Lines changed: 4 additions & 3 deletions

@@ -65,15 +65,16 @@ class SerializableConfiguration(@transient var value: Configuration) extends Ser
 class CobolRelation(sourceDirs: Seq[String],
                     cobolReader: Reader,
                     localityParams: LocalityParameters,
-                    debugIgnoreFileSize: Boolean
-                   )(@transient val sqlContext: SQLContext)
+                    debugIgnoreFileSize: Boolean,
+                    indexCachingAllowed: Boolean)
+                   (@transient val sqlContext: SQLContext)
   extends BaseRelation
     with Serializable
     with TableScan {
 
   private val filesList = CobolRelation.getListFilesWithOrder(sourceDirs, sqlContext, isRecursiveRetrieval)
 
-  private lazy val indexes: RDD[SparseIndexEntry] = IndexBuilder.buildIndex(filesList, cobolReader, sqlContext)(localityParams)
+  private lazy val indexes: RDD[SparseIndexEntry] = IndexBuilder.buildIndex(filesList, cobolReader, sqlContext, indexCachingAllowed)(localityParams)
 
   override def schema: StructType = {
     cobolReader.getSparkSchema

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/DefaultSource.scala

Lines changed: 6 additions & 1 deletion

@@ -58,11 +58,16 @@ class DefaultSource
 
     val cobolParameters = CobolParametersParser.parse(new Parameters(parameters))
     CobolParametersValidator.checkSanity(cobolParameters)
+    val indexCachingAllowed = cobolParameters.variableLengthParams match {
+      case Some(varLenParams) => varLenParams.isIndexCachingAllowed
+      case None => false
+    }
 
     new CobolRelation(cobolParameters.sourcePaths,
       buildEitherReader(sqlContext.sparkSession, cobolParameters),
       LocalityParameters.extract(cobolParameters),
-      cobolParameters.debugIgnoreFileSize)(sqlContext)
+      cobolParameters.debugIgnoreFileSize,
+      indexCachingAllowed)(sqlContext)
   }
 
   /** Writer relation */

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala

Lines changed: 63 additions & 13 deletions

@@ -34,6 +34,7 @@ import za.co.absa.cobrix.spark.cobol.source.streaming.FileStreamer
 import za.co.absa.cobrix.spark.cobol.source.types.FileWithOrder
 import za.co.absa.cobrix.spark.cobol.utils.{HDFSUtils, SparkUtils}
 
+import java.util.concurrent.ConcurrentHashMap
 import scala.collection.mutable.ArrayBuffer
 
 /**

@@ -45,18 +46,24 @@ import scala.collection.mutable.ArrayBuffer
  * In a nutshell, ideally, there will be as many partitions as are there are indexes.
  */
 private[cobol] object IndexBuilder extends Logging {
+  private val indexCache = new ConcurrentHashMap[String, Array[SparseIndexEntry]]()
+
   def buildIndex(filesList: Array[FileWithOrder],
                  cobolReader: Reader,
-                 sqlContext: SQLContext)
+                 sqlContext: SQLContext,
+                 cachingAllowed: Boolean)
                 (localityParams: LocalityParameters): RDD[SparseIndexEntry] = {
     val fs = new Path(filesList.head.filePath).getFileSystem(sqlContext.sparkSession.sparkContext.hadoopConfiguration)
 
     cobolReader match {
       case reader: VarLenReader if reader.isIndexGenerationNeeded && localityParams.improveLocality && isDataLocalitySupported(fs) =>
+        logger.info("Building indexes with data locality...")
         buildIndexForVarLenReaderWithFullLocality(filesList, reader, sqlContext, localityParams.optimizeAllocation)
       case reader: VarLenReader =>
-        buildIndexForVarLenReader(filesList, reader, sqlContext)
+        logger.info("Building indexes for variable record length input files...")
+        buildIndexForVarLenReader(filesList, reader, sqlContext, cachingAllowed)
       case _ =>
+        logger.info("Generating indexes for full files...")
         buildIndexForFullFiles(filesList, sqlContext)
     }
   }

@@ -112,24 +119,58 @@
     */
   private[cobol] def buildIndexForVarLenReader(filesList: Array[FileWithOrder],
                                                reader: VarLenReader,
-                                               sqlContext: SQLContext): RDD[SparseIndexEntry] = {
+                                               sqlContext: SQLContext,
+                                               cachingAllowed: Boolean): RDD[SparseIndexEntry] = {
     val conf = sqlContext.sparkContext.hadoopConfiguration
     val sconf = new SerializableConfiguration(conf)
 
-    if (reader.getReaderProperties.enableSelfChecks && filesList.nonEmpty) {
-      selfCheckForIndexCompatibility(reader, filesList.head.filePath, conf)
+    // Splitting between files for which indexes are cached and the list of files for which indexes are not cached
+    val cachedFiles = if (cachingAllowed) {
+      filesList.filter(f => indexCache.containsKey(f.filePath))
+    } else {
+      Array.empty[FileWithOrder]
     }
 
-    val filesRDD = sqlContext.sparkContext.parallelize(filesList, filesList.length)
+    val nonCachedFiles = filesList.diff(cachedFiles)
 
-    val indexRDD = filesRDD.mapPartitions(
-      partition => {
-        partition.flatMap(row => {
-          generateIndexEntry(row, sconf.value, reader)
-        })
-      }).cache()
+    // Getting indexes for files for which indexes are not in the cache
+    val newIndexes = if (nonCachedFiles.length > 0) {
+      if (reader.getReaderProperties.enableSelfChecks) {
+        selfCheckForIndexCompatibility(reader, nonCachedFiles.head.filePath, conf)
+      }
 
-    repartitionIndexes(indexRDD)
+      val filesRDD = sqlContext.sparkContext.parallelize(nonCachedFiles, nonCachedFiles.length)
+      filesRDD.mapPartitions(
+        partition => {
+          partition.flatMap(row => {
+            generateIndexEntry(row, sconf.value, reader)
+          })
+        }).collect()
+    } else {
+      Array.empty[SparseIndexEntry]
+    }
+
+    // Storing new indexes in the cache
+    if (cachingAllowed && newIndexes.length > 0) {
+      newIndexes.groupBy(_.fileId).foreach { case (fileId, indexEntries) =>
+        val filePathOpt = filesList.find(_.order == fileId).map(_.filePath)
+
+        filePathOpt.foreach { filePath =>
+          logger.info(s"Index stored to cache for file: $filePath.")
+          indexCache.put(filePath, indexEntries.sortBy(_.offsetFrom))
+        }
+      }
+    }
+
+    // Getting indexes for files for which indexes are in the cache
+    val cachedIndexes = cachedFiles.flatMap { f =>
+      logger.info("Index fetched from cache for file: " + f.filePath)
+      indexCache.get(f.filePath)
+        .map(ind => ind.copy(fileId = f.order))
+    }
+
+    // Creating the final RDD with all indexes
+    createIndexRDD(cachedIndexes ++ newIndexes, sqlContext)
   }
 
   /**

@@ -336,4 +377,13 @@
     logger.info(s"Index elements count: $indexCount, number of partitions = $numPartitions")
     indexRDD.repartition(numPartitions).cache()
   }
+
+  private def createIndexRDD(indexes: Array[SparseIndexEntry], sqlContext: SQLContext): RDD[SparseIndexEntry] = {
+    val indexCount = indexes.length
+
+    val numPartitions = Math.min(indexCount, Constants.maxNumPartitions)
+    logger.info(s"Index elements count: ${indexes.length}, number of partitions = $numPartitions")
+
+    sqlContext.sparkContext.parallelize(indexes, numPartitions)
+  }
 }
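
For context, the cache introduced above is a ConcurrentHashMap held inside the IndexBuilder object, keyed by file path, so it lives in the driver JVM and the saving applies when the same files are indexed again within the same Spark application. A simplified, standalone sketch of that pattern follows; the names (IndexEntry, indexFor) are illustrative and are not the commit's API:

import java.util.concurrent.ConcurrentHashMap

// Illustrative stand-in for SparseIndexEntry: an offset range within one file.
final case class IndexEntry(fileId: Int, offsetFrom: Long, offsetTo: Long)

object IndexCacheSketch {
  // Thread-safe map shared by all threads of the same JVM.
  private val cache = new ConcurrentHashMap[String, Array[IndexEntry]]()

  // Returns cached entries when present; otherwise builds them and stores the result.
  def indexFor(filePath: String, build: String => Array[IndexEntry]): Array[IndexEntry] = {
    val cached = cache.get(filePath)
    if (cached != null) {
      cached
    } else {
      val fresh = build(filePath).sortBy(_.offsetFrom)
      cache.put(filePath, fresh)
      fresh
    }
  }
}

Because the map is process-local, separate Spark applications or executors do not share it; only repeated reads within one application benefit.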

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/streaming/FileStreamer.scala

Lines changed: 2 additions & 1 deletion

@@ -83,6 +83,7 @@ class FileStreamer(filePath: String, fileSystem: FileSystem, startOffset: Long =
     if (numberOfBytes <= 0) {
       new Array[Byte](0)
     } else if (actualBytesToRead <=0 || bufferedStream == null || bufferedStream.isClosed) {
+      logger.info(s"End of stream reached: Requested $numberOfBytes bytes, reached offset $byteIndex.")
       close()
       new Array[Byte](0)
     } else {

@@ -97,7 +98,7 @@
       if (readBytes == numberOfBytes) {
         buffer
       } else {
-        logger.warn(s"End of stream reached: Requested $numberOfBytes bytes, received $readBytes.")
+        logger.info(s"End of stream reached: Requested $numberOfBytes bytes, received $readBytes.")
         close()
         if (readBytes == actualBytesToRead) {
           buffer

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/CobolRelationSpec.scala

Lines changed: 6 additions & 3 deletions

@@ -64,7 +64,8 @@ class CobolRelationSpec extends SparkCobolTestBase with Serializable {
     val relation = new CobolRelation(Seq(copybookFile.getParentFile.getAbsolutePath),
       testReader,
       localityParams = localityParams,
-      debugIgnoreFileSize = false)(sqlContext)
+      debugIgnoreFileSize = false,
+      indexCachingAllowed = false)(sqlContext)
     val cobolData: RDD[Row] = relation.parseRecords(testReader, oneRowRDD)
 
     val cobolDataFrame = sqlContext.createDataFrame(cobolData, sparkSchema)

@@ -88,7 +89,8 @@ class CobolRelationSpec extends SparkCobolTestBase with Serializable {
     val relation = new CobolRelation(Seq(copybookFile.getParentFile.getAbsolutePath),
       testReader,
       localityParams = localityParams,
-      debugIgnoreFileSize = false)(sqlContext)
+      debugIgnoreFileSize = false,
+      indexCachingAllowed = false)(sqlContext)
 
     val caught = intercept[Exception] {
       relation.parseRecords(testReader, oneRowRDD).collect()

@@ -103,7 +105,8 @@ class CobolRelationSpec extends SparkCobolTestBase with Serializable {
     val relation = new CobolRelation(Seq(copybookFile.getParentFile.getAbsolutePath),
       testReader,
       localityParams = localityParams,
-      debugIgnoreFileSize = false)(sqlContext)
+      debugIgnoreFileSize = false,
+      indexCachingAllowed = false)(sqlContext)
 
     val caught = intercept[SparkException] {
       relation.parseRecords(testReader, oneRowRDD).collect()

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilderSpec.scala

Lines changed: 5 additions & 5 deletions

@@ -64,7 +64,7 @@ class IndexBuilderSpec extends AnyWordSpec with BinaryFileFixture with SparkTest
 
     val localityParameters = LocalityParameters(improveLocality = true, optimizeAllocation = true)
 
-    val index = IndexBuilder.buildIndex(filesWithOrder, reader, spark.sqlContext)(localityParameters).collect()
+    val index = IndexBuilder.buildIndex(filesWithOrder, reader, spark.sqlContext, cachingAllowed = false)(localityParameters).collect()
 
     assert(index.length == 3)
   }

@@ -86,7 +86,7 @@ class IndexBuilderSpec extends AnyWordSpec with BinaryFileFixture with SparkTest
 
     val localityParameters = LocalityParameters(improveLocality = false, optimizeAllocation = false)
 
-    val index = IndexBuilder.buildIndex(filesWithOrder, reader, spark.sqlContext)(localityParameters).collect()
+    val index = IndexBuilder.buildIndex(filesWithOrder, reader, spark.sqlContext, cachingAllowed = false)(localityParameters).collect()
 
     assert(index.length == 3)
   }

@@ -104,7 +104,7 @@ class IndexBuilderSpec extends AnyWordSpec with BinaryFileFixture with SparkTest
 
     val localityParameters = LocalityParameters(improveLocality = false, optimizeAllocation = false)
 
-    val index = IndexBuilder.buildIndex(filesWithOrder, reader, spark.sqlContext)(localityParameters).collect()
+    val index = IndexBuilder.buildIndex(filesWithOrder, reader, spark.sqlContext, cachingAllowed = false)(localityParameters).collect()
 
     assert(index.length == 1)
   }

@@ -168,7 +168,7 @@ class IndexBuilderSpec extends AnyWordSpec with BinaryFileFixture with SparkTest
 
     val reader = new VarLenNestedReader(Seq(copybook), readerParameters)
 
-    val index = IndexBuilder.buildIndexForVarLenReader(filesWithOrder, reader, spark.sqlContext).collect()
+    val index = IndexBuilder.buildIndexForVarLenReader(filesWithOrder, reader, spark.sqlContext, cachingAllowed = false).collect()
 
     assert(index.length == 3)
   }

@@ -188,7 +188,7 @@ class IndexBuilderSpec extends AnyWordSpec with BinaryFileFixture with SparkTest
 
     val reader = new VarLenNestedReader(Seq(copybook), readerParameters)
 
-    val index = IndexBuilder.buildIndexForVarLenReader(filesWithOrder, reader, spark.sqlContext).collect()
+    val index = IndexBuilder.buildIndexForVarLenReader(filesWithOrder, reader, spark.sqlContext, cachingAllowed = false).collect()
 
     assert(index.length == 2)
   }
