Commit 0dc8c2d

#809 Add support for indexes in compressed files.
1 parent 69bdafb commit 0dc8c2d

File tree

9 files changed, +103 -74 lines


README.md

Lines changed: 2 additions & 2 deletions

@@ -39,7 +39,7 @@ Among the motivations for this project, it is possible to highlight:
 
 - The COBOL copybooks parser doesn't have a Spark dependency and can be reused for integrating into other data processing engines.
 
-- Supports reading files compressed in Hadoop-compatible way (gzip, bzip2, etc), but with limited parallelism (only per-file parallelism).
+- Supports reading files compressed in Hadoop-compatible way (gzip, bzip2, etc), but with limited parallelism.
   Uncompressed files are preferred for performance.
 
 ## Videos
@@ -1605,7 +1605,7 @@ The output looks like this:
 | .option("redefine-segment-id-map:0", "REDEFINED_FIELD1 => SegmentId1,SegmentId2,...") | Specifies a mapping between redefined field names and segment id values. Each option specifies a mapping for a single segment. The numeric value for each mapping option must be incremented so the option keys are unique. |
 | .option("segment-children:0", "COMPANY => EMPLOYEE,DEPARTMENT") | Specifies a mapping between segment redefined fields and their children. Each option specifies a mapping for a single parent field. The numeric value for each mapping option must be incremented so the option keys are unique. If such a mapping is specified, the hierarchical record structure is reconstructed automatically. This requires `redefine-segment-id-map` to be set. |
 | .option("enable_indexes", "true") | Turns on indexing of multisegment variable length files (on by default). |
-| .option("enable_index_cache", "false") | When true, calculated indexes are cached in memory for later use. This improves performance when the same files are processed more than once. |
+| .option("enable_index_cache", "true") | When true (default), calculated indexes are cached in memory for later use. This improves performance when the same files are processed more than once. |
 | .option("input_split_records", 50000) | Specifies how many records will be allocated to each split/partition. Each split is processed by a Spark task. (The default is not set and the split will happen according to size, see the next option) |
 | .option("input_split_size_mb", 100) | Specifies how many megabytes to allocate to each partition/split. (The default is 100 MB) |
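For reference, a minimal Spark usage sketch of the options touched by this change. The copybook and data paths are hypothetical, the file is assumed to be a gzipped variable-length (RDW) file, and `input_split_size_compressed_mb` is the new option introduced further down in this commit:

    // A sketch, not taken from the repository: reading a compressed variable-length EBCDIC file.
    val df = spark.read
      .format("cobol")
      .option("copybook", "/path/to/copybook.cpy")        // hypothetical copybook location
      .option("record_format", "V")                       // variable-length records with RDW headers
      .option("enable_indexes", "true")                   // on by default
      .option("enable_index_cache", "true")               // cache calculated indexes for repeated reads
      .option("input_split_size_compressed_mb", "1024")   // split size applied to compressed inputs
      .load("/path/to/data.dat.gz")                       // hypothetical compressed input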

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/VarLenNestedReader.scala

Lines changed: 13 additions & 6 deletions

@@ -144,7 +144,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
                             fileNumber: Int,
                             isRdwBigEndian: Boolean): ArrayBuffer[SparseIndexEntry] = {
     val inputSplitSizeRecords: Option[Int] = readerProperties.inputSplitRecords
-    val inputSplitSizeMB: Option[Int] = getSplitSizeMB
+    val inputSplitSizeMB: Option[Int] = getSplitSizeMB(dataStream.isCompressed)
 
     if (inputSplitSizeRecords.isDefined) {
       if (inputSplitSizeRecords.get < 1 || inputSplitSizeRecords.get > 1000000000) {
@@ -153,7 +153,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
      logger.info(s"Input split size = ${inputSplitSizeRecords.get} records")
    } else {
      if (inputSplitSizeMB.nonEmpty) {
-        if (inputSplitSizeMB.get < 1 || inputSplitSizeMB.get > 2000) {
+        if (inputSplitSizeMB.get < 1 || inputSplitSizeMB.get > 200000) {
          throw new IllegalArgumentException(s"Invalid input split size of ${inputSplitSizeMB.get} MB.")
        }
        logger.info(s"Input split size = ${inputSplitSizeMB.get} MB")
@@ -214,11 +214,18 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
    }
  }
 
-  private def getSplitSizeMB: Option[Int] = {
-    if (readerProperties.inputSplitSizeMB.isDefined) {
-      readerProperties.inputSplitSizeMB
+  private def getSplitSizeMB(isCompressed: Boolean): Option[Int] = {
+    if (isCompressed) {
+      readerProperties.inputSplitSizeCompressedMB match {
+        case Some(size) => readerProperties.inputSplitSizeCompressedMB
+        case None       => Some(1024)
+      }
    } else {
-      readerProperties.hdfsDefaultBlockSize
+      if (readerProperties.inputSplitSizeMB.isDefined) {
+        readerProperties.inputSplitSizeMB
+      } else {
+        readerProperties.hdfsDefaultBlockSize
+      }
    }
  }
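In short, the split size is now resolved differently for compressed and uncompressed streams, and the allowed upper bound was raised from 2000 MB to 200000 MB. A standalone sketch of the resolution logic introduced above; the function and parameter names here are illustrative, not part of the reader's API:

    // Mirrors getSplitSizeMB: compressed streams use the compressed split size or fall back to
    // 1024 MB; uncompressed streams use input_split_size_mb or the HDFS default block size.
    def resolveSplitSizeMb(isCompressed: Boolean,
                           splitSizeMb: Option[Int],
                           splitSizeCompressedMb: Option[Int],
                           hdfsBlockSizeMb: Option[Int]): Option[Int] =
      if (isCompressed) splitSizeCompressedMb.orElse(Some(1024))
      else splitSizeMb.orElse(hdfsBlockSizeMb)

    resolveSplitSizeMb(isCompressed = true,  Some(100), None,      Some(128)) // Some(1024): MB setting is ignored for compressed input
    resolveSplitSizeMb(isCompressed = true,  None,      Some(512), Some(128)) // Some(512)
    resolveSplitSizeMb(isCompressed = false, Some(100), None,      Some(128)) // Some(100)
    resolveSplitSizeMb(isCompressed = false, None,      None,      Some(128)) // Some(128)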

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala

Lines changed: 11 additions & 7 deletions

@@ -118,13 +118,14 @@ object CobolParametersParser extends Logging {
   val PARAM_SEGMENT_REDEFINE_PREFIX_ALT = "redefine-segment-id-map"
 
   // Indexed multisegment file processing
-  val PARAM_ENABLE_INDEXES               = "enable_indexes"
-  val PARAM_ENABLE_INDEX_CACHE           = "enable_index_cache"
-  val PARAM_INPUT_SPLIT_RECORDS          = "input_split_records"
-  val PARAM_INPUT_SPLIT_SIZE_MB          = "input_split_size_mb"
-  val PARAM_SEGMENT_ID_PREFIX            = "segment_id_prefix"
-  val PARAM_OPTIMIZE_ALLOCATION          = "optimize_allocation"
-  val PARAM_IMPROVE_LOCALITY             = "improve_locality"
+  val PARAM_ENABLE_INDEXES                 = "enable_indexes"
+  val PARAM_ENABLE_INDEX_CACHE             = "enable_index_cache"
+  val PARAM_INPUT_SPLIT_RECORDS            = "input_split_records"
+  val PARAM_INPUT_SPLIT_SIZE_MB            = "input_split_size_mb"
+  val PARAM_INPUT_SPLIT_SIZE_COMPRESSED_MB = "input_split_size_compressed_mb"
+  val PARAM_SEGMENT_ID_PREFIX              = "segment_id_prefix"
+  val PARAM_OPTIMIZE_ALLOCATION            = "optimize_allocation"
+  val PARAM_IMPROVE_LOCALITY               = "improve_locality"
 
   // Parameters for debugging
   val PARAM_DEBUG_LAYOUT_POSITIONS = "debug_layout_positions"
@@ -385,6 +386,7 @@
       isIndexCachingAllowed = false,
       inputSplitRecords = None,
       inputSplitSizeMB = None,
+      inputSplitSizeCompressedMB = None,
       improveLocality = false,
       optimizeAllocation = false,
       inputFileNameColumn = "",
@@ -421,6 +423,7 @@
       isIndexCachingAllowed = varLenParams.isIndexCachingAllowed,
       inputSplitRecords = varLenParams.inputSplitRecords,
       inputSplitSizeMB = varLenParams.inputSplitSizeMB,
+      inputSplitSizeCompressedMB = varLenParams.inputSplitSizeCompressedMB,
       hdfsDefaultBlockSize = defaultBlockSize,
       startOffset = parameters.recordStartOffset,
       endOffset = parameters.recordEndOffset,
@@ -508,6 +511,7 @@
       params.getOrElse(PARAM_ENABLE_INDEX_CACHE, "false").toBoolean,
       params.get(PARAM_INPUT_SPLIT_RECORDS).map(v => v.toInt),
       params.get(PARAM_INPUT_SPLIT_SIZE_MB).map(v => v.toInt),
+      params.get(PARAM_INPUT_SPLIT_SIZE_COMPRESSED_MB).map(v => v.toInt),
       params.getOrElse(PARAM_IMPROVE_LOCALITY, "true").toBoolean,
       params.getOrElse(PARAM_OPTIMIZE_ALLOCATION, "false").toBoolean,
       params.getOrElse(PARAM_INPUT_FILE_COLUMN, ""),
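The new key follows the same `params.get(...).map(_.toInt)` pattern as the other split options, so an absent option simply yields `None` and the 1024 MB fallback from the reader applies. An illustrative snippet, using a plain `Map[String, String]` in place of the real options container:

    // Illustrative only: how the new option resolves from a plain options map.
    val params = Map("input_split_size_compressed_mb" -> "2048")

    val withValue: Option[Int] = params.get("input_split_size_compressed_mb").map(_.toInt)                    // Some(2048)
    val absent: Option[Int]    = Map.empty[String, String].get("input_split_size_compressed_mb").map(_.toInt) // None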

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala

Lines changed: 3 additions & 1 deletion

@@ -20,7 +20,7 @@ import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat
 import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPointFormat
 import za.co.absa.cobrix.cobol.parser.policies.DebugFieldsPolicy.DebugFieldsPolicy
 import za.co.absa.cobrix.cobol.parser.policies.StringTrimmingPolicy.StringTrimmingPolicy
-import za.co.absa.cobrix.cobol.parser.policies.{CommentPolicy, DebugFieldsPolicy, FillerNamingPolicy, MetadataPolicy, StringTrimmingPolicy}
+import za.co.absa.cobrix.cobol.parser.policies._
 import za.co.absa.cobrix.cobol.parser.recordformats.RecordFormat
 import za.co.absa.cobrix.cobol.parser.recordformats.RecordFormat.FixedLength
 import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy
@@ -50,6 +50,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten
   * @param isIndexGenerationNeeded    Is indexing input file before processing is requested
   * @param inputSplitRecords          The number of records to include in each partition. Notice mainframe records may have variable size, inputSplitMB is the recommended option
   * @param inputSplitSizeMB           A partition size to target. In certain circumstances this size may not be exactly that, but the library will do the best effort to target that size
+  * @param inputSplitSizeCompressedMB A partition size to target for compressed files.
   * @param hdfsDefaultBlockSize       Default HDFS block size for the HDFS filesystem used. This value is used as the default split size if inputSplitSizeMB is not specified
   * @param startOffset                An offset to the start of the record in each binary data block.
   * @param endOffset                  An offset from the end of the record to the end of the binary data block.
@@ -102,6 +103,7 @@ case class ReaderParameters(
                             isIndexCachingAllowed: Boolean = false,
                             inputSplitRecords: Option[Int] = None,
                             inputSplitSizeMB: Option[Int] = None,
+                            inputSplitSizeCompressedMB: Option[Int] = None,
                             hdfsDefaultBlockSize: Option[Int] = None,
                             startOffset: Int = 0,
                             endOffset: Int = 0,
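A minimal construction sketch, assuming the remaining `ReaderParameters` fields keep default values the way the fields visible above do; only the split-related fields are set explicitly here:

    // A sketch: the compressed split size sits alongside the existing split settings.
    val readerParams = ReaderParameters(
      isIndexGenerationNeeded    = true,
      inputSplitSizeMB           = Some(100),  // targeted partition size for uncompressed files
      inputSplitSizeCompressedMB = Some(512),  // targeted partition size for compressed files
      hdfsDefaultBlockSize       = Some(128)   // fallback when inputSplitSizeMB is not set
    )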

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/VariableLengthParameters.scala

Lines changed: 46 additions & 44 deletions

@@ -17,51 +17,53 @@
 package za.co.absa.cobrix.cobol.reader.parameters
 
 /**
-  * This class holds the parameters currently used for parsing variable-length records.
+  * This class is used to hold the parameters currently used for parsing variable-length records.
   *
-  * @param isRecordSequence      Does input files have 4 byte record length headers
-  * @param bdw                   Block descriptor word (if specified), for FB and VB record formats
-  * @param isRdwBigEndian        Is RDW big endian? It may depend on flavor of mainframe and/or mainframe to PC transfer method
-  * @param isRdwPartRecLength    Does RDW count itself as part of record length itself
-  * @param rdwAdjustment         Controls a mismatch between RDW and record length
-  * @param recordHeaderParser    An optional custom record header parser for non-standard RDWs
-  * @param recordExtractor       An optional custom raw record parser class non-standard record types
-  * @param rhpAdditionalInfo     An optional additional option string passed to a custom record header parser
-  * @param reAdditionalInfo      An optional additional option string passed to a custom record extractor
-  * @param recordLengthField     A field that stores record length
-  * @param recordLengthMap       A mapping between field value and record size.
-  * @param fileStartOffset       A number of bytes to skip at the beginning of each file
-  * @param fileEndOffset         A number of bytes to skip at the end of each file
-  * @param generateRecordId      Generate a sequential record number for each record to be able to retain the order of the original data
-  * @param isUsingIndex          Is indexing input file before processing is requested
-  * @param isIndexCachingAllowed Is caching of generated index allowed
-  * @param inputSplitSizeMB      A partition size to target. In certain circumstances this size may not be exactly that, but the library will do the best effort to target that size
-  * @param inputSplitRecords     The number of records to include in each partition. Notice mainframe records may have variable size, inputSplitMB is the recommended option
-  * @param improveLocality       Tries to improve locality by extracting preferred locations for variable-length records
-  * @param optimizeAllocation    Optimizes cluster usage in case of optimization for locality in the presence of new nodes (nodes that do not contain any blocks of the files being processed)
-  * @param inputFileNameColumn   A column name to add to the dataframe. The column will contain input file name for each record similar to 'input_file_name()' function
+  * @param isRecordSequence           Do input files have 4 byte record length headers
+  * @param bdw                        Block descriptor word (if specified), for FB and VB record formats
+  * @param isRdwBigEndian             Is RDW big endian? It may depend on flavor of mainframe and/or mainframe to PC transfer method
+  * @param isRdwPartRecLength         Does RDW count itself as part of record length itself
+  * @param rdwAdjustment              Controls a mismatch between RDW and record length
+  * @param recordHeaderParser         An optional custom record header parser for non-standard RDWs
+  * @param recordExtractor            An optional custom raw record parser class for non-standard record types
+  * @param rhpAdditionalInfo          An optional additional option string passed to a custom record header parser
+  * @param reAdditionalInfo           An optional additional option string passed to a custom record extractor
+  * @param recordLengthField          A field that stores record length
+  * @param recordLengthMap            A mapping between field value and record size.
+  * @param fileStartOffset            A number of bytes to skip at the beginning of each file
+  * @param fileEndOffset              A number of bytes to skip at the end of each file
+  * @param generateRecordId           Generate a sequential record number for each record to be able to retain the order of the original data
+  * @param isUsingIndex               Is indexing input file before processing is requested
+  * @param isIndexCachingAllowed      Is caching of generated index allowed
+  * @param inputSplitSizeMB           A partition size to target. In certain circumstances this size may not be exactly that, but the library will do the best effort to target that size
+  * @param inputSplitSizeCompressedMB A partition size to target for compressed files.
+  * @param inputSplitRecords          The number of records to include in each partition. Notice mainframe records may have variable size, inputSplitMB is the recommended option
+  * @param improveLocality            Tries to improve locality by extracting preferred locations for variable-length records
+  * @param optimizeAllocation         Optimizes cluster usage in case of optimization for locality in the presence of new nodes (nodes that do not contain any blocks of the files being processed)
+  * @param inputFileNameColumn        A column name to add to the dataframe. The column will contain input file name for each record similar to 'input_file_name()' function
   */
 case class VariableLengthParameters(
-                                     isRecordSequence: Boolean, // [deprecated by recordFormat]
-                                     bdw: Option[Bdw],
-                                     isRdwBigEndian: Boolean,
-                                     isRdwPartRecLength: Boolean,
-                                     rdwAdjustment: Int,
-                                     recordHeaderParser: Option[String],
-                                     recordExtractor: Option[String],
-                                     rhpAdditionalInfo: Option[String],
-                                     reAdditionalInfo: String,
-                                     recordLengthField: String,
-                                     recordLengthMap: Map[String, Int],
-                                     fileStartOffset: Int,
-                                     fileEndOffset: Int,
-                                     generateRecordId: Boolean,
-                                     isUsingIndex: Boolean,
-                                     isIndexCachingAllowed: Boolean,
-                                     inputSplitRecords: Option[Int],
-                                     inputSplitSizeMB: Option[Int],
-                                     improveLocality: Boolean,
-                                     optimizeAllocation: Boolean,
-                                     inputFileNameColumn: String,
-                                     occursMappings: Map[String, Map[String, Int]]
+                                     isRecordSequence: Boolean, // [deprecated by recordFormat]
+                                     bdw: Option[Bdw],
+                                     isRdwBigEndian: Boolean,
+                                     isRdwPartRecLength: Boolean,
+                                     rdwAdjustment: Int,
+                                     recordHeaderParser: Option[String],
+                                     recordExtractor: Option[String],
+                                     rhpAdditionalInfo: Option[String],
+                                     reAdditionalInfo: String,
+                                     recordLengthField: String,
+                                     recordLengthMap: Map[String, Int],
+                                     fileStartOffset: Int,
+                                     fileEndOffset: Int,
+                                     generateRecordId: Boolean,
+                                     isUsingIndex: Boolean,
+                                     isIndexCachingAllowed: Boolean,
+                                     inputSplitRecords: Option[Int],
+                                     inputSplitSizeMB: Option[Int],
+                                     inputSplitSizeCompressedMB: Option[Int],
+                                     improveLocality: Boolean,
+                                     optimizeAllocation: Boolean,
+                                     inputFileNameColumn: String,
+                                     occursMappings: Map[String, Map[String, Int]]
                                    )

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala

Lines changed: 1 addition & 6 deletions

@@ -205,12 +205,7 @@ private[cobol] object IndexBuilder extends Logging {
 
     val (inputStream, headerStream, maximumBytes) = getStreams(filePath, startOffset, endOffset, config)
     val index = try {
-      if (inputStream.isCompressed) {
-        val element = SparseIndexEntry(0, -1, fileOrder, 0L)
-        ArrayBuffer[SparseIndexEntry](element)
-      } else {
-        reader.generateIndex(inputStream, headerStream, fileOrder, reader.isRdwBigEndian)
-      }
+      reader.generateIndex(inputStream, headerStream, fileOrder, reader.isRdwBigEndian)
    } finally {
      inputStream.close()
      headerStream.close()
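Previously a compressed stream bypassed index generation and produced a single whole-file entry; now `generateIndex` runs for compressed streams as well, so a compressed file can be split into multiple partitions. A sketch of the difference in the produced index; the `SparseIndexEntry` shape follows the removed code above, and the field names and offsets are illustrative assumptions, not the library's actual values:

    import scala.collection.mutable.ArrayBuffer

    // Local stand-in for the real SparseIndexEntry; -1 as the end offset means "until end of file".
    case class SparseIndexEntry(offsetFrom: Long, offsetTo: Long, fileId: Int, recordIndex: Long)

    // Before: one unsplittable entry per compressed file (the removed branch above).
    val before = ArrayBuffer(SparseIndexEntry(0L, -1L, 0, 0L))

    // After: generateIndex can emit many entries per compressed file, each becoming its own partition.
    val after = ArrayBuffer(
      SparseIndexEntry(0L,          99999999L, 0, 0L),
      SparseIndexEntry(99999999L,  199999999L, 0, 1200000L),
      SparseIndexEntry(199999999L,        -1L, 0, 2400000L)
    )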
