
Commit 45a5e01

#805 Make index caching disabled by default for now. Add the info about the option to the README.

1 parent: 0942293

3 files changed: +27 additions, -1 deletion

README.md (1 addition, 0 deletions)

@@ -1602,6 +1602,7 @@ The output looks like this:
  | .option("redefine-segment-id-map:0", "REDEFINED_FIELD1 => SegmentId1,SegmentId2,...") | Specifies a mapping between redefined field names and segment id values. Each option specifies a mapping for a single segment. The numeric value for each mapping option must be incremented so the option keys are unique. |
  | .option("segment-children:0", "COMPANY => EMPLOYEE,DEPARTMENT") | Specifies a mapping between segment redefined fields and their children. Each option specifies a mapping for a single parent field. The numeric value for each mapping option must be incremented so the option keys are unique. If such a mapping is specified, the hierarchical record structure will be reconstructed automatically. This requires `redefine-segment-id-map` to be set. |
  | .option("enable_indexes", "true") | Turns on indexing of multisegment variable length files (on by default). |
+ | .option("enable_index_cache", "false") | When true, calculated indexes are cached in memory for later use. This improves performance when the same files are processed more than once. |
  | .option("input_split_records", 50000) | Specifies how many records will be allocated to each split/partition. It will be processed by Spark tasks. (The default is not set and the split will happen according to size, see the next option.) |
  | .option("input_split_size_mb", 100) | Specifies how many megabytes to allocate to each partition/split. (The default is 100 MB.) |

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala (1 addition, 1 deletion)

@@ -505,7 +505,7 @@ object CobolParametersParser extends Logging {
       fileEndOffset,
       isRecordIdGenerationEnabled,
       params.getOrElse(PARAM_ENABLE_INDEXES, "true").toBoolean,
-      params.getOrElse(PARAM_ENABLE_INDEX_CACHE, "true").toBoolean,
+      params.getOrElse(PARAM_ENABLE_INDEX_CACHE, "false").toBoolean,
       params.get(PARAM_INPUT_SPLIT_RECORDS).map(v => v.toInt),
       params.get(PARAM_INPUT_SPLIT_SIZE_MB).map(v => v.toInt),
       params.getOrElse(PARAM_IMPROVE_LOCALITY, "true").toBoolean,

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test37RecordLengthMappingSpec.scala (25 additions, 0 deletions)

@@ -170,6 +170,31 @@ class Test37RecordLengthMappingSpec extends AnyWordSpec with SparkTestBase with
      }
    }

+    "work for data with offsets and indexes and index cache" in {
+      withTempBinFile("record_length_mapping", ".tmp", dataWithFileOffsets) { tempFile =>
+        val expected = """{"SEG_ID":"A","TEXT":"123"},{"SEG_ID":"B","TEXT":"123456"},{"SEG_ID":"C","TEXT":"1234567"}"""
+
+        val df = spark.read
+          .format("cobol")
+          .option("copybook_contents", copybook)
+          .option("record_format", "F")
+          .option("record_length_field", "SEG-ID")
+          .option("file_start_offset", 1)
+          .option("file_end_offset", 2)
+          .option("input_split_records", "2")
+          .option("enable_index_cache", "true")
+          .option("pedantic", "true")
+          .option("record_length_map", """{"A":4,"B":7,"C":8}""")
+          .load(tempFile)
+
+        val actualInitial = df.orderBy("SEG_ID").toJSON.collect().mkString(",")
+        val actualCached = df.orderBy("SEG_ID").toJSON.collect().mkString(",")
+
+        assert(actualInitial == expected)
+        assert(actualCached == expected)
+      }
+    }
+
    "throw an exception for unknown mapping" in {
      withTempBinFile("record_length_mapping", ".tmp", dataSimple) { tempFile =>
        val df = spark.read
