
Commit 45a5e01

#805 Make index caching disabled by default for now. Add the info about the option to the README.

1 parent: 0942293

3 files changed: +27 additions, -1 deletion

README.md (1 addition, 0 deletions)

@@ -1602,6 +1602,7 @@ The output looks like this:
  | .option("redefine-segment-id-map:0", "REDEFINED_FIELD1 => SegmentId1,SegmentId2,...") | Specifies a mapping between redefined field names and segment id values. Each option specifies a mapping for a single segment. The numeric value for each mapping option must be incremented so the option keys are unique. |
  | .option("segment-children:0", "COMPANY => EMPLOYEE,DEPARTMENT") | Specifies a mapping between segment redefined fields and their children. Each option specifies a mapping for a single parent field. The numeric value for each mapping option must be incremented so the option keys are unique. If such a mapping is specified, the hierarchical record structure will be reconstructed automatically. This requires `redefine-segment-id-map` to be set. |
  | .option("enable_indexes", "true") | Turns on indexing of multisegment variable length files (on by default). |
+ | .option("enable_index_cache", "false") | When true, calculated indexes are cached in memory for later use. This improves performance when the same files are processed more than once. |
  | .option("input_split_records", 50000) | Specifies how many records will be allocated to each split/partition. It will be processed by Spark tasks. (The default is not set and the split will happen according to size, see the next option.) |
  | .option("input_split_size_mb", 100) | Specifies how many megabytes to allocate to each partition/split. (The default is 100 MB.) |

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala (1 addition, 1 deletion)

@@ -505,7 +505,7 @@ object CobolParametersParser extends Logging {
       fileEndOffset,
       isRecordIdGenerationEnabled,
       params.getOrElse(PARAM_ENABLE_INDEXES, "true").toBoolean,
-      params.getOrElse(PARAM_ENABLE_INDEX_CACHE, "true").toBoolean,
+      params.getOrElse(PARAM_ENABLE_INDEX_CACHE, "false").toBoolean,
       params.get(PARAM_INPUT_SPLIT_RECORDS).map(v => v.toInt),
       params.get(PARAM_INPUT_SPLIT_SIZE_MB).map(v => v.toInt),
       params.getOrElse(PARAM_IMPROVE_LOCALITY, "true").toBoolean,

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test37RecordLengthMappingSpec.scala (25 additions, 0 deletions)

@@ -170,6 +170,31 @@ class Test37RecordLengthMappingSpec extends AnyWordSpec with SparkTestBase with
      }
    }

+    "work for data with offsets and indexes and index cache" in {
+      withTempBinFile("record_length_mapping", ".tmp", dataWithFileOffsets) { tempFile =>
+        val expected = """{"SEG_ID":"A","TEXT":"123"},{"SEG_ID":"B","TEXT":"123456"},{"SEG_ID":"C","TEXT":"1234567"}"""
+
+        val df = spark.read
+          .format("cobol")
+          .option("copybook_contents", copybook)
+          .option("record_format", "F")
+          .option("record_length_field", "SEG-ID")
+          .option("file_start_offset", 1)
+          .option("file_end_offset", 2)
+          .option("input_split_records", "2")
+          .option("enable_index_cache", "true")
+          .option("pedantic", "true")
+          .option("record_length_map", """{"A":4,"B":7,"C":8}""")
+          .load(tempFile)
+
+        val actualInitial = df.orderBy("SEG_ID").toJSON.collect().mkString(",")
+        val actualCached = df.orderBy("SEG_ID").toJSON.collect().mkString(",")
+
+        assert(actualInitial == expected)
+        assert(actualCached == expected)
+      }
+    }
+
    "throw an exception for unknown mapping" in {
      withTempBinFile("record_length_mapping", ".tmp", dataSimple) { tempFile =>
        val df = spark.read
