@@ -18,6 +18,7 @@ package za.co.absa.cobrix.spark.cobol.source.scanners
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
+import org.apache.hadoop.io.compress.CompressionCodecFactory
 import org.apache.hadoop.io.{LongWritable, Text}
 import org.apache.hadoop.mapred.TextInputFormat
 import org.apache.spark.rdd.RDD
@@ -72,7 +73,17 @@ private[source] object CobolScanners extends Logging {
       val maximumFileBytes = if (reader.getReaderProperties.fileEndOffset == 0) {
         0
       } else {
-        fileSystem.getFileStatus(path).getLen - reader.getReaderProperties.fileEndOffset - startFileOffset
+        if (isCompressed(path, sconf.value)) {
+          // ToDo determine if the uncompressed file size can be effectively fetched
+          if (reader.getReaderProperties.fileEndOffset > 0) {
+            logger.warn(s"File end offset for $path is ignored because the file is compressed.")
+          }
+          0L
+        } else {
+          val fileSize = fileSystem.getFileStatus(path).getLen
+
+          fileSize - reader.getReaderProperties.fileEndOffset - startFileOffset
+        }
       }
       val dataStream = new FileStreamer(filePath, sconf.value, startFileOffset, maximumFileBytes)
       val headerStream = new FileStreamer(filePath, sconf.value, startFileOffset)
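
Note on the new `else` branch above: `FileStatus.getLen` reports the on-disk (compressed) size, so subtracting the configured offsets from it would not correspond to a position in the uncompressed stream; the compressed case therefore falls back to `0L`, which, judging from the `fileEndOffset == 0` branch, the streamer treats as "no byte limit". A minimal standalone sketch of the uncompressed-path arithmetic, with hypothetical sizes (not project code):

```scala
// Hypothetical numbers: an uncompressed 1,000-byte file with 100 bytes
// skipped at the start and 50 bytes ignored at the end.
val fileSize        = 1000L // what fileSystem.getFileStatus(path).getLen returns
val startFileOffset = 100L
val fileEndOffset   = 50L

// Bytes the streamer may read: positions 100..949 inclusive, i.e. 850 bytes.
val maximumFileBytes = fileSize - fileEndOffset - startFileOffset
assert(maximumFileBytes == 850L)
```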
@@ -81,6 +92,13 @@ private[source] object CobolScanners extends Logging {
     })
   }
 
+  private[source] def isCompressed(file: Path, hadoopConfig: Configuration): Boolean = {
+    val factory = new CompressionCodecFactory(hadoopConfig)
+    val codec = factory.getCodec(file)
+
+    codec != null
+  }
+
   private[source] def buildScanForFixedLength(reader: FixedLenReader, sourceDirs: Seq[String],
                                               recordParser: (FixedLenReader, RDD[Array[Byte]]) => RDD[Row],
                                               debugIgnoreFileSize: Boolean,
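
For reference, `CompressionCodecFactory` resolves a codec purely from the filename extension, using the codecs registered in the Hadoop configuration (`GzipCodec`, `BZip2Codec`, and `DefaultCodec` are registered out of the box), and `getCodec` returns `null` when nothing matches, hence the `codec != null` check. A self-contained sketch of how the helper behaves; the object name and file paths below are illustrative only:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress.CompressionCodecFactory

// Illustrative demo of the extension-based detection used by isCompressed.
object IsCompressedDemo {
  def isCompressed(file: Path, hadoopConfig: Configuration): Boolean = {
    val factory = new CompressionCodecFactory(hadoopConfig)
    // getCodec returns null when no registered codec matches the extension.
    factory.getCodec(file) != null
  }

  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    println(isCompressed(new Path("/data/records.dat.gz"), conf)) // true  (GzipCodec)
    println(isCompressed(new Path("/data/records.dat"), conf))    // false (no codec matches)
  }
}
```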