
Commit 92c67eb

#809 Add support for reading compressed EBCDIC files.
1 parent 7cbf3af · commit 92c67eb

14 files changed: +138 −80 lines changed

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/stream/SimpleStream.scala

Lines changed: 2 additions & 0 deletions

@@ -29,6 +29,8 @@ trait SimpleStream {

   def isEndOfStream: Boolean = offset >= size

+  def isCompressed: Boolean = false
+
   @throws(classOf[Exception])
   def copyStream(): SimpleStream
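
Note: the trait-level default means only streams that actually wrap a decompressing codec report true. A minimal sketch of how callers can branch on the flag (the helper name is hypothetical, not part of this commit):

    import za.co.absa.cobrix.cobol.reader.stream.SimpleStream

    // Sparse indexing needs to seek to arbitrary byte offsets, which a decompressed
    // stream cannot do, so offset-based optimizations can be gated on the new flag.
    def canUseSparseIndex(stream: SimpleStream): Boolean = !stream.isCompressed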

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/SparkCobolProcessor.scala

Lines changed: 4 additions & 4 deletions

@@ -229,8 +229,8 @@ object SparkCobolProcessor {
       val numOfBytesMsg = if (numOfBytes > 0) s"${numOfBytes / Constants.megabyte} MB" else "until the end"

       log.info(s"Going to process offsets ${indexEntry.offsetFrom}...${indexEntry.offsetTo} ($numOfBytesMsg) of $fileName")
-      val dataStream = new FileStreamer(filePathName, fileSystem, indexEntry.offsetFrom, numOfBytes)
-      val headerStream = new FileStreamer(filePathName, fileSystem)
+      val dataStream = new FileStreamer(filePathName, sconf.value, indexEntry.offsetFrom, numOfBytes)
+      val headerStream = new FileStreamer(filePathName, sconf.value)

       CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, dataStream, Some(headerStream))
     })
@@ -240,7 +240,7 @@
       val hadoopConfig = sconf.value
       log.info(s"Going to process data from $inputFile")
       val inputFs = new Path(inputFile).getFileSystem(hadoopConfig)
-      val ifs = new FileStreamer(inputFile, inputFs)
+      val ifs = new FileStreamer(inputFile, sconf.value)

       CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, ifs, None)
     }
@@ -266,7 +266,7 @@
       Future {
         val hadoopConfig = sconf.value
         val inputFs = new Path(inputFIle).getFileSystem(hadoopConfig)
-        val ifs = new FileStreamer(inputFIle, inputFs)
+        val ifs = new FileStreamer(inputFIle, sconf.value)
         val outputFile = new Path(outputPath, fileName)
         val outputFs = outputFile.getFileSystem(hadoopConfig)
         val ofs = new BufferedOutputStream(outputFs.create(outputFile, true))
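
Note: the FileStreamer constructor now takes the Hadoop Configuration (sconf.value) rather than a FileSystem handle. A FileSystem is not serializable, while the configuration wrapped in SerializableConfiguration can be shipped to executors, and it is also what the codec lookup needs. A minimal sketch of the pattern, assuming a SparkContext named sc, a hypothetical input path, and that SerializableConfiguration is importable from the source package:

    import za.co.absa.cobrix.spark.cobol.source.SerializableConfiguration
    import za.co.absa.cobrix.spark.cobol.source.streaming.FileStreamer

    // Driver side: wrap the configuration so it survives task serialization.
    val sconf = new SerializableConfiguration(sc.hadoopConfiguration)

    // Executor side: build the streamer from the configuration; the FileSystem
    // (and any compression codec) is resolved lazily where the task runs.
    val ifs = new FileStreamer("/data/mainframe/input1.dat", sconf.value)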

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/CobolRelation.scala

Lines changed: 15 additions & 16 deletions

@@ -16,22 +16,22 @@

 package za.co.absa.cobrix.spark.cobol.source

-import java.io.{IOException, ObjectInputStream, ObjectOutputStream}
-
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.mapred.FileInputFormat
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.io.compress.CompressionCodecFactory
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.sources.{BaseRelation, TableScan}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{Row, SQLContext}
-import za.co.absa.cobrix.spark.cobol.reader.{FixedLenReader, FixedLenTextReader, Reader, VarLenReader}
 import za.co.absa.cobrix.cobol.reader.index.entry.SparseIndexEntry
+import za.co.absa.cobrix.spark.cobol.reader.{FixedLenReader, FixedLenTextReader, Reader, VarLenReader}
 import za.co.absa.cobrix.spark.cobol.source.index.IndexBuilder
 import za.co.absa.cobrix.spark.cobol.source.parameters.LocalityParameters
 import za.co.absa.cobrix.spark.cobol.source.scanners.CobolScanners
 import za.co.absa.cobrix.spark.cobol.source.types.FileWithOrder
 import za.co.absa.cobrix.spark.cobol.utils.FileUtils

+import java.io.{IOException, ObjectInputStream, ObjectOutputStream}
 import scala.util.control.NonFatal

@@ -63,6 +63,7 @@ class SerializableConfiguration(@transient var value: Configuration) extends Ser
  * Its constructor is expected to change after the hierarchy of [[za.co.absa.cobrix.spark.cobol.reader.Reader]] is put in place.
  */
 class CobolRelation(sourceDirs: Seq[String],
+                    filesList: Array[FileWithOrder],
                     cobolReader: Reader,
                     localityParams: LocalityParameters,
                     debugIgnoreFileSize: Boolean)
@@ -71,8 +72,6 @@ class CobolRelation(sourceDirs: Seq[String],
     with Serializable
     with TableScan {

-  private val filesList = CobolRelation.getListFilesWithOrder(sourceDirs, sqlContext, isRecursiveRetrieval)
-
   private lazy val indexes: RDD[SparseIndexEntry] = IndexBuilder.buildIndex(filesList, cobolReader, sqlContext, cobolReader.getReaderProperties.isIndexCachingAllowed)(localityParams)

   override def schema: StructType = {
@@ -94,15 +93,7 @@ class CobolRelation(sourceDirs: Seq[String],
     }
   }

-  /**
-   * Checks if the recursive file retrieval flag is set
-   */
-  private def isRecursiveRetrieval: Boolean = {
-    val hadoopConf = sqlContext.sparkContext.hadoopConfiguration
-    hadoopConf.getBoolean(FileInputFormat.INPUT_DIR_RECURSIVE, false)
-  }
-
-  private[source] def parseRecords(reader: FixedLenReader, records: RDD[Array[Byte]]) = {
+  private[source] def parseRecords(reader: FixedLenReader, records: RDD[Array[Byte]]): RDD[Row] = {
     records.flatMap(record => {
       val it = reader.getRowIterator(record)
       for (parsedRecord <- it) yield {
@@ -125,8 +116,16 @@ object CobolRelation {
         .getFiles(sourceDir, sqlContext.sparkContext.hadoopConfiguration, isRecursiveRetrieval)
     }).toArray

+    val hadoopConf = sqlContext.sparkContext.hadoopConfiguration
+    val factory = new CompressionCodecFactory(hadoopConf)
+
     allFiles
       .zipWithIndex
-      .map(file => FileWithOrder(file._1, file._2))
+      .map { case (fileName, order) =>
+        val codec = factory.getCodec(new Path(fileName))
+        val isCompressed = codec != null
+
+        FileWithOrder(fileName, order, isCompressed)
+      }
   }
 }
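
Note: getListFilesWithOrder now tags each discovered file with whether a Hadoop compression codec is registered for its suffix. A small standalone sketch of the detection (paths are hypothetical; with default Hadoop settings a .gz suffix resolves to GzipCodec):

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.Path
    import org.apache.hadoop.io.compress.CompressionCodecFactory

    val factory = new CompressionCodecFactory(new Configuration())

    // getCodec matches on the file name suffix and returns null when nothing matches,
    // which is exactly how isCompressed is derived above.
    val plain = factory.getCodec(new Path("/data/account1.dat"))     // null  -> isCompressed = false
    val gzip  = factory.getCodec(new Path("/data/account2.dat.gz"))  // codec -> isCompressed = true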

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/DefaultSource.scala

Lines changed: 21 additions & 3 deletions

@@ -18,6 +18,7 @@ package za.co.absa.cobrix.spark.cobol.source

 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.io.{BytesWritable, NullWritable}
+import org.apache.hadoop.mapred.FileInputFormat
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}
@@ -59,8 +60,17 @@ class DefaultSource
     val cobolParameters = CobolParametersParser.parse(new Parameters(parameters))
     CobolParametersValidator.checkSanity(cobolParameters)

+    val filesList = CobolRelation.getListFilesWithOrder(cobolParameters.sourcePaths, sqlContext, isRecursiveRetrieval(sqlContext))
+
+    val hasCompressedFiles = filesList.exists(_.isCompressed)
+
+    if (hasCompressedFiles) {
+      logger.info(s"Compressed files found. Binary parallelism and indexes won't be used for them.")
+    }
+
     new CobolRelation(cobolParameters.sourcePaths,
-      buildEitherReader(sqlContext.sparkSession, cobolParameters),
+      filesList,
+      buildEitherReader(sqlContext.sparkSession, cobolParameters, hasCompressedFiles),
       LocalityParameters.extract(cobolParameters),
       cobolParameters.debugIgnoreFileSize)(sqlContext)
   }
@@ -126,6 +136,14 @@ class DefaultSource

   //TODO fix with the correct implementation once the correct Reader hierarchy is put in place.
   override def buildReader(spark: SparkSession, parameters: Map[String, String]): FixedLenReader = null
+
+  /**
+   * Checks if the recursive file retrieval flag is set
+   */
+  private def isRecursiveRetrieval(sqlContext: SQLContext): Boolean = {
+    val hadoopConf = sqlContext.sparkContext.hadoopConfiguration
+    hadoopConf.getBoolean(FileInputFormat.INPUT_DIR_RECURSIVE, false)
+  }
 }

 object DefaultSource {
@@ -136,10 +154,10 @@ object DefaultSource {
    *
    * This method will probably be removed once the correct hierarchy for [[FixedLenReader]] is put in place.
    */
-  def buildEitherReader(spark: SparkSession, cobolParameters: CobolParameters): Reader = {
+  def buildEitherReader(spark: SparkSession, cobolParameters: CobolParameters, hasCompressedFiles: Boolean): Reader = {
     val reader = if (cobolParameters.isText && cobolParameters.variableLengthParams.isEmpty) {
       createTextReader(cobolParameters, spark)
-    } else if (cobolParameters.variableLengthParams.isEmpty) {
+    } else if (cobolParameters.variableLengthParams.isEmpty && !hasCompressedFiles) {
       createFixedLengthReader(cobolParameters, spark)
     }
     else {
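
Note: with hasCompressedFiles set, a fixed-length load without variable-length parameters falls through to the variable-length reader path, since fixed-length binary parallelism depends on splitting files at byte offsets. A hedged usage sketch of reading a gzipped EBCDIC file (the copybook and data paths are hypothetical; option names follow the usual Cobrix conventions and may differ by version):

    // The compressed file is detected from its extension; no extra option is needed.
    val df = spark.read
      .format("cobol")
      .option("copybook", "/copybooks/account.cpy")
      .load("/data/account.dat.gz")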

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala

Lines changed: 8 additions & 3 deletions

@@ -205,7 +205,12 @@ private[cobol] object IndexBuilder extends Logging {

     val (inputStream, headerStream, maximumBytes) = getStreams(filePath, startOffset, endOffset, config)
     val index = try {
-      reader.generateIndex(inputStream, headerStream, fileOrder, reader.isRdwBigEndian)
+      if (inputStream.isCompressed) {
+        val element = SparseIndexEntry(0, -1, fileOrder, 0L)
+        ArrayBuffer[SparseIndexEntry](element)
+      } else {
+        reader.generateIndex(inputStream, headerStream, fileOrder, reader.isRdwBigEndian)
+      }
     } finally {
       inputStream.close()
       headerStream.close()
@@ -238,8 +243,8 @@
       bytesToRead
     }

-    val inputStream = new FileStreamer(filePath, fileSystem, startOffset, maximumBytes)
-    val headerStream = new FileStreamer(filePath, fileSystem)
+    val inputStream = new FileStreamer(filePath, config, startOffset, maximumBytes)
+    val headerStream = new FileStreamer(filePath, config)

     (inputStream, headerStream, maximumBytes)
   }
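
Note: for a compressed input the builder emits a single sparse-index entry covering the whole file, so the file is consumed by one task rather than being split. Judging from how the scanners treat it, offsetTo = -1 plays the "read until the end" role (numOfBytes becomes 0 when offsetTo <= 0). A tiny sketch of such an entry (the fileId value is hypothetical):

    import za.co.absa.cobrix.cobol.reader.index.entry.SparseIndexEntry

    // offsetFrom = 0, offsetTo = -1 (until the end), fileId = 3, recordIndex = 0
    val wholeFile = SparseIndexEntry(0, -1, 3, 0L)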

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/scanners/CobolScanners.scala

Lines changed: 4 additions & 5 deletions

@@ -43,14 +43,13 @@ private[source] object CobolScanners extends Logging {
     indexes.flatMap(indexEntry => {
       val filePathName = filesMap(indexEntry.fileId)
       val path = new Path(filePathName)
-      val fileSystem = path.getFileSystem(sconf.value)
       val fileName = path.getName
       val numOfBytes = if (indexEntry.offsetTo > 0L) indexEntry.offsetTo - indexEntry.offsetFrom else 0L
       val numOfBytesMsg = if (numOfBytes > 0) s"${numOfBytes / Constants.megabyte} MB" else "until the end"

       logger.info(s"Going to process offsets ${indexEntry.offsetFrom}...${indexEntry.offsetTo} ($numOfBytesMsg) of $fileName")
-      val dataStream = new FileStreamer(filePathName, fileSystem, indexEntry.offsetFrom, numOfBytes)
-      val headerStream = new FileStreamer(filePathName, fileSystem)
+      val dataStream = new FileStreamer(filePathName, sconf.value, indexEntry.offsetFrom, numOfBytes)
+      val headerStream = new FileStreamer(filePathName, sconf.value)
       reader.getRowIterator(dataStream, headerStream, indexEntry.offsetFrom, indexEntry.fileId, indexEntry.recordIndex)
     })
   }
@@ -75,8 +74,8 @@
       } else {
         fileSystem.getFileStatus(path).getLen - reader.getReaderProperties.fileEndOffset - startFileOffset
       }
-      val dataStream = new FileStreamer(filePath, fileSystem, startFileOffset, maximumFileBytes)
-      val headerStream = new FileStreamer(filePath, fileSystem, startFileOffset)
+      val dataStream = new FileStreamer(filePath, sconf.value, startFileOffset, maximumFileBytes)
+      val headerStream = new FileStreamer(filePath, sconf.value, startFileOffset)
       reader.getRowIterator(dataStream, headerStream, startFileOffset, fileOrder, 0L)
     })
   })

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/streaming/BufferedFSDataInputStream.scala

Lines changed: 28 additions & 7 deletions

@@ -16,21 +16,21 @@

 package za.co.absa.cobrix.spark.cobol.source.streaming

-import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FSDataInputStream, Path}
+import org.apache.hadoop.io.compress.CompressionCodecFactory

-import java.io.IOException
+import java.io.{IOException, InputStream}

-class BufferedFSDataInputStream(filePath: Path, fileSystem: FileSystem, startOffset: Long, bufferSizeInMegabytes: Int, maximumBytes: Long ) {
+class BufferedFSDataInputStream(filePath: Path, hadoopConfig: Configuration, startOffset: Long, bufferSizeInMegabytes: Int, maximumBytes: Long ) {
   val bytesInMegabyte: Int = 1048576
+  private var isCompressedStream = false

   if (bufferSizeInMegabytes <=0 || bufferSizeInMegabytes > 1000) {
     throw new IllegalArgumentException(s"Invalid buffer size $bufferSizeInMegabytes MB.")
   }

-  var in: FSDataInputStream = fileSystem.open(filePath)
-  if (startOffset > 0) {
-    in.seek(startOffset)
-  }
+  private var in: InputStream = openStream()

   private val bufferSizeInBytes = bufferSizeInMegabytes * bytesInMegabyte
   private var isStreamClosed = in == null
@@ -51,6 +51,8 @@ class BufferedFSDataInputStream(filePath: Path, fileSystem: FileSystem, startOff

   def isClosed: Boolean = isStreamClosed && bufferPos >= bufferConitainBytes

+  def isCompressed: Boolean = isCompressedStream
+
   def readFully(b: Array[Byte], off: Int, len: Int): Int =
   {
     if (isClosed) {
@@ -115,4 +117,23 @@ class BufferedFSDataInputStream(filePath: Path, fileSystem: FileSystem, startOff
     }
   }

+  private def openStream(): InputStream = {
+    val fileSystem = filePath.getFileSystem(hadoopConfig)
+    val fsIn: FSDataInputStream = fileSystem.open(filePath)
+
+    if (startOffset > 0) {
+      fsIn.seek(startOffset)
+    }
+
+    val factory = new CompressionCodecFactory(hadoopConfig)
+    val codec = factory.getCodec(filePath)
+
+    if (codec != null) {
+      isCompressedStream = true
+      codec.createInputStream(fsIn)
+    } else {
+      // No compression detected
+      fsIn
+    }
+  }
 }
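
Note: openStream() seeks the raw FSDataInputStream first and only then wraps it in the codec's decompressing stream; because compressed files always receive a single whole-file index entry, the seek is effectively always from offset 0 for them. A standalone sketch of the same detect-and-wrap pattern (path is hypothetical):

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.Path
    import org.apache.hadoop.io.compress.CompressionCodecFactory
    import java.io.InputStream

    val conf = new Configuration()
    val path = new Path("/data/account.dat.gz")
    val fs = path.getFileSystem(conf)

    val raw = fs.open(path)                                       // seekable stream of compressed bytes
    val codec = new CompressionCodecFactory(conf).getCodec(path)
    val in: InputStream = if (codec != null) codec.createInputStream(raw) else raw  // decompressed if a codec matched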

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/streaming/FileStreamer.scala

Lines changed: 13 additions & 7 deletions

@@ -16,11 +16,11 @@

 package za.co.absa.cobrix.spark.cobol.source.streaming

-import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{ContentSummary, Path}
 import org.apache.log4j.Logger
-import za.co.absa.cobrix.cobol.reader.stream.SimpleStream
-import org.apache.hadoop.fs.ContentSummary
 import za.co.absa.cobrix.cobol.reader.common.Constants
+import za.co.absa.cobrix.cobol.reader.stream.SimpleStream

 import java.io.IOException

@@ -33,10 +33,10 @@ import java.io.IOException
  * file be consumed.
  *
  * @param filePath String containing the fully qualified path to the file.
- * @param fileSystem Underlying Hadoop file system.
+ * @param hadoopConfig Hadoop configuration.
  * @note This class is not thread-safe and should only be accessed from a single thread
  */
-class FileStreamer(filePath: String, fileSystem: FileSystem, startOffset: Long = 0L, maximumBytes: Long = 0L) extends SimpleStream {
+class FileStreamer(filePath: String, hadoopConfig: Configuration, startOffset: Long = 0L, maximumBytes: Long = 0L) extends SimpleStream {

   private val logger = Logger.getLogger(FileStreamer.this.getClass)

@@ -59,6 +59,11 @@ class FileStreamer(filePath: String, fileSystem: FileSystem, startOffset: Long =

   override def offset: Long = byteIndex

+  override def isCompressed: Boolean = {
+    ensureOpened()
+    bufferedStream.isCompressed
+  }
+
   /**
    * Retrieves a given number of bytes from the file stream.
    *
@@ -123,18 +128,19 @@ class FileStreamer(filePath: String, fileSystem: FileSystem, startOffset: Long =
   }

   override def copyStream(): SimpleStream = {
-    new FileStreamer(filePath, fileSystem, startOffset, maximumBytes)
+    new FileStreamer(filePath, hadoopConfig, startOffset, maximumBytes)
   }

   @throws[IOException]
   private def ensureOpened(): Unit = {
     if (!wasOpened) {
-      bufferedStream = new BufferedFSDataInputStream(new Path(filePath), fileSystem, startOffset, Constants.defaultStreamBufferInMB, maximumBytes)
+      bufferedStream = new BufferedFSDataInputStream(new Path(filePath), hadoopConfig, startOffset, Constants.defaultStreamBufferInMB, maximumBytes)
       wasOpened = true
     }
   }

   private def getHadoopFileSize(hadoopPath: Path): Long = {
+    val fileSystem = hadoopPath.getFileSystem(hadoopConfig)
     val cSummary: ContentSummary = fileSystem.getContentSummary(hadoopPath)
     cSummary.getLength
   }
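
Note: FileStreamer now resolves the FileSystem from the configuration and opens the file lazily; calling isCompressed triggers ensureOpened(), so the codec check happens on first use rather than at construction time. A minimal usage sketch (path is hypothetical):

    import org.apache.hadoop.conf.Configuration
    import za.co.absa.cobrix.spark.cobol.source.streaming.FileStreamer

    val conf = new Configuration()

    val stream = new FileStreamer("/data/account.dat.gz", conf)
    val compressed = stream.isCompressed   // opens the file and checks for a matching codec
    stream.close()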

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/types/FileWithOrder.scala

Lines changed: 1 addition & 1 deletion

@@ -19,4 +19,4 @@ package za.co.absa.cobrix.spark.cobol.source.types
 /**
  * Represents a file attached to an order.
  */
-private[source] case class FileWithOrder(filePath: String, order: Int)
+private[source] case class FileWithOrder(filePath: String, order: Int, isCompressed: Boolean)
