Commit 6ded134

#809 Add support for file end offset for compressed files.
1 parent 0dc8c2d commit 6ded134

File tree: 5 files changed, +71 -24 lines changed
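For orientation, the file_end_offset option tells spark-cobol to ignore a fixed number of bytes at the end of each input file; before this change the option could not be honored for compressed inputs. A minimal usage sketch in Scala (the copybook and data paths are hypothetical; the offset value mirrors the new test added below):

// Hypothetical paths; with this change file_end_offset also applies to compressed (e.g. gzip) files.
val df = spark
  .read
  .format("cobol")
  .option("copybook", "/path/to/copybook.cob")
  .option("file_end_offset", 1493)   // bytes to drop at the end of each file
  .load("/path/to/ebcdic_data")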

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/DefaultSource.scala

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ class DefaultSource
     val hasCompressedFiles = filesList.exists(_.isCompressed)

     if (hasCompressedFiles) {
-      logger.info(s"Compressed files found. Binary parallelism and indexes won't be used for them.")
+      logger.info(s"Compressed files found. Binary parallelism and indexes will be adjusted accordingly.")
     }

     new CobolRelation(cobolParameters.sourcePaths,

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala

Lines changed: 8 additions & 2 deletions
@@ -32,7 +32,7 @@ import za.co.absa.cobrix.spark.cobol.source.SerializableConfiguration
 import za.co.absa.cobrix.spark.cobol.source.parameters.LocalityParameters
 import za.co.absa.cobrix.spark.cobol.source.streaming.FileStreamer
 import za.co.absa.cobrix.spark.cobol.source.types.FileWithOrder
-import za.co.absa.cobrix.spark.cobol.utils.{HDFSUtils, SparkUtils}
+import za.co.absa.cobrix.spark.cobol.utils.{FileUtils, HDFSUtils, SparkUtils}

 import java.util.concurrent.ConcurrentHashMap
 import scala.collection.mutable.ArrayBuffer
@@ -231,7 +231,13 @@ private[cobol] object IndexBuilder extends Logging {
     val maximumBytes = if (fileEndOffset == 0) {
       0
     } else {
-      val bytesToRead = fileSystem.getContentSummary(path).getLength - fileEndOffset - startOffset
+      val fileSize = if (FileUtils.isCompressed(path, config)) {
+        FileUtils.getCompressedFileSize(path,config)
+      } else {
+        fileSystem.getFileStatus(path).getLen
+      }
+
+      val bytesToRead = fileSize - fileEndOffset - startOffset
       if (bytesToRead < 0)
         0
       else
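To make the offset arithmetic concrete, a small illustration with made-up numbers: for an uncompressed size of 10,000 bytes, a start offset of 100 and an end offset of 1,493, at most 10,000 - 1,493 - 100 = 8,407 bytes are read, and negative results are clamped to 0. A sketch mirroring the bytesToRead logic above:

// Illustrative values only; variable names follow the diff above.
val fileSize      = 10000L // uncompressed size (full scan for compressed files)
val startOffset   = 100L   // bytes skipped at the start of the file
val fileEndOffset = 1493L  // bytes dropped at the end of the file
val bytesToRead   = fileSize - fileEndOffset - startOffset
val maximumBytes  = if (bytesToRead < 0) 0L else bytesToRead // 8407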

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/scanners/CobolScanners.scala

Lines changed: 5 additions & 17 deletions
@@ -18,7 +18,6 @@ package za.co.absa.cobrix.spark.cobol.source.scanners

 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
-import org.apache.hadoop.io.compress.CompressionCodecFactory
 import org.apache.hadoop.io.{LongWritable, Text}
 import org.apache.hadoop.mapred.TextInputFormat
 import org.apache.spark.rdd.RDD
@@ -73,17 +72,13 @@ private[source] object CobolScanners extends Logging {
       val maximumFileBytes = if (reader.getReaderProperties.fileEndOffset == 0) {
         0
       } else {
-        if (isCompressed(path, sconf.value)) {
-          // ToDo determine if the uncompressed file size can be effectively fetched
-          if (reader.getReaderProperties.fileEndOffset > 0) {
-            logger.warn(s"File end offset for $path is ignored because the file is compressed.")
-          }
-          0L
+        val fileSize = if (FileUtils.isCompressed(path, sconf.value)) {
+          FileUtils.getCompressedFileSize(path, sconf.value)
         } else {
-          val fileSize = fileSystem.getFileStatus(path).getLen
-
-          fileSize - reader.getReaderProperties.fileEndOffset - startFileOffset
+          fileSystem.getFileStatus(path).getLen
         }
+
+        fileSize - reader.getReaderProperties.fileEndOffset - startFileOffset
       }
       val dataStream = new FileStreamer(filePath, sconf.value, startFileOffset, maximumFileBytes)
       val headerStream = new FileStreamer(filePath, sconf.value, startFileOffset)
@@ -92,13 +87,6 @@
     })
   }

-  private[source] def isCompressed(file: Path, hadoopConfig: Configuration): Boolean = {
-    val factory = new CompressionCodecFactory(hadoopConfig)
-    val codec = factory.getCodec(file)
-
-    codec != null
-  }
-
   private[source] def buildScanForFixedLength(reader: FixedLenReader, sourceDirs: Seq[String],
                                               recordParser: (FixedLenReader, RDD[Array[Byte]]) => RDD[Row],
                                               debugIgnoreFileSize: Boolean,

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/utils/FileUtils.scala

Lines changed: 40 additions & 4 deletions
@@ -16,13 +16,14 @@

 package za.co.absa.cobrix.spark.cobol.utils

-import java.io.{FileOutputStream, OutputStreamWriter, PrintWriter}
-import java.nio.charset.StandardCharsets
-import java.nio.file.{Files, Paths}
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
+import org.apache.hadoop.io.compress.CompressionCodecFactory
 import za.co.absa.cobrix.cobol.internal.Logging

+import java.io.{FileOutputStream, IOException, OutputStreamWriter, PrintWriter}
+import java.nio.charset.StandardCharsets
+import java.nio.file.{Files, Paths}
 import scala.collection.JavaConverters._

 /**
@@ -33,7 +34,6 @@ import scala.collection.JavaConverters._
  * Applies the same filter as Hadoop's FileInputFormat, which excludes files starting with '.' or '_'.
  */
 object FileUtils extends Logging {
-
   val THRESHOLD_DIR_LENGTH_FOR_SINGLE_FILE_CHECK = 50

   private val hiddenFileFilter = new PathFilter() {
@@ -216,6 +216,42 @@ object FileUtils extends Logging {
     allNonDivisibleFiles.map(status => (status.getPath.toString, status.getLen))
   }

+  def isCompressed(file: Path, hadoopConfig: Configuration): Boolean = {
+    val factory = new CompressionCodecFactory(hadoopConfig)
+    val codec = factory.getCodec(file)
+
+    codec != null
+  }
+
+  def getCompressedFileSize(file: Path, hadoopConfig: Configuration): Long = {
+    logger.warn(s"Using full scan to determine file size of $file..")
+    val factory = new CompressionCodecFactory(hadoopConfig)
+    val codec = factory.getCodec(file)
+    val fileSystem = file.getFileSystem(hadoopConfig)
+    val fsIn: FSDataInputStream = fileSystem.open(file)
+    val ifs = codec.createInputStream(fsIn)
+
+    val size = try {
+      val SKIP_BUFFER_SIZE = 1024*1024*50
+      var totalBytesSkipped = 0L
+      var skippedLast = 1L
+      while (skippedLast > 0) {
+        skippedLast = ifs.skip(SKIP_BUFFER_SIZE)
+        if (skippedLast > 0)
+          totalBytesSkipped += skippedLast
+      }
+      totalBytesSkipped
+    } catch {
+      case e: IOException =>
+        throw new IOException(s"Unable to determine compressed file size for $file", e)
+    } finally {
+      ifs.close()
+      fsIn.close()
+    }
+    logger.info(s"The size of the uncompressed file $file is $size bytes.")
+    size
+  }
+
   private def isNonDivisible(fileStatus: FileStatus, divisor: Long) = fileStatus.getLen % divisor != 0

   /**
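The two helpers added above are shared by the scanner and the index builder; note that getCompressedFileSize decompresses the entire stream, so it is a full pass over the file. A minimal sketch of calling them directly (the input path is hypothetical):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import za.co.absa.cobrix.spark.cobol.utils.FileUtils

val conf = new Configuration()               // e.g. spark.sparkContext.hadoopConfiguration
val path = new Path("/data/records.dat.gz")  // hypothetical compressed input

val effectiveSize =
  if (FileUtils.isCompressed(path, conf))
    FileUtils.getCompressedFileSize(path, conf)           // full decompression pass
  else
    path.getFileSystem(conf).getFileStatus(path).getLen   // cheap metadata lookup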

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test40CompressesFilesSpec.scala

Lines changed: 17 additions & 0 deletions
@@ -184,6 +184,23 @@ class Test40CompressesFilesSpec extends AnyFunSuite with SparkTestBase with Bina
     assert(df.count == 300)
   }

+  test("read mixed compressed EBCDIC files and file_end_offset") {
+    val inputDataPath = "../data/test40_data"
+
+    val df = spark
+      .read
+      .format("cobol")
+      .option("copybook", inputCopybookPath)
+      .option("schema_retention_policy", "collapse_root")
+      .option("floating_point_format", "IEEE754")
+      .option("strict_sign_overpunching", "true")
+      .option("file_end_offset", 1493)
+      .option("pedantic", "true")
+      .load(inputDataPath)
+
+    assert(df.count == 297)
+  }
+
   test("read a compressed ASCII file 1") {
     testCompressedAsciiFile(Map(
       "record_format" -> "D"
