
Commit 3738dd2

#803 Ensure Hadoop file stream opens the file only if it is actually used.
1 parent 1954ab8 commit 3738dd2

File tree: 2 files changed, +28 -9 lines changed


spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/streaming/FileStreamer.scala

Lines changed: 21 additions & 8 deletions
@@ -23,27 +23,31 @@ import org.apache.hadoop.fs.ContentSummary
 import za.co.absa.cobrix.cobol.reader.common.Constants
 
 /**
-  * This class provides methods for streaming bytes from an Hadoop file.
+  * This class provides methods for streaming bytes from a Hadoop file.
   *
   * It is stateful, which means that it stores the offset until which the file has been consumed.
   *
-  * Instances of this class are not reusable, i.e. once the file is fully read it can neither be reopened nor can other
+  * Instances of this class are not reusable, i.e., once the file is fully read, it can neither be reopened nor can another
   * file be consumed.
   *
-  * @param filePath String contained the fully qualified path to the file.
-  * @param fileSystem Underlying FileSystem point of access.
-  * @throws IllegalArgumentException in case the file is not found in the underlying file system.
+  * @param filePath String containing the fully qualified path to the file.
+  * @param fileSystem Underlying Hadoop file system.
+  * @throws IllegalArgumentException if the file is not found in the underlying file system.
   */
 class FileStreamer(filePath: String, fileSystem: FileSystem, startOffset: Long = 0L, maximumBytes: Long = 0L) extends SimpleStream {
 
   private val logger = Logger.getLogger(FileStreamer.this.getClass)
 
   private var byteIndex = startOffset
 
-  // Use a buffer to read the data from Hadoop in big chunks
-  private var bufferedStream = new BufferedFSDataInputStream(new Path(filePath), fileSystem, startOffset, Constants.defaultStreamBufferInMB, maximumBytes)
+  // This ensures that the file is never opened if the stream is never used. This serves two purposes:
+  // - Safety: ensures that unused streams are closed.
+  // - Performance: prevents time being spent on opening unused files.
+  // Note: Since we are working with a network file system, opening a file is a very expensive operation.
+  private var wasOpened = false
+  private var bufferedStream: BufferedFSDataInputStream = _
 
-  private val fileSize = getHadoopFileSize(new Path(filePath))
+  private lazy val fileSize = getHadoopFileSize(new Path(filePath))
 
   override def inputFileName: String = filePath

@@ -66,6 +70,7 @@ class FileStreamer(filePath: String, fileSystem: FileSystem, startOffset: Long =
    * @return An array containing the requested bytes, or fewer bytes if end of stream is reached, or empty array if no more data
    */
   override def next(numberOfBytes: Int): Array[Byte] = {
+    ensureOpened()
     val actualBytesToRead = if (maximumBytes > 0) {
       Math.min(maximumBytes - byteIndex + startOffset, numberOfBytes).toInt
     } else {

@@ -106,6 +111,7 @@ class FileStreamer(filePath: String, fileSystem: FileSystem, startOffset: Long =
   override def close(): Unit = {
     if (bufferedStream != null && !bufferedStream.isClosed) {
+      wasOpened = true
       bufferedStream.close()
       bufferedStream = null
     }

@@ -114,6 +120,13 @@ class FileStreamer(filePath: String, fileSystem: FileSystem, startOffset: Long =
   override def copyStream(): SimpleStream = {
     new FileStreamer(filePath, fileSystem, startOffset, maximumBytes)
   }
+
+  private def ensureOpened(): Unit = {
+    if (!wasOpened) {
+      bufferedStream = new BufferedFSDataInputStream(new Path(filePath), fileSystem, startOffset, Constants.defaultStreamBufferInMB, maximumBytes)
+      wasOpened = true
+    }
+  }
 
   private def getHadoopFileSize(hadoopPath: Path): Long = {
     val cSummary: ContentSummary = fileSystem.getContentSummary(hadoopPath)
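The heart of the change is a manual lazy-open pattern: constructing a FileStreamer no longer touches the file system, and the expensive open is deferred to ensureOpened(), which next() invokes on first use. Below is a minimal, self-contained sketch of the same pattern in plain Scala; the LazyStream name is hypothetical, and java.io.FileInputStream stands in for the Hadoop stream.

    import java.io.{File, FileInputStream}

    // Illustration only: a plain-Scala analogue of FileStreamer's deferred open.
    class LazyStream(path: String) {
      private var wasOpened = false
      private var in: FileInputStream = _  // not opened in the constructor

      def next(numberOfBytes: Int): Array[Byte] = {
        ensureOpened()  // the first call pays the cost of opening the file
        val buffer = new Array[Byte](numberOfBytes)
        val bytesRead = in.read(buffer)
        if (bytesRead <= 0) Array.empty[Byte] else buffer.take(bytesRead)
      }

      def close(): Unit = {
        if (in != null) {
          wasOpened = true  // instances are not reusable: never reopen after close
          in.close()
          in = null
        }
      }

      private def ensureOpened(): Unit = {
        if (!wasOpened) {
          in = new FileInputStream(new File(path))  // may throw FileNotFoundException
          wasOpened = true
        }
      }
    }

With this shape, new LazyStream("/no/such/file") succeeds and the failure only surfaces on the first next() call. Note the commit applies the same deferral to fileSize using Scala's built-in lazy val, while bufferedStream needs the manual wasOpened flag because close() must also be able to mark the stream as spent.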

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/streaming/FileStreamerSpec.scala

Lines changed: 7 additions & 1 deletion
@@ -44,10 +44,16 @@ class FileStreamerSpec extends AnyFlatSpec with BeforeAndAfter with Matchers {
 
   it should "throw if file does not exist" in {
     assertThrows[FileNotFoundException] {
-      new FileStreamer(new File(TEMP_DIR, "inexistent").getAbsolutePath, FileSystem.get(new Configuration()))
+      val stream = new FileStreamer(new File(TEMP_DIR, "inexistent").getAbsolutePath, FileSystem.get(new Configuration()))
+      stream.size
     }
   }
 
+  it should "not throw if the stream is never used, even if the file does not exist" in {
+    noException should be thrownBy {
+      new FileStreamer(new File(TEMP_DIR, "inexistent").getAbsolutePath, FileSystem.get(new Configuration()))
+    }
+  }
   it should "return array of same length than expected number of bytes if enough data" in {
     val batchLength = 8
     val iterations = 10
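The updated test forces evaluation via stream.size (now backed by the lazy fileSize), while the new test uses ScalaTest's noException should be thrownBy matcher to assert that construction alone is safe. A sketch of the same pair of assertions, assuming the hypothetical LazyStream from the previous example:

    import java.io.FileNotFoundException
    import org.scalatest.flatspec.AnyFlatSpec
    import org.scalatest.matchers.should.Matchers

    class LazyStreamSpec extends AnyFlatSpec with Matchers {
      it should "defer the open, and thus the failure, until first use" in {
        // Construction alone never opens the file, so it must not throw.
        noException should be thrownBy new LazyStream("/no/such/file")

        // The first read performs the open and surfaces the missing file.
        assertThrows[FileNotFoundException] {
          new LazyStream("/no/such/file").next(1)
        }
      }
    }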
