address comments

thirtiseven · thirtiseven · commit 9b1162e7be7e · 2026-01-06T15:10:58.000+08:00
Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
diff --git a/sql-plugin/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/sql-plugin/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
@@ -0,0 +1,2 @@
+com.nvidia.spark.rapids.SequenceFileBinaryFileFormat
+
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/sequencefile/GpuSequenceFileReaders.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/sequencefile/GpuSequenceFileReaders.scala
@@ -71,7 +71,8 @@ private[sequencefile] final class HostBinaryListBufferer(
 
   private def growOffsetsIfNeeded(): Unit = {
     if (numRows + 1 > rowsAllocated) {
-      val newRowsAllocated = math.min(rowsAllocated.toLong * 2, Int.MaxValue.toLong - 1L).toInt
+      // Use Int.MaxValue - 2 to ensure (rowsAllocated + 1) * 4 doesn't overflow
+      val newRowsAllocated = math.min(rowsAllocated.toLong * 2, Int.MaxValue.toLong - 2L).toInt
       val newSize = (newRowsAllocated.toLong + 1L) * DType.INT32.getSizeInBytes
       closeOnExcept(HostMemoryBuffer.allocate(newSize)) { tmpBuffer =>
         tmpBuffer.copyFromHostBuffer(0, offsetsBuffer, 0, offsetsBuffer.getLength)
@@ -89,6 +90,8 @@ private[sequencefile] final class HostBinaryListBufferer(
         newBuff.copyFromHostBuffer(0, dataBuffer, 0, dataLocation)
         dataBuffer.close()
         dataBuffer = newBuff
+        // Clear old stream wrapper before creating new ones
+        dos = null
         out = new HostMemoryOutputStream(dataBuffer)
         dos = new DataOutputStream(out)
       }
@@ -123,7 +126,13 @@ private[sequencefile] final class HostBinaryListBufferer(
     val offsetPosition = numRows.toLong * DType.INT32.getSizeInBytes
     val startDataLocation = dataLocation
     out.seek(dataLocation)
+    val startPos = out.getPos
     valueBytes.writeUncompressedBytes(dos)
+    val actualLen = (out.getPos - startPos).toInt
+    if (actualLen != len) {
+      throw new IllegalStateException(
+        s"addValueBytes length mismatch: expected $len bytes, but wrote $actualLen bytes")
+    }
     dataLocation = out.getPos
     // Write offset only after successful data write
     offsetsBuffer.setInt(offsetPosition, startDataLocation.toInt)
@@ -534,23 +543,26 @@ case class GpuSequenceFileMultiFilePartitionReaderFactory(
 
   override protected def getFileFormatShortName: String = "SequenceFileBinary"
 
-  override protected def buildBaseColumnarReaderForCloud(
+  private def buildSequenceFileMultiFileReader(
       files: Array[PartitionedFile],
       conf: Configuration): PartitionReader[ColumnarBatch] = {
-    // No special cloud implementation yet; read sequentially on the task thread.
     new PartitionReaderWithBytesRead(
       new SequenceFileMultiFilePartitionReader(conf, files, readDataSchema, partitionSchema,
         maxReadBatchSizeRows, maxReadBatchSizeBytes, maxGpuColumnSizeBytes,
         metrics, queryUsesInputFile))
   }
 
+  override protected def buildBaseColumnarReaderForCloud(
+      files: Array[PartitionedFile],
+      conf: Configuration): PartitionReader[ColumnarBatch] = {
+    // No special cloud implementation yet; read sequentially on the task thread.
+    buildSequenceFileMultiFileReader(files, conf)
+  }
+
   override protected def buildBaseColumnarReaderForCoalescing(
       files: Array[PartitionedFile],
       conf: Configuration): PartitionReader[ColumnarBatch] = {
     // Sequential multi-file reader (no cross-file coalescing).
-    new PartitionReaderWithBytesRead(
-      new SequenceFileMultiFilePartitionReader(conf, files, readDataSchema, partitionSchema,
-        maxReadBatchSizeRows, maxReadBatchSizeBytes, maxGpuColumnSizeBytes,
-        metrics, queryUsesInputFile))
+    buildSequenceFileMultiFileReader(files, conf)
   }
 }
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/SequenceFileBinaryFileFormatSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/SequenceFileBinaryFileFormatSuite.scala
@@ -32,6 +32,15 @@ import org.scalatest.funsuite.AnyFunSuite
 import org.apache.spark.SparkException
 import org.apache.spark.sql.SparkSession
 
+/**
+ * Unit tests for SequenceFileBinaryFileFormat.
+ *
+ * Note: This test suite uses its own withSparkSession/withGpuSparkSession methods instead of
+ * extending SparkQueryCompareTestSuite because:
+ * 1. These tests need fresh SparkSession instances per test to avoid state pollution
+ * 2. The tests don't need the compare-CPU-vs-GPU pattern from SparkQueryCompareTestSuite
+ * 3. The simpler session management makes the tests more self-contained
+ */
 class SequenceFileBinaryFileFormatSuite extends AnyFunSuite {
 
   private def withSparkSession(f: SparkSession => Unit): Unit = {
@@ -56,7 +65,7 @@ class SequenceFileBinaryFileFormatSuite extends AnyFunSuite {
       .config("spark.sql.shuffle.partitions", "1")
       .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
       .config("spark.rapids.sql.enabled", "true")
-      .config("spark.rapids.sql.test.enabled", "false")
+      .config("spark.rapids.sql.test.enabled", "true")
       .getOrCreate()
     try {
       f(spark)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+com.nvidia.spark.rapids.SequenceFileBinaryFileFormat`
	`2`	`+`