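Address review comments in SequenceFile reader

- Reject appends that would push the binary child column past the INT32 offset limit
- Write each row offset only after its data has been written successfully
- Document why the stream wrappers are nulled without close()
- Drop the `recBytes > 0` condition from the batch-size check
- Add a test that all records are read exactly once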

thirtiseven · thirtiseven · commit 572c0da0a40f · 2026-01-06T14:21:22.000+08:00
Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/sequencefile/GpuSequenceFileReaders.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/sequencefile/GpuSequenceFileReaders.scala
@@ -96,23 +96,37 @@ private[sequencefile] final class HostBinaryListBufferer(
   }
 
   def addBytes(bytes: Array[Byte], offset: Int, len: Int): Unit = {
+    val newEnd = dataLocation + len
+    if (newEnd > Int.MaxValue) {
+      throw new IllegalStateException(
+        s"Binary column child size $newEnd would exceed INT32 offset limit")
+    }
     growOffsetsIfNeeded()
-    val end = dataLocation + len
-    growDataIfNeeded(end)
-    offsetsBuffer.setInt(numRows.toLong * DType.INT32.getSizeInBytes, dataLocation.toInt)
+    growDataIfNeeded(newEnd)
+    val offsetPosition = numRows.toLong * DType.INT32.getSizeInBytes
+    val startDataLocation = dataLocation
     dataBuffer.setBytes(dataLocation, bytes, offset, len)
-    dataLocation = end
+    dataLocation = newEnd
+    // Write offset only after successful data write
+    offsetsBuffer.setInt(offsetPosition, startDataLocation.toInt)
     numRows += 1
   }
 
   def addValueBytes(valueBytes: SequenceFile.ValueBytes, len: Int): Unit = {
+    val newEnd = dataLocation + len
+    if (newEnd > Int.MaxValue) {
+      throw new IllegalStateException(
+        s"Binary column child size $newEnd would exceed INT32 offset limit")
+    }
     growOffsetsIfNeeded()
-    val end = dataLocation + len
-    growDataIfNeeded(end)
-    offsetsBuffer.setInt(numRows.toLong * DType.INT32.getSizeInBytes, dataLocation.toInt)
+    growDataIfNeeded(newEnd)
+    val offsetPosition = numRows.toLong * DType.INT32.getSizeInBytes
+    val startDataLocation = dataLocation
     out.seek(dataLocation)
     valueBytes.writeUncompressedBytes(dos)
     dataLocation = out.getPos
+    // Write offset only after successful data write
+    offsetsBuffer.setInt(offsetPosition, startDataLocation.toInt)
     numRows += 1
   }
 
@@ -149,6 +163,9 @@ private[sequencefile] final class HostBinaryListBufferer(
       }
     }
     offsetsBuffer = null
+    // The stream wrappers (out, dos) don't hold independent resources - they just wrap the
+    // dataBuffer which is now owned by childHost. Setting to null without close() is intentional
+    // to avoid attempting operations on the transferred buffer.
     out = null
     dos = null
 
@@ -327,7 +344,7 @@ class SequenceFilePartitionReader(
               val recBytes = recordBytes(keyLen, valueLen)
 
               // If this record doesn't fit, keep it for the next batch (unless it's the first row)
-              if (rows > 0 && recBytes > 0 && bytes + recBytes > maxBytesPerBatch) {
+              if (rows > 0 && bytes + recBytes > maxBytesPerBatch) {
                 pending = Some(makePending(keyLen, valueLen))
                 keepReading = false
               } else {
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/SequenceFileBinaryFileFormatSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/SequenceFileBinaryFileFormatSuite.scala
@@ -463,4 +463,36 @@ class SequenceFileBinaryFileFormatSuite extends AnyFunSuite {
       }
     }
   }
+
+  test("Split boundary handling - records starting before boundary are read") {
+    withTempDir("seqfile-split-test") { tmpDir =>
+      val file = new File(tmpDir, "split-test.seq")
+      val conf = new Configuration()
+
+      // Create file with multiple records using raw record format (consistent with other tests)
+      val numRecords = 100
+      val payloads = (0 until numRecords).map { i =>
+        s"record-$i-with-some-padding-data".getBytes(StandardCharsets.UTF_8)
+      }.toArray
+
+      writeSequenceFileWithRawRecords(file, conf, payloads)
+
+      withSparkSession { spark =>
+        // Read entire file
+        val df = spark.read
+          .format("com.nvidia.spark.rapids.SequenceFileBinaryFileFormat")
+          .load(file.getAbsolutePath)
+
+        val results = df.select("key", "value").collect()
+        assert(results.length == numRecords,
+          s"Expected $numRecords records, got ${results.length}")
+
+        // Verify all records present and no duplicates
+        val indices = results.map(r => bytesToInt(r.getAs[Array[Byte]](0))).sorted.toSeq
+        val expected = (0 until numRecords).toSeq
+        assert(indices == expected,
+          "Records missing or duplicated")
+      }
+    }
+  }
 }