Skip to content
Open
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
2ab5557
AI draft for protocol buffer support
thirtiseven Dec 19, 2025
d261358
AI draft for Hadoop sequence file reader
thirtiseven Dec 22, 2025
cd31fad
Revert "AI draft for protocol buffer support"
thirtiseven Dec 23, 2025
1278c66
clean up
thirtiseven Dec 23, 2025
10933b0
update tests
thirtiseven Dec 23, 2025
e965b01
address comment
thirtiseven Dec 25, 2025
f89f8c1
address comment
thirtiseven Dec 25, 2025
02c0752
address comment
thirtiseven Dec 25, 2025
f3bcf9d
address comment
thirtiseven Dec 25, 2025
562672a
copyrights
thirtiseven Jan 4, 2026
2e10fbd
refactor
thirtiseven Jan 5, 2026
572c0da
address comments
thirtiseven Jan 6, 2026
9b1162e
address comments
thirtiseven Jan 6, 2026
f95910f
address comments
thirtiseven Jan 6, 2026
481bfbe
multi-thread reader
thirtiseven Jan 7, 2026
bd526c5
delete perf test
thirtiseven Jan 7, 2026
cf33cf4
address commens
thirtiseven Jan 8, 2026
af43f3e
address comments
thirtiseven Jan 8, 2026
288152a
remove COALESCING reader
thirtiseven Jan 8, 2026
ea91eab
fix
thirtiseven Jan 8, 2026
6a23c2e
fix
thirtiseven Jan 8, 2026
9847f16
make sequence file isSplitable to false due to data diff
thirtiseven Jan 9, 2026
c6b98fb
Merge branch 'seq_file_reader' of https://github.com/thirtiseven/spar…
thirtiseven Jan 9, 2026
70ad202
fix merge seqreader
thirtiseven Jan 9, 2026
f9f4a8c
use gpu reader
thirtiseven Jan 13, 2026
e6322bc
fix a bug
thirtiseven Jan 13, 2026
94f31ea
performance optimization
thirtiseven Jan 16, 2026
4139c00
fix
thirtiseven Jan 16, 2026
1b2fbe9
Revert "fix"
thirtiseven Jan 20, 2026
310ccbc
Revert "performance optimization"
thirtiseven Jan 20, 2026
81ccdfa
Revert "fix a bug"
thirtiseven Jan 20, 2026
28d0405
Revert "use gpu reader"
thirtiseven Jan 20, 2026
dcf6af0
fix OOM bug
thirtiseven Jan 20, 2026
87f5a72
performance optimzation
thirtiseven Jan 20, 2026
bcfcbc7
integration tests
thirtiseven Jan 21, 2026
7cc02cf
splitable true by default
thirtiseven Jan 21, 2026
143fc3d
logical rule
thirtiseven Jan 23, 2026
dc0bbfc
save a memory copy
thirtiseven Jan 26, 2026
9619bc0
fix perfile config
thirtiseven Jan 27, 2026
d052441
GPU combine to produce larger batch
thirtiseven Jan 28, 2026
98ee00f
support glob style path
thirtiseven Feb 2, 2026
e4fef5a
a bug fix, RDD conversion refinement
thirtiseven Feb 3, 2026
0f8f8ca
support compress
thirtiseven Feb 4, 2026
c60c978
huge upmerge from dev branch
thirtiseven Feb 25, 2026
18015d6
refactor
thirtiseven Feb 26, 2026
1b297b9
Merge remote-tracking branch 'origin/main' into seq_file_reader
thirtiseven Mar 12, 2026
3fe2cd6
fix scala 2.13 build
thirtiseven Mar 12, 2026
ed7fa84
verify and refactor
thirtiseven Mar 12, 2026
05c42a6
address comments
thirtiseven Mar 13, 2026
9934152
style
thirtiseven Mar 13, 2026
4fab896
style refactor
thirtiseven Mar 13, 2026
a6120f4
verify
thirtiseven Mar 13, 2026
4c3bdf6
verify back 330
thirtiseven Mar 13, 2026
bdc343c
address commmit
thirtiseven Mar 14, 2026
d8de16a
address commmit
thirtiseven Mar 14, 2026
98fcaba
address commmit
thirtiseven Mar 14, 2026
f7cb695
address commmit
thirtiseven Mar 15, 2026
9288e9f
address comments
thirtiseven Mar 15, 2026
069d36c
address commmit
thirtiseven Mar 16, 2026
3c0f35d
address commmit
thirtiseven Mar 16, 2026
1b2621f
address commmit
thirtiseven Mar 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
440 changes: 440 additions & 0 deletions integration_tests/src/main/python/sequencefile_test.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
com.nvidia.spark.rapids.SequenceFileBinaryFileFormat
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids

import com.nvidia.spark.rapids.sequencefile.GpuSequenceFileMultiFilePartitionReaderFactory
import com.nvidia.spark.rapids.sequencefile.GpuSequenceFilePartitionReaderFactory
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.read.PartitionReaderFactory
import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.execution.datasources.{FileFormat, PartitionedFile}
import org.apache.spark.sql.rapids.GpuFileSourceScanExec
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration

/**
* A FileFormat that allows reading Hadoop SequenceFiles and returning raw key/value bytes as
* Spark SQL BinaryType columns.
*
* This is a GPU-enabled scan format in the sense that it returns GPU-backed ColumnarBatch output
* (the parsing itself is CPU-side IO + byte parsing).
*/
/**
 * A FileFormat that reads Hadoop SequenceFiles and returns the raw key/value bytes as
 * Spark SQL BinaryType columns named "key" and "value".
 *
 * This is a GPU-enabled scan format in the sense that it returns GPU-backed ColumnarBatch
 * output; the SequenceFile parsing itself is CPU-side IO + byte decoding.
 */
class GpuReadSequenceFileBinaryFormat extends FileFormat with GpuReadFileFormatWithMetrics {

  /**
   * The schema is fixed for this format (binary key/value), so inference never needs to
   * look at the files themselves.
   */
  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = Some(SequenceFileBinaryFileFormat.dataSchema)

  // TODO: Fix split boundary handling to enable multi-partition reads.
  // Currently disabled to ensure correct record counts: a record straddling a split
  // boundary could otherwise be read twice or dropped.
  override def isSplitable(
      sparkSession: SparkSession,
      options: Map[String, String],
      path: Path): Boolean = false

  /**
   * Builds the per-file (PERFILE) reader path. The Hadoop configuration is broadcast once
   * so each task can deserialize it instead of shipping it per partition.
   */
  override def buildReaderWithPartitionValuesAndMetrics(
      sparkSession: SparkSession,
      dataSchema: StructType,
      partitionSchema: StructType,
      requiredSchema: StructType,
      filters: Seq[Filter],
      options: Map[String, String],
      hadoopConf: Configuration,
      metrics: Map[String, GpuMetric]): PartitionedFile => Iterator[InternalRow] = {
    val sqlConf = sparkSession.sessionState.conf
    val broadcastedHadoopConf =
      sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
    val rapidsConf = new RapidsConf(sqlConf)

    val factory = GpuSequenceFilePartitionReaderFactory(
      sqlConf,
      broadcastedHadoopConf,
      requiredSchema,
      partitionSchema,
      rapidsConf,
      metrics,
      options)
    PartitionReaderIterator.buildReader(factory)
  }

  // Honor spark.rapids.sql.format.sequencefile.reader.type rather than hard-coding the
  // answer: previously this returned a constant false, which silently ignored a user
  // setting of PERFILE and always took the multi-file path.
  override def isPerFileReadEnabled(conf: RapidsConf): Boolean =
    conf.isSequenceFilePerFileReadEnabled

  /**
   * Builds the MULTITHREADED reader factory used when per-file reads are not selected
   * (recommended for many small files).
   */
  override def createMultiFileReaderFactory(
      broadcastedConf: Broadcast[SerializableConfiguration],
      pushedFilters: Array[Filter],
      fileScan: GpuFileSourceScanExec): PartitionReaderFactory = {
    GpuSequenceFileMultiFilePartitionReaderFactory(
      fileScan.conf,
      broadcastedConf,
      fileScan.requiredSchema,
      fileScan.readPartitionSchema,
      fileScan.rapidsConf,
      fileScan.allMetrics,
      fileScan.queryUsesInputFile)
  }
}

object GpuReadSequenceFileBinaryFormat {
  /**
   * Tags the wrapped scan as unable to run on the GPU when the required schema asks for
   * the "key" or "value" column with any data type other than BinaryType. Columns with
   * other names are left untouched here.
   */
  def tagSupport(meta: SparkPlanMeta[FileSourceScanExec]): Unit = {
    val requiredFields = meta.wrapped.requiredSchema.fields
    // A field is a problem only if it names key/value (case-insensitively) AND is not binary.
    val badFields = requiredFields.filter { field =>
      val namesKeyOrValue =
        field.name.equalsIgnoreCase(SequenceFileBinaryFileFormat.KEY_FIELD) ||
          field.name.equalsIgnoreCase(SequenceFileBinaryFileFormat.VALUE_FIELD)
      namesKeyOrValue && field.dataType != org.apache.spark.sql.types.BinaryType
    }
    badFields.foreach { field =>
      meta.willNotWorkOnGpu(
        s"SequenceFileBinary only supports BinaryType for " +
        s"'${SequenceFileBinaryFileFormat.KEY_FIELD}' and " +
        s"'${SequenceFileBinaryFileFormat.VALUE_FIELD}' columns, but saw " +
        s"${field.name}: ${field.dataType.catalogString}")
    }
  }
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
* Copyright (c) 2019-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -1678,6 +1678,33 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern")
.checkValue(v => v > 0, "The maximum number of files must be greater than 0.")
.createWithDefault(Integer.MAX_VALUE)

// User-facing config selecting which SequenceFile reader implementation to use.
// Accepted values come from RapidsReaderType; COALESCING is rejected at use time
// (see isSequenceFilePerFileReadEnabled) because SequenceFile decoding is CPU-side.
val SEQUENCEFILE_READER_TYPE = conf("spark.rapids.sql.format.sequencefile.reader.type")
  .doc("Sets the SequenceFile reader type. Since SequenceFile decoding happens on the CPU " +
    "(using Hadoop's SequenceFile.Reader), COALESCING mode is not supported and will throw " +
    "an exception. Use PERFILE which individually reads files, or MULTITHREADED which uses " +
    "multiple threads to read files in parallel, utilizing multiple CPU cores for I/O and " +
    "decoding. MULTITHREADED is recommended when reading many files as it allows the CPU to " +
    "keep reading while GPU is also doing work. " +
    s"See $MULTITHREAD_READ_NUM_THREADS and " +
    "spark.rapids.sql.format.sequencefile.multiThreadedRead.maxNumFilesParallel to control " +
    "the number of threads and amount of memory used. " +
    "By default this is set to AUTO which selects MULTITHREADED for cloud storage and " +
    "PERFILE for local storage. See spark.rapids.cloudSchemes.")
  .stringConf
  // Normalize case so users may write "perfile", "PerFile", etc.
  .transform(_.toUpperCase(java.util.Locale.ROOT))
  .checkValues(RapidsReaderType.values.map(_.toString))
  .createWithDefault(RapidsReaderType.AUTO.toString)

// Caps how many SequenceFiles one task may buffer on the CPU concurrently with the
// MULTITHREADED reader, bounding host-memory usage. Defaults to effectively unlimited.
val SEQUENCEFILE_MULTITHREAD_READ_MAX_NUM_FILES_PARALLEL =
  conf("spark.rapids.sql.format.sequencefile.multiThreadedRead.maxNumFilesParallel")
    .doc("A limit on the maximum number of files per task processed in parallel on the CPU " +
      "side before the file is sent to the GPU. This affects the amount of host memory used " +
      "when reading the files in parallel. Used with MULTITHREADED reader, see " +
      s"$SEQUENCEFILE_READER_TYPE.")
    .integerConf
    .checkValue(v => v > 0, "The maximum number of files must be greater than 0.")
    .createWithDefault(Integer.MAX_VALUE)

val ENABLE_DELTA_WRITE = conf("spark.rapids.sql.format.delta.write.enabled")
.doc("When set to false disables Delta Lake output acceleration.")
.booleanConf
Expand Down Expand Up @@ -3548,6 +3575,26 @@ class RapidsConf(conf: Map[String, String]) extends Logging {

lazy val maxNumAvroFilesParallel: Int = get(AVRO_MULTITHREAD_READ_MAX_NUM_FILES_PARALLEL)

// True when the SequenceFile reader type resolves to PERFILE. Also serves as the
// validation point for the config: COALESCING is rejected here with a clear error
// because SequenceFile decoding happens on the CPU and coalescing provides no benefit.
// Fix: dropped the redundant `s` interpolator prefixes — none of these literals
// contain an interpolation, so `s` was a no-op flagged by scalastyle/-Xlint.
lazy val isSequenceFilePerFileReadEnabled: Boolean = {
  val readerType = RapidsReaderType.withName(get(SEQUENCEFILE_READER_TYPE))
  if (readerType == RapidsReaderType.COALESCING) {
    throw new IllegalArgumentException(
      "COALESCING reader type is not supported for SequenceFile. " +
      "SequenceFile decoding happens on CPU, so coalescing provides no benefit. " +
      "Use PERFILE, MULTITHREADED, or AUTO instead.")
  }
  readerType == RapidsReaderType.PERFILE
}

// True when the reader type is AUTO (selects MULTITHREADED for cloud storage and
// PERFILE for local storage; see spark.rapids.cloudSchemes).
lazy val isSequenceFileAutoReaderEnabled: Boolean = {
  val selected = RapidsReaderType.withName(get(SEQUENCEFILE_READER_TYPE))
  selected == RapidsReaderType.AUTO
}

// The multi-threaded path is taken both when explicitly requested and under AUTO.
lazy val isSequenceFileMultiThreadReadEnabled: Boolean = {
  val selected = RapidsReaderType.withName(get(SEQUENCEFILE_READER_TYPE))
  isSequenceFileAutoReaderEnabled || selected == RapidsReaderType.MULTITHREADED
}

// Per-task cap on files read in parallel by the MULTITHREADED SequenceFile reader.
lazy val maxNumSequenceFilesParallel: Int =
  get(SEQUENCEFILE_MULTITHREAD_READ_MAX_NUM_FILES_PARALLEL)

lazy val isDeltaWriteEnabled: Boolean = get(ENABLE_DELTA_WRITE)

lazy val isIcebergEnabled: Boolean = get(ENABLE_ICEBERG)
Expand Down
Loading
Loading