Commit 1c40d43

Basic S3 test and properties support

1 parent 3371cc1 commit 1c40d43

6 files changed: +361 -77 lines changed

dev/ci/check-suites.py

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ def file_to_class_name(path: Path) -> str | None:
 ignore_list = [
     "org.apache.comet.parquet.ParquetReadSuite", # abstract
     "org.apache.comet.parquet.ParquetReadFromS3Suite", # manual test suite
+    "org.apache.comet.IcebergReadFromS3Suite", # manual test suite
     "org.apache.spark.sql.comet.CometPlanStabilitySuite", # abstract
     "org.apache.spark.sql.comet.ParquetDatetimeRebaseSuite", # abstract
     "org.apache.comet.exec.CometColumnarShuffleSuite" # abstract

spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala

Lines changed: 71 additions & 2 deletions
@@ -269,6 +269,57 @@ object QueryPlanSerde extends Logging with CometExprShim {
     classOf[VariancePop] -> CometVariancePop,
     classOf[VarianceSamp] -> CometVarianceSamp)

+  /**
+   * Transforms Hadoop S3A configuration keys to Iceberg FileIO property keys.
+   *
+   * Iceberg-rust's FileIO expects Iceberg-format keys (e.g., s3.access-key-id), not Hadoop keys
+   * (e.g., fs.s3a.access.key). This function converts Hadoop keys extracted from Spark's
+   * configuration to the format expected by iceberg-rust.
+   *
+   * @param hadoopProps
+   *   Map of Hadoop configuration properties (fs.s3a.* keys)
+   * @return
+   *   Map with keys transformed to Iceberg format (s3.* keys)
+   */
+  private def hadoopToIcebergS3Properties(
+      hadoopProps: Map[String, String]): Map[String, String] = {
+    hadoopProps.flatMap { case (key, value) =>
+      key match {
+        // Global S3A configuration keys
+        case "fs.s3a.access.key" => Some("s3.access-key-id" -> value)
+        case "fs.s3a.secret.key" => Some("s3.secret-access-key" -> value)
+        case "fs.s3a.endpoint" => Some("s3.endpoint" -> value)
+        case "fs.s3a.path.style.access" => Some("s3.path-style-access" -> value)
+        case "fs.s3a.endpoint.region" => Some("s3.region" -> value)
+
+        // Per-bucket configuration keys (e.g., fs.s3a.bucket.mybucket.access.key)
+        // Extract bucket name and property, then transform to s3.* format
+        case k if k.startsWith("fs.s3a.bucket.") =>
+          val parts = k.stripPrefix("fs.s3a.bucket.").split("\\.", 2)
+          if (parts.length == 2) {
+            val bucket = parts(0)
+            val property = parts(1)
+            property match {
+              case "access.key" => Some(s"s3.bucket.$bucket.access-key-id" -> value)
+              case "secret.key" => Some(s"s3.bucket.$bucket.secret-access-key" -> value)
+              case "endpoint" => Some(s"s3.bucket.$bucket.endpoint" -> value)
+              case "path.style.access" => Some(s"s3.bucket.$bucket.path-style-access" -> value)
+              case "endpoint.region" => Some(s"s3.bucket.$bucket.region" -> value)
+              case _ => None // Ignore unrecognized per-bucket properties
+            }
+          } else {
+            None
+          }
+
+        // Pass through any keys that are already in Iceberg format
+        case k if k.startsWith("s3.") => Some(key -> value)
+
+        // Ignore all other keys
+        case _ => None
+      }
+    }
+  }
+
   def supportedDataType(dt: DataType, allowComplex: Boolean = false): Boolean = dt match {
     case _: ByteType | _: ShortType | _: IntegerType | _: LongType | _: FloatType |
       _: DoubleType | _: StringType | _: BinaryType | _: TimestampType | _: TimestampNTZType |
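For illustration only (not part of the commit), a minimal sketch of the mapping this helper performs; the credential value is a placeholder, and fs.s3a.connection.maximum stands in for any key the helper does not recognize:

val hadoopProps = Map(
  "fs.s3a.access.key" -> "test-access-key",
  "fs.s3a.bucket.my-bucket.endpoint" -> "http://localhost:9000",
  "fs.s3a.connection.maximum" -> "96")

// hadoopToIcebergS3Properties(hadoopProps) would return:
//   Map("s3.access-key-id" -> "test-access-key",
//       "s3.bucket.my-bucket.endpoint" -> "http://localhost:9000")
// The connection.maximum key has no Iceberg FileIO equivalent here and is dropped.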
@@ -1113,8 +1164,26 @@
       // Set metadata location
       icebergScanBuilder.setMetadataLocation(metadataLocation)

-      // Serialize catalog properties (for authentication - currently empty)
-      // TODO: Extract credentials, S3 config, etc.
+      val catalogProperties =
+        try {
+          val session = org.apache.spark.sql.SparkSession.active
+          val hadoopConf = session.sessionState.newHadoopConf()
+
+          val metadataUri = new java.net.URI(metadataLocation)
+          val hadoopS3Options =
+            NativeConfig.extractObjectStoreOptions(hadoopConf, metadataUri)
+
+          hadoopToIcebergS3Properties(hadoopS3Options)
+        } catch {
+          case e: Exception =>
+            logWarning(
+              s"Failed to extract catalog properties from Iceberg scan: ${e.getMessage}")
+            e.printStackTrace()
+            Map.empty[String, String]
+        }
+      catalogProperties.foreach { case (key, value) =>
+        icebergScanBuilder.putCatalogProperties(key, value)
+      }

       // Determine number of partitions from Iceberg's output partitioning
       // TODO: Add a test case for both partitioning schemes
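Combined with the MinIO test configuration added below, and assuming NativeConfig.extractObjectStoreOptions surfaces the session's fs.s3a.* settings for the metadata location's bucket, the catalog properties forwarded to the native scan would look roughly like this sketch (values taken from CometS3TestBase):

// Illustrative only; mirrors the putCatalogProperties loop above.
// s3.access-key-id     -> "minio-test-user"
// s3.secret-access-key -> "minio-test-password"
// s3.endpoint          -> <MinIO container S3 URL>
// s3.path-style-access -> "true"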

spark/src/main/scala/org/apache/spark/sql/comet/CometIcebergNativeScanExec.scala

Lines changed: 0 additions & 2 deletions
@@ -159,8 +159,6 @@ object CometIcebergNativeScanExec {
   /**
    * Extracts metadata location from Iceberg table.
    *
-   * TODO: Also extract catalog properties (credentials, S3 config, etc.) for authentication
-   *
    * @param scanExec
    *   The Spark BatchScanExec containing an Iceberg scan
    * @return
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@ (new file)
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.comet

import java.net.URI

import scala.util.Try

import org.testcontainers.containers.MinIOContainer
import org.testcontainers.utility.DockerImageName

import org.apache.spark.SparkConf
import org.apache.spark.sql.CometTestBase

import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider}
import software.amazon.awssdk.services.s3.S3Client
import software.amazon.awssdk.services.s3.model.{CreateBucketRequest, HeadBucketRequest}

trait CometS3TestBase extends CometTestBase {

  protected var minioContainer: MinIOContainer = _
  protected val userName = "minio-test-user"
  protected val password = "minio-test-password"

  protected def testBucketName: String

  override def beforeAll(): Unit = {
    minioContainer = new MinIOContainer(DockerImageName.parse("minio/minio:latest"))
      .withUserName(userName)
      .withPassword(password)
    minioContainer.start()
    createBucketIfNotExists(testBucketName)

    super.beforeAll()
  }

  override def afterAll(): Unit = {
    super.afterAll()
    if (minioContainer != null) {
      minioContainer.stop()
    }
  }

  override protected def sparkConf: SparkConf = {
    val conf = super.sparkConf
    conf.set("spark.hadoop.fs.s3a.access.key", userName)
    conf.set("spark.hadoop.fs.s3a.secret.key", password)
    conf.set("spark.hadoop.fs.s3a.endpoint", minioContainer.getS3URL)
    conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
  }

  protected def createBucketIfNotExists(bucketName: String): Unit = {
    val credentials = AwsBasicCredentials.create(userName, password)
    val s3Client = S3Client
      .builder()
      .endpointOverride(URI.create(minioContainer.getS3URL))
      .credentialsProvider(StaticCredentialsProvider.create(credentials))
      .forcePathStyle(true)
      .build()
    try {
      val bucketExists = Try {
        s3Client.headBucket(HeadBucketRequest.builder().bucket(bucketName).build())
        true
      }.getOrElse(false)

      if (!bucketExists) {
        val request = CreateBucketRequest.builder().bucket(bucketName).build()
        s3Client.createBucket(request)
      }
    } finally {
      s3Client.close()
    }
  }
}
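A consuming suite only has to name a bucket; the trait handles the container lifecycle, bucket creation, and fs.s3a.* settings. A minimal hypothetical example (suite and bucket names invented for illustration; the real usage follows in the next file):

class ParquetRoundTripS3Suite extends CometS3TestBase {
  // Hypothetical bucket name; beforeAll() starts MinIO and creates it.
  override protected val testBucketName = "round-trip-bucket"

  test("write and read a Parquet file through MinIO") {
    val path = s"s3a://$testBucketName/data"
    spark.range(10).write.mode("overwrite").parquet(path)
    assert(spark.read.parquet(path).count() == 10)
  }
}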
Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@ (new file)
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.comet

import org.apache.spark.SparkConf
import org.apache.spark.sql.comet.CometIcebergNativeScanExec
import org.apache.spark.sql.execution.SparkPlan

class IcebergReadFromS3Suite extends CometS3TestBase {

  override protected val testBucketName = "test-iceberg-bucket"

  private def icebergAvailable: Boolean = {
    try {
      Class.forName("org.apache.iceberg.catalog.Catalog")
      true
    } catch {
      case _: ClassNotFoundException => false
    }
  }

  override protected def sparkConf: SparkConf = {
    val conf = super.sparkConf

    conf.set("spark.sql.catalog.s3_catalog", "org.apache.iceberg.spark.SparkCatalog")
    conf.set("spark.sql.catalog.s3_catalog.type", "hadoop")
    conf.set("spark.sql.catalog.s3_catalog.warehouse", s"s3a://$testBucketName/warehouse")

    conf.set(CometConf.COMET_ENABLED.key, "true")
    conf.set(CometConf.COMET_EXEC_ENABLED.key, "true")
    conf.set(CometConf.COMET_ICEBERG_NATIVE_ENABLED.key, "true")

    conf
  }

  /** Collects all CometIcebergNativeScanExec nodes from a plan */
  private def collectIcebergNativeScans(plan: SparkPlan): Seq[CometIcebergNativeScanExec] = {
    collect(plan) { case scan: CometIcebergNativeScanExec =>
      scan
    }
  }

  /**
   * Helper to verify query correctness and that exactly one CometIcebergNativeScanExec is used.
   */
  private def checkIcebergNativeScan(query: String): Unit = {
    val (_, cometPlan) = checkSparkAnswer(query)
    val icebergScans = collectIcebergNativeScans(cometPlan)
    assert(
      icebergScans.length == 1,
      s"Expected exactly 1 CometIcebergNativeScanExec but found ${icebergScans.length}. Plan:\n$cometPlan")
  }

  test("create and query simple Iceberg table from MinIO") {
    assume(icebergAvailable, "Iceberg not available in classpath")

    spark.sql("""
      CREATE TABLE s3_catalog.db.simple_table (
        id INT,
        name STRING,
        value DOUBLE
      ) USING iceberg
    """)

    spark.sql("""
      INSERT INTO s3_catalog.db.simple_table
      VALUES (1, 'Alice', 10.5), (2, 'Bob', 20.3), (3, 'Charlie', 30.7)
    """)

    checkIcebergNativeScan("SELECT * FROM s3_catalog.db.simple_table ORDER BY id")

    spark.sql("DROP TABLE s3_catalog.db.simple_table")
  }

  test("read partitioned Iceberg table from MinIO") {
    assume(icebergAvailable, "Iceberg not available in classpath")

    spark.sql("""
      CREATE TABLE s3_catalog.db.partitioned_table (
        id INT,
        category STRING,
        value DOUBLE
      ) USING iceberg
      PARTITIONED BY (category)
    """)

    spark.sql("""
      INSERT INTO s3_catalog.db.partitioned_table VALUES
      (1, 'A', 10.5), (2, 'B', 20.3), (3, 'C', 30.7),
      (4, 'A', 15.2), (5, 'B', 25.8), (6, 'C', 35.0)
    """)

    checkIcebergNativeScan("SELECT * FROM s3_catalog.db.partitioned_table ORDER BY id")
    checkIcebergNativeScan(
      "SELECT * FROM s3_catalog.db.partitioned_table WHERE category = 'A' ORDER BY id")

    spark.sql("DROP TABLE s3_catalog.db.partitioned_table")
  }

  test("filter pushdown to S3-backed Iceberg table") {
    assume(icebergAvailable, "Iceberg not available in classpath")

    spark.sql("""
      CREATE TABLE s3_catalog.db.filter_test (
        id INT,
        name STRING,
        value DOUBLE
      ) USING iceberg
    """)

    spark.sql("""
      INSERT INTO s3_catalog.db.filter_test VALUES
      (1, 'Alice', 10.5), (2, 'Bob', 20.3), (3, 'Charlie', 30.7),
      (4, 'Diana', 15.2), (5, 'Eve', 25.8)
    """)

    checkIcebergNativeScan("SELECT * FROM s3_catalog.db.filter_test WHERE id = 3")
    checkIcebergNativeScan("SELECT * FROM s3_catalog.db.filter_test WHERE value > 20.0")
    checkIcebergNativeScan("SELECT * FROM s3_catalog.db.filter_test WHERE name = 'Alice'")

    spark.sql("DROP TABLE s3_catalog.db.filter_test")
  }

  test("multiple files in S3 - verify no duplicates") {
    assume(icebergAvailable, "Iceberg not available in classpath")

    withSQLConf("spark.sql.files.maxRecordsPerFile" -> "50") {
      spark.sql("""
        CREATE TABLE s3_catalog.db.multifile_test (
          id INT,
          data STRING
        ) USING iceberg
      """)

      spark.sql("""
        INSERT INTO s3_catalog.db.multifile_test
        SELECT id, CONCAT('data_', CAST(id AS STRING)) as data
        FROM range(200)
      """)

      checkIcebergNativeScan("SELECT COUNT(DISTINCT id) FROM s3_catalog.db.multifile_test")
      checkIcebergNativeScan(
        "SELECT * FROM s3_catalog.db.multifile_test WHERE id < 10 ORDER BY id")

      spark.sql("DROP TABLE s3_catalog.db.multifile_test")
    }
  }

  test("MOR table with deletes in S3") {
    assume(icebergAvailable, "Iceberg not available in classpath")

    spark.sql("""
      CREATE TABLE s3_catalog.db.mor_delete_test (
        id INT,
        name STRING,
        value DOUBLE
      ) USING iceberg
      TBLPROPERTIES (
        'write.delete.mode' = 'merge-on-read',
        'write.merge.mode' = 'merge-on-read'
      )
    """)

    spark.sql("""
      INSERT INTO s3_catalog.db.mor_delete_test VALUES
      (1, 'Alice', 10.5), (2, 'Bob', 20.3), (3, 'Charlie', 30.7),
      (4, 'Diana', 15.2), (5, 'Eve', 25.8)
    """)

    spark.sql("DELETE FROM s3_catalog.db.mor_delete_test WHERE id IN (2, 4)")

    checkIcebergNativeScan("SELECT * FROM s3_catalog.db.mor_delete_test ORDER BY id")

    spark.sql("DROP TABLE s3_catalog.db.mor_delete_test")
  }
}
