
Commit 646dc9b

upgraded spark 3.5.4 to 3.5.5 (#1565)
## Which issue does this PR close?

Closes #1461

## Rationale for this change

Spark 3.5.5 is the latest stable 3.5.x version and should be supported.

## What changes are included in this PR?

Just the upgrade from Spark 3.5.4 to 3.5.5, plus the only code change required, in `ShimCometScanExec`.

## How are these changes tested?

Ran `mvn test` with the `spark-3.5` profile. This was sufficient because the build failed with just the version upgrade and without the required code change.
1 parent badbd37 commit 646dc9b

File tree

8 files changed: +91 −91 lines changed

.github/workflows/spark_sql_test.yml (1 addition, 1 deletion)

```diff
@@ -45,7 +45,7 @@ jobs:
       matrix:
         os: [ubuntu-24.04]
         java-version: [11]
-        spark-version: [{short: '3.4', full: '3.4.3'}, {short: '3.5', full: '3.5.4'}]
+        spark-version: [{short: '3.4', full: '3.4.3'}, {short: '3.5', full: '3.5.5'}]
         module:
           - {name: "catalyst", args1: "catalyst/test", args2: ""}
           - {name: "sql/core-1", args1: "", args2: sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest}
```
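GitHub Actions expands a `matrix` block into one job per combination of its axes, which is why a single-line version bump here fans out across every test module. A small sketch of that expansion, using only the axis values visible in the hunk above (the real workflow defines more modules):

```python
from itertools import product

# Matrix axes as shown in the hunk above (only the visible entries)
os_list = ["ubuntu-24.04"]
java_versions = [11]
spark_versions = [
    {"short": "3.4", "full": "3.4.3"},
    {"short": "3.5", "full": "3.5.5"},
]
modules = ["catalyst", "sql/core-1"]

# GitHub Actions runs one job per element of the cross product of the axes
jobs = [
    {"os": o, "java": j, "spark": s["full"], "module": m}
    for o, j, s, m in product(os_list, java_versions, spark_versions, modules)
]

print(len(jobs))  # 1 * 1 * 2 * 2 = 4 jobs for the entries shown
```

Bumping `full: '3.5.4'` to `'3.5.5'` therefore changes the Spark version under test in half of the generated jobs without touching any other axis.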

benchmarks/Dockerfile (1 addition, 1 deletion)

```diff
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM apache/datafusion-comet:0.7.0-spark3.5.4-scala2.12-java11
+FROM apache/datafusion-comet:0.7.0-spark3.5.5-scala2.12-java11
 
 RUN apt update \
     && apt install -y git python3 python3-pip \
```
Lines changed: 75 additions & 75 deletions (large diffs are not rendered by default)

docs/source/contributor-guide/spark-sql-tests.md (3 additions, 3 deletions)

````diff
@@ -72,11 +72,11 @@ of Apache Spark to enable Comet when running tests. This is a highly manual proc
 vary depending on the changes in the new version of Spark, but here is a general guide to the process.
 
 We typically start by applying a patch from a previous version of Spark. For example, when enabling the tests
-for Spark version 3.5.4 we may start by applying the existing diff for 3.4.3 first.
+for Spark version 3.5.5 we may start by applying the existing diff for 3.4.3 first.
 
 ```shell
 cd git/apache/spark
-git checkout v3.5.4
+git checkout v3.5.5
 git apply --reject --whitespace=fix ../datafusion-comet/dev/diffs/3.4.3.diff
 ```
 
@@ -118,7 +118,7 @@ wiggle --replace ./sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.sc
 ## Generating The Diff File
 
 ```shell
-git diff v3.5.4 > ../datafusion-comet/dev/diffs/3.5.4.diff
+git diff v3.5.5 > ../datafusion-comet/dev/diffs/3.5.5.diff
 ```
 
 ## Running Tests in CI
````

docs/source/user-guide/configs.md (3 additions, 3 deletions)

```diff
@@ -71,9 +71,9 @@ Comet provides the following configuration settings.
 | spark.comet.explain.verbose.enabled | When this setting is enabled, Comet will provide a verbose tree representation of the extended information. | false |
 | spark.comet.explainFallback.enabled | When this setting is enabled, Comet will provide logging explaining the reason(s) why a query stage cannot be executed natively. Set this to false to reduce the amount of logging. | false |
 | spark.comet.expression.allowIncompatible | Comet is not currently fully compatible with Spark for all expressions. Set this config to true to allow them anyway. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html). | false |
-| spark.comet.memory.overhead.factor | Fraction of executor memory to be allocated as additional memory for Comet when running in on-heap mode. For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | 0.2 |
-| spark.comet.memory.overhead.min | Minimum amount of additional memory to be allocated per executor process for Comet, in MiB, when running in on-heap mode. For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | 402653184b |
-| spark.comet.memoryOverhead | The amount of additional memory to be allocated per executor process for Comet, in MiB, when running in on-heap mode. This config is optional. If this is not specified, it will be set to `spark.comet.memory.overhead.factor` * `spark.executor.memory`. For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | |
+| spark.comet.memory.overhead.factor | Fraction of executor memory to be allocated as additional memory for Comet when running Spark in on-heap mode. For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | 0.2 |
+| spark.comet.memory.overhead.min | Minimum amount of additional memory to be allocated per executor process for Comet, in MiB, when running Spark in on-heap mode. For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | 402653184b |
+| spark.comet.memoryOverhead | The amount of additional memory to be allocated per executor process for Comet, in MiB, when running Spark in on-heap mode. This config is optional. If this is not specified, it will be set to `spark.comet.memory.overhead.factor` * `spark.executor.memory`. For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | |
 | spark.comet.metrics.updateInterval | The interval in milliseconds to update metrics. If interval is negative, metrics will be updated upon task completion. | 3000 |
 | spark.comet.nativeLoadRequired | Whether to require Comet native library to load successfully when Comet is enabled. If not, Comet will silently fallback to Spark when it fails to load the native lib. Otherwise, an error will be thrown and the Spark job will be aborted. | false |
 | spark.comet.parquet.enable.directBuffer | Whether to use Java direct byte buffer when reading Parquet. | false |
```
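Per the table above, when `spark.comet.memoryOverhead` is unset it defaults to `spark.comet.memory.overhead.factor` times `spark.executor.memory`, with `spark.comet.memory.overhead.min` (402653184 bytes, i.e. 384 MiB) as the floor. A minimal sketch of that rule as I read it from the docs, not a transcription of Comet's code (the helper name and MiB units are mine):

```python
# 402653184 bytes from the table above is exactly 384 MiB
MIN_OVERHEAD_MIB = 402653184 // (1024 * 1024)

def comet_memory_overhead_mib(executor_memory_mib: int, factor: float = 0.2) -> int:
    """Illustrative default for spark.comet.memoryOverhead in on-heap mode:
    factor * executor memory, never below the configured minimum."""
    return max(int(executor_memory_mib * factor), MIN_OVERHEAD_MIB)

print(comet_memory_overhead_mib(1024))  # 204 MiB computed, floored to 384
print(comet_memory_overhead_mib(8192))  # 0.2 * 8192 = 1638 MiB
```

So small executors effectively always get the 384 MiB minimum, and the 0.2 factor only starts to matter above roughly 2 GiB of executor memory.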

docs/source/user-guide/kubernetes.md (5 additions, 5 deletions)

```diff
@@ -66,10 +66,10 @@ metadata:
 spec:
   type: Scala
   mode: cluster
-  image: apache/datafusion-comet:0.7.0-spark3.5.4-scala2.12-java11
+  image: apache/datafusion-comet:0.7.0-spark3.5.5-scala2.12-java11
   imagePullPolicy: IfNotPresent
   mainClass: org.apache.spark.examples.SparkPi
-  mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.4.jar
+  mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.5.jar
   sparkConf:
     "spark.executor.extraClassPath": "/opt/spark/jars/comet-spark-spark3.5_2.12-0.7.0.jar"
     "spark.driver.extraClassPath": "/opt/spark/jars/comet-spark-spark3.5_2.12-0.7.0.jar"
@@ -80,17 +80,17 @@ spec:
     "spark.comet.exec.shuffle.enabled": "true"
     "spark.comet.exec.shuffle.mode": "auto"
     "spark.shuffle.manager": "org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager"
-  sparkVersion: 3.5.4
+  sparkVersion: 3.5.5
   driver:
     labels:
-      version: 3.5.4
+      version: 3.5.5
     cores: 1
     coreLimit: 1200m
     memory: 512m
     serviceAccount: spark-operator-spark
   executor:
     labels:
-      version: 3.5.4
+      version: 3.5.5
     instances: 1
     cores: 1
     coreLimit: 1200m
```

pom.xml (1 addition, 1 deletion)

```diff
@@ -556,7 +556,7 @@ under the License.
       <id>spark-3.5</id>
       <properties>
         <scala.version>2.12.18</scala.version>
-        <spark.version>3.5.4</spark.version>
+        <spark.version>3.5.5</spark.version>
         <spark.version.short>3.5</spark.version.short>
         <parquet.version>1.13.1</parquet.version>
         <slf4j.version>2.0.7</slf4j.version>
```

spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometScanExec.scala (2 additions, 2 deletions)

```diff
@@ -55,15 +55,15 @@ trait ShimCometScanExec {
   protected def isNeededForSchema(sparkSchema: StructType): Boolean = false
 
   protected def getPartitionedFile(f: FileStatusWithMetadata, p: PartitionDirectory): PartitionedFile =
-    PartitionedFileUtil.getPartitionedFile(f, p.values)
+    PartitionedFileUtil.getPartitionedFile(f, f.getPath, p.values)
 
   protected def splitFiles(sparkSession: SparkSession,
                            file: FileStatusWithMetadata,
                            filePath: Path,
                            isSplitable: Boolean,
                            maxSplitBytes: Long,
                            partitionValues: InternalRow): Seq[PartitionedFile] =
-    PartitionedFileUtil.splitFiles(sparkSession, file, isSplitable, maxSplitBytes, partitionValues)
+    PartitionedFileUtil.splitFiles(sparkSession, file, filePath, isSplitable, maxSplitBytes, partitionValues)
 
   protected def getPushedDownFilters(relation: HadoopFsRelation, dataFilters: Seq[Expression]): Seq[Filter] = {
     val supportNestedPredicatePushdown = DataSourceUtils.supportNestedPredicatePushdown(relation)
```
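The shim change above only adapts to a signature difference in Spark's internal `PartitionedFileUtil` (the file path is passed explicitly again in 3.5.5); the splitting semantics are unchanged: a splittable file is cut into byte ranges of at most `maxSplitBytes` each. A hypothetical Python model of that behavior, for illustration only (not the Spark implementation, and the function name is mine):

```python
def split_file(file_len: int, max_split_bytes: int, is_splitable: bool):
    """Illustrative model of PartitionedFile splitting: return
    (start, length) byte ranges of at most max_split_bytes each.
    Non-splittable files come back as a single full-length range."""
    if not is_splitable or file_len <= max_split_bytes:
        return [(0, file_len)]
    return [
        (start, min(max_split_bytes, file_len - start))
        for start in range(0, file_len, max_split_bytes)
    ]

print(split_file(1000, 300, True))   # [(0, 300), (300, 300), (600, 300), (900, 100)]
print(split_file(1000, 300, False))  # [(0, 1000)]
```

This is why the build broke on the bare version bump: the shim's call sites must match the exact arity Spark exposes in each patch release, even though what the method computes did not change.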

0 commit comments