
Commit 5de2413

Merge remote-tracking branch 'apache/main' into regexp-extract-impl

2 parents: a55263f + 614fe13

File tree: 1,433 files changed (+202,967 additions, -98,700 deletions)


.github/actions/setup-iceberg-builder/action.yaml
Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@ runs:
   using: "composite"
   steps:
     - name: Clone Iceberg repo
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6
       with:
         repository: apache/iceberg
         path: apache-iceberg

.github/actions/setup-iceberg-rust-builder/action.yaml
Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@ runs:
   using: "composite"
   steps:
     - name: Clone Iceberg repo
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6
       with:
         repository: apache/iceberg
         path: apache-iceberg

.github/actions/setup-spark-builder/action.yaml
Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@ runs:
   using: "composite"
   steps:
     - name: Clone Spark repo
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6
       with:
         repository: apache/spark
         path: apache-spark

.github/workflows/benchmark-tpcds.yml
Lines changed: 4 additions & 4 deletions

@@ -55,7 +55,7 @@ jobs:
           rust-version: ${{env.RUST_VERSION}}
           jdk-version: 11
       - name: Cache Maven dependencies
-        uses: actions/cache@v4
+        uses: actions/cache@v5
         with:
           path: |
             ~/.m2/repository
@@ -67,7 +67,7 @@ jobs:
         run: make release
       - name: Cache TPC-DS generated data
         id: cache-tpcds-sf-1
-        uses: actions/cache@v4
+        uses: actions/cache@v5
         with:
           path: ./tpcds-sf-1
           key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }}
@@ -107,7 +107,7 @@ jobs:
           rust-version: ${{env.RUST_VERSION}}
           jdk-version: 11
       - name: Cache Maven dependencies
-        uses: actions/cache@v4
+        uses: actions/cache@v5
         with:
           path: |
             ~/.m2/repository
@@ -117,7 +117,7 @@ jobs:
             ${{ runner.os }}-java-maven-
       - name: Restore TPC-DS generated data
         id: cache-tpcds-sf-1
-        uses: actions/cache/restore@v4
+        uses: actions/cache/restore@v5
         with:
           path: ./tpcds-sf-1
           key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }}

.github/workflows/benchmark-tpch.yml
Lines changed: 4 additions & 4 deletions

@@ -55,7 +55,7 @@ jobs:
           rust-version: ${{env.RUST_VERSION}}
           jdk-version: 11
       - name: Cache Maven dependencies
-        uses: actions/cache@v4
+        uses: actions/cache@v5
         with:
           path: |
             ~/.m2/repository
@@ -65,7 +65,7 @@ jobs:
             ${{ runner.os }}-java-maven-
       - name: Cache TPC-H generated data
         id: cache-tpch-sf-1
-        uses: actions/cache@v4
+        uses: actions/cache@v5
         with:
           path: ./tpch
           key: tpch-${{ hashFiles('.github/workflows/benchmark-tpch.yml') }}
@@ -91,7 +91,7 @@ jobs:
           rust-version: ${{env.RUST_VERSION}}
           jdk-version: 11
       - name: Cache Maven dependencies
-        uses: actions/cache@v4
+        uses: actions/cache@v5
         with:
          path: |
             ~/.m2/repository
@@ -101,7 +101,7 @@ jobs:
             ${{ runner.os }}-java-maven-
       - name: Restore TPC-H generated data
         id: cache-tpch-sf-1
-        uses: actions/cache/restore@v4
+        uses: actions/cache/restore@v5
         with:
           path: ./tpch
           key: tpch-${{ hashFiles('.github/workflows/benchmark-tpch.yml') }}

.github/workflows/pr_build_linux.yml
Lines changed: 4 additions & 0 deletions

@@ -132,6 +132,7 @@ jobs:
             org.apache.comet.exec.CometAggregateSuite
             org.apache.comet.exec.CometExec3_4PlusSuite
             org.apache.comet.exec.CometExecSuite
+            org.apache.comet.exec.CometGenerateExecSuite
             org.apache.comet.exec.CometWindowExecSuite
             org.apache.comet.exec.CometJoinSuite
             org.apache.comet.CometNativeSuite
@@ -140,6+141,8 @@ jobs:
             org.apache.spark.CometPluginsDefaultSuite
             org.apache.spark.CometPluginsNonOverrideSuite
             org.apache.spark.CometPluginsUnifiedModeOverrideSuite
+            org.apache.comet.rules.CometScanRuleSuite
+            org.apache.comet.rules.CometExecRuleSuite
             org.apache.spark.sql.CometTPCDSQuerySuite
             org.apache.spark.sql.CometTPCDSQueryTestSuite
             org.apache.spark.sql.CometTPCHQuerySuite
@@ -158,6 +161,7 @@ jobs:
             org.apache.comet.CometStringExpressionSuite
             org.apache.comet.CometBitwiseExpressionSuite
             org.apache.comet.CometMapExpressionSuite
+            org.apache.comet.CometJsonExpressionSuite
             org.apache.comet.expressions.conditional.CometIfSuite
             org.apache.comet.expressions.conditional.CometCoalesceSuite
             org.apache.comet.expressions.conditional.CometCaseWhenSuite

.github/workflows/pr_build_macos.yml
Lines changed: 7 additions & 5 deletions

@@ -57,11 +57,9 @@ jobs:
             java_version: "17"
             maven_opts: "-Pspark-3.5 -Pscala-2.13"
 
-          # TODO fails with OOM
-          # https://github.com/apache/datafusion-comet/issues/1949
-          # - name: "Spark 4.0, JDK 17, Scala 2.13"
-          #   java_version: "17"
-          #   maven_opts: "-Pspark-4.0 -Pscala-2.13"
+          - name: "Spark 4.0, JDK 17, Scala 2.13"
+            java_version: "17"
+            maven_opts: "-Pspark-4.0 -Pscala-2.13"
 
         suite:
           - name: "fuzz"
@@ -97,6 +95,7 @@ jobs:
             org.apache.comet.exec.CometAggregateSuite
             org.apache.comet.exec.CometExec3_4PlusSuite
             org.apache.comet.exec.CometExecSuite
+            org.apache.comet.exec.CometGenerateExecSuite
             org.apache.comet.exec.CometWindowExecSuite
             org.apache.comet.exec.CometJoinSuite
             org.apache.comet.CometNativeSuite
@@ -105,6+104,8 @@ jobs:
             org.apache.spark.CometPluginsDefaultSuite
             org.apache.spark.CometPluginsNonOverrideSuite
             org.apache.spark.CometPluginsUnifiedModeOverrideSuite
+            org.apache.comet.rules.CometScanRuleSuite
+            org.apache.comet.rules.CometExecRuleSuite
             org.apache.spark.sql.CometTPCDSQuerySuite
             org.apache.spark.sql.CometTPCDSQueryTestSuite
             org.apache.spark.sql.CometTPCHQuerySuite
@@ -123,6 +124,7 @@ jobs:
             org.apache.comet.CometStringExpressionSuite
             org.apache.comet.CometBitwiseExpressionSuite
             org.apache.comet.CometMapExpressionSuite
+            org.apache.comet.CometJsonExpressionSuite
             org.apache.comet.expressions.conditional.CometIfSuite
             org.apache.comet.expressions.conditional.CometCoalesceSuite
             org.apache.comet.expressions.conditional.CometCaseWhenSuite

.github/workflows/spark_sql_test.yml
Lines changed: 9 additions & 5 deletions

@@ -59,6 +59,10 @@ jobs:
           - {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
           - {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
           - {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
+        # Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
+        exclude:
+          - spark-version: {short: '4.0', full: '4.0.1', java: 17}
+            module: {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
       fail-fast: false
     name: spark-sql-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.spark-version.java }}
     runs-on: ${{ matrix.os }}
@@ -89,7 +93,7 @@ jobs:
           LC_ALL: "C.UTF-8"
       - name: Upload fallback log
         if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           name: fallback-log-spark-sql-${{ matrix.module.name }}-${{ matrix.os }}-spark-${{ matrix.spark-version.full }}-java-${{ matrix.spark-version.java }}
           path: "**/fallback.log"
@@ -138,7 +142,7 @@ jobs:
           LC_ALL: "C.UTF-8"
       - name: Upload fallback log
         if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           name: fallback-log-spark-sql-native-comet-${{ matrix.module.name }}-${{ matrix.os }}-spark-${{ matrix.spark-version.full }}-java-${{ matrix.java-version }}
           path: "**/fallback.log"
@@ -187,7 +191,7 @@ jobs:
           LC_ALL: "C.UTF-8"
       - name: Upload fallback log
         if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           name: fallback-log-spark-sql-iceberg-compat-${{ matrix.module.name }}-${{ matrix.os }}-spark-${{ matrix.spark-version.full }}-java-${{ matrix.java-version }}
           path: "**/fallback.log"
@@ -199,14 +203,14 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - name: Download fallback log artifacts
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
        with:
          path: fallback-logs/
      - name: Merge fallback logs
        run: |
          find ./fallback-logs/ -type f -name "fallback.log" -print0 | xargs -0 cat | sort -u > all_fallback.log
      - name: Upload merged fallback log
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
        with:
          name: all-fallback-log
          path: all_fallback.log

README.md
Lines changed: 1 addition & 1 deletion

@@ -102,7 +102,7 @@ To get started with Apache DataFusion Comet, follow the
 [DataFusion Slack and Discord channels](https://datafusion.apache.org/contributor-guide/communication.html) to connect
 with other users, ask questions, and share your experiences with Comet.
 
-Follow [Apache DataFusion Comet Overview](https://datafusion.apache.org/comet/user-guide/overview.html) to get more detailed information
+Follow [Apache DataFusion Comet Overview](https://datafusion.apache.org/comet/about/index.html#comet-overview) to get more detailed information
 
 ## Contributing

common/src/main/java/org/apache/comet/parquet/CometFileKeyUnwrapper.java
Lines changed: 25 additions & 2 deletions

@@ -101,13 +101,35 @@ public class CometFileKeyUnwrapper {
   // Cache the hadoopConf just to assert the assumption above.
   private Configuration conf = null;
 
+  /**
+   * Normalizes S3 URI schemes to a canonical form. S3 can be accessed via multiple schemes (s3://,
+   * s3a://, s3n://) that refer to the same logical filesystem. This method ensures consistent cache
+   * lookups regardless of which scheme is used.
+   *
+   * @param filePath The file path that may contain an S3 URI
+   * @return The file path with normalized S3 scheme (s3a://)
+   */
+  private String normalizeS3Scheme(final String filePath) {
+    // Normalize s3:// and s3n:// to s3a:// for consistent cache lookups
+    // This handles the case where ObjectStoreUrl uses s3:// but Spark uses s3a://
+    String s3Prefix = "s3://";
+    String s3nPrefix = "s3n://";
+    if (filePath.startsWith(s3Prefix)) {
+      return "s3a://" + filePath.substring(s3Prefix.length());
+    } else if (filePath.startsWith(s3nPrefix)) {
+      return "s3a://" + filePath.substring(s3nPrefix.length());
+    }
+    return filePath;
+  }
+
   /**
    * Creates and stores a DecryptionKeyRetriever instance for the given file path.
    *
    * @param filePath The path to the Parquet file
    * @param hadoopConf The Hadoop Configuration to use for this file path
    */
   public void storeDecryptionKeyRetriever(final String filePath, final Configuration hadoopConf) {
+    final String normalizedPath = normalizeS3Scheme(filePath);
     // Use DecryptionPropertiesFactory.loadFactory to get the factory and then call
     // getFileDecryptionProperties
     if (factory == null) {
@@ -122,7 +144,7 @@ public void storeDecryptionKeyRetriever(final String filePath, final Configurati
         factory.getFileDecryptionProperties(hadoopConf, path);
 
     DecryptionKeyRetriever keyRetriever = decryptionProperties.getKeyRetriever();
-    retrieverCache.put(filePath, keyRetriever);
+    retrieverCache.put(normalizedPath, keyRetriever);
   }
 
   /**
@@ -136,7 +158,8 @@ public void storeDecryptionKeyRetriever(final String filePath, final Configurati
    */
   public byte[] getKey(final String filePath, final byte[] keyMetadata)
       throws ParquetCryptoRuntimeException {
-    DecryptionKeyRetriever keyRetriever = retrieverCache.get(filePath);
+    final String normalizedPath = normalizeS3Scheme(filePath);
+    DecryptionKeyRetriever keyRetriever = retrieverCache.get(normalizedPath);
     if (keyRetriever == null) {
       throw new ParquetCryptoRuntimeException(
           "Failed to find DecryptionKeyRetriever for path: " + filePath);
