Commit 0f6436c

feat: add Spark 4.0.0
1 parent 7d630e5

4 files changed: 116 additions, 21 deletions


spark-k8s/Dockerfile

Lines changed: 58 additions & 21 deletions
@@ -61,6 +61,17 @@ COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/hbase-connectors/stackable/patche
 COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/hbase-connectors/stackable/patches/${HBASE_CONNECTOR} /stackable/src/spark-k8s/hbase-connectors/stackable/patches/${HBASE_CONNECTOR}

 RUN <<EOF
+
+# IMPORTANT: HBase connectors don't support Spark 4 yet, so we skip the build.
+# Watch this PR for updates: https://github.com/apache/hbase-connectors/pull/130
+if [[ "${PRODUCT}" == 4* ]]; then
+  # Create this empty directory so that following COPY layers succeed.
+  mkdir -p /stackable/spark/jars
+  # Create a dummy tarball to satisfy the build process for Spark 3.
+  touch hbase-connector-${HBASE_CONNECTOR}-stackable${RELEASE}-src.tar.gz
+  exit 0
+fi
+
 cd "$(/stackable/patchable --images-repo-root=src checkout spark-k8s/hbase-connectors ${HBASE_CONNECTOR})/spark"

 NEW_VERSION="${HBASE_CONNECTOR}-stackable${RELEASE}"
@@ -110,6 +121,7 @@ mvn \
   --define hadoop-three.version="${HADOOP_VERSION}" \
   --define hbase.version="${HBASE}" \
   --define skipTests \
+  --define maven.test.skip=true \
   clean package

 mkdir -p /stackable/spark/jars
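
Note on the added flag: maven.test.skip=true skips both the compilation and the execution of tests, while the pre-existing skipTests only skips the execution (the same distinction spelled out in the comment block removed by the next hunk). A minimal stand-alone illustration, not part of this commit:

# -DskipTests: test sources are still compiled, only the test run is skipped.
# -Dmaven.test.skip=true: test sources are neither compiled nor executed.
mvn --batch-mode --define skipTests --define maven.test.skip=true clean package
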
@@ -162,9 +174,6 @@ COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-source-builder \
 COPY --from=hadoop-builder --chown=${STACKABLE_USER_UID}:0 /stackable/patched-libs /stackable/patched-libs

 # >>> Build spark
-# Compiling the tests takes a lot of time, so we skip them
-# -Dmaven.test.skip=true skips both the compilation and execution of tests
-# -DskipTests skips only the execution
 RUN <<EOF
 # Make Maven aware of custom Stackable libraries
 mv /stackable/patched-libs/maven /root/.m2/repository
@@ -179,15 +188,35 @@ RUN <<EOF
 ORIGINAL_VERSION="${PRODUCT}"
 NEW_VERSION="${PRODUCT}-stackable${RELEASE}"

+STACKABLE_HADOOP_VERSION="${HADOOP_HADOOP}-stackable${RELEASE}"
+
+MAVEN_BIN="/tmp/apache-maven-${MAVEN_VERSION}/bin/mvn"
 export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g"

-./dev/make-distribution.sh \
-  --mvn /tmp/apache-maven-${MAVEN_VERSION}/bin/mvn \
-  -Dhadoop.version="${HADOOP_VERSION}-stackable${RELEASE}" \
-  -DskipTests \
-  -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
-  --no-transfer-progress \
-  --batch-mode
+case "${PRODUCT}" in
+  4*)
+    # The Spark 4 script has a --connect option which is not available in Spark 3.
+    # This option is required to build Spark Connect.
+    # Also this option breaks the Spark 3 build so we ensure it's only provided here.
+    ./dev/make-distribution.sh \
+      --mvn "${MAVEN_BIN}" \
+      --connect \
+      -Dhadoop.version="${STACKABLE_HADOOP_VERSION}" \
+      -DskipTests \
+      -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
+      --no-transfer-progress \
+      --batch-mode
+    ;;
+  *)
+    ./dev/make-distribution.sh \
+      --mvn "${MAVEN_BIN}" \
+      -Dhadoop.version="${STACKABLE_HADOOP_VERSION}" \
+      -DskipTests \
+      -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
+      --no-transfer-progress \
+      --batch-mode
+    ;;
+esac

 sed -i "s/${NEW_VERSION}/${ORIGINAL_VERSION}/g" assembly/target/bom.json
 EOF
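
The 4*) branch above is plain shell glob matching on the PRODUCT string, so any 4.x.y version picks up the --connect flag while every other version falls through to the default branch that keeps the old invocation. A small stand-alone sketch of the dispatch (the version numbers below are made-up examples, not taken from this commit):

for PRODUCT in 3.5.5 4.0.0; do
  case "${PRODUCT}" in
    4*) echo "${PRODUCT}: run make-distribution.sh with --connect" ;;
    *)  echo "${PRODUCT}: run make-distribution.sh without --connect" ;;
  esac
done
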
@@ -198,22 +227,30 @@ EOF
 # we create a new dist/connect folder, and copy them here.
 RUN <<EOF

-# Get the Scala binary version
-SCALA_BINARY_VERSION=$( \
-  mvn --quiet --non-recursive --no-transfer-progress --batch-mode --file pom.xml \
-  org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
-  -DforceStdout \
-  -Dexpression='project.properties(scala.binary.version)')
+SCALA_BINARY_VERSION=$(grep "scala.binary.version" pom.xml | head -n1 | awk -F '[<>]' '{print $3}')

 mkdir -p dist/connect
 cd dist/connect

-cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
-cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
-cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
-
-# The Spark operator expects a file named spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}.jar without the -stackable${RELEASE} suffix.
+case "${PRODUCT}" in
+  4*)
+    cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/sql/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
+    cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/sql/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
+    cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/sql/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
+    ;;
+  *)
+    cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
+    cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
+    cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
+    ;;
+esac
+
+# This link is needed by the operator and is kept for backwards compatibility.
+# TODO: remove it at some time in the future.
 ln -s "spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" "spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}.jar"
+# Link to the spark-connect jar without the stackable suffix and scala version.
+# This link supersedes the previous link.
+ln -s "spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" "spark-connect-${PRODUCT}.jar"
 EOF

 # <<< Build spark
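
The new SCALA_BINARY_VERSION line swaps a maven-help-plugin evaluation for a plain text pipeline: grep picks the first pom.xml line mentioning scala.binary.version, and awk splits it on angle brackets so that the third field is the property value. A stand-alone sketch of the same pipeline; the pom.xml fragment below is invented solely to mimic the property layout:

cat > /tmp/pom.xml <<'XML'
<project>
  <properties>
    <scala.binary.version>2.13</scala.binary.version>
  </properties>
</project>
XML
SCALA_BINARY_VERSION=$(grep "scala.binary.version" /tmp/pom.xml | head -n1 | awk -F '[<>]' '{print $3}')
echo "${SCALA_BINARY_VERSION}"   # prints 2.13
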
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+From 2da5608928018dd017c91b904eb8f84a4f6df78a Mon Sep 17 00:00:00 2001
+From: Razvan-Daniel Mihai <[email protected]>
+Date: Fri, 4 Jul 2025 15:54:55 +0200
+Subject: Update CycloneDX plugin
+
+---
+dev/make-distribution.sh | 1 -
+pom.xml | 5 +++++
+2 files changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh
+index 16607e45ae..44e345a245 100755
+--- a/dev/make-distribution.sh
++++ b/dev/make-distribution.sh
+@@ -176,7 +176,6 @@ BUILD_COMMAND=("$MVN" clean package \
+ -Dmaven.javadoc.skip=true \
+ -Dmaven.scaladoc.skip=true \
+ -Dmaven.source.skip \
+- -Dcyclonedx.skip=true \
+ $@)
+
+ # Actually build the jar
+diff --git a/pom.xml b/pom.xml
+index 443d46a430..632920f100 100644
+--- a/pom.xml
++++ b/pom.xml
+@@ -3327,6 +3327,11 @@
+ <groupId>org.cyclonedx</groupId>
+ <artifactId>cyclonedx-maven-plugin</artifactId>
+ <version>2.8.0</version>
++ <configuration>
++ <projectType>application</projectType>
++ <schemaVersion>1.5</schemaVersion>
++ <skipNotDeployed>false</skipNotDeployed>
++ </configuration>
+ <executions>
+ <execution>
+ <phase>package</phase>
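
This new patch re-enables SBOM generation for the Spark build: it drops -Dcyclonedx.skip=true from make-distribution.sh and configures the cyclonedx-maven-plugin (projectType application, CycloneDX schema 1.5, skipNotDeployed false) to run in the package phase, presumably producing the assembly/target/bom.json that the Dockerfile's sed step above rewrites. A quick post-build sanity check, as a sketch that assumes jq is available and that the SBOM really lands under assembly/target/:

# Print the CycloneDX schema version and the number of recorded components.
jq '{specVersion, components: (.components | length)}' assembly/target/bom.json
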
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+base = "fa33ea000a0bda9e5a3fa1af98e8e85b8cc5e4d4"
+mirror = "https://github.com/stackabletech/spark.git"

spark-k8s/versions.py

Lines changed: 18 additions & 0 deletions
@@ -35,4 +35,22 @@
         "tini": "0.19.0",
         "hbase_connector": "1.0.1",
     },
+    {
+        "product": "4.0.0",
+        "java-base": "17",
+        "java-devel": "17",
+        "python": "3.11",
+        "hadoop/hadoop": "3.4.1",
+        "hbase": "2.6.2",
+        "aws_java_sdk_bundle": "2.24.6",
+        "azure_storage": "7.0.1", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure/3.3.4
+        "azure_keyvault_core": "1.0.0", # https://mvnrepository.com/artifact/com.microsoft.azure/azure-storage/7.0.1
+        "jackson_dataformat_xml": "2.15.2", # https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.13/3.5.1
+        "stax2_api": "4.2.1", # https://mvnrepository.com/artifact/com.fasterxml.jackson.dataformat/jackson-dataformat-xml/2.15.2
+        "woodstox_core": "6.5.1", # https://mvnrepository.com/artifact/com.fasterxml.jackson.dataformat/jackson-dataformat-xml/2.15.2
+        "vector": "0.47.0",
+        "jmx_exporter": "1.3.0",
+        "tini": "0.19.0",
+        "hbase_connector": "1.0.1",
+    },
 ]
