
Commit 828fcad

razvan and maltesander authored
fix(spark-k8s): refactor for Spark Connect (#1034)
* fix(spark): JAVA_HOME needs to point to OpenJDK
* move env directive out of heredoc
* add spark-connect jars to dist/connect folder
* fix typo
* added spark-connect-client image
* Use Temurin instead of OpenJDK
* cleanup spark-connect-client
* changelog
* make connect app configurable
* cleanup comment
* changelog
* pr uri
* fix merge
* fixes

Co-authored-by: Malte Sander <[email protected]>
1 parent 8c113ac commit 828fcad

File tree

6 files changed: +136 -13 lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions

@@ -4,10 +4,19 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+### Added
+
+- spark-connect-client: A new image for Spark connect tests and demos ([#1034])
+
+### Changed
+
+- spark-k8s: Include spark-connect jars. Replace OpenJDK with Temurin JDK. Cleanup. ([#1034])
+
 ### Fixed
 
 - spark-k8s: reduce docker image size by removing the recursive chown/chmods in the final image ([#1042]).
 
+[#1034]: https://github.com/stackabletech/docker-images/pull/1034
 [#1042]: https://github.com/stackabletech/docker-images/pull/1042
 
 ## [25.3.0] - 2025-03-21

conf.py

Lines changed: 2 additions & 0 deletions

@@ -36,6 +36,7 @@
 zookeeper = importlib.import_module("zookeeper.versions")
 tools = importlib.import_module("tools.versions")
 statsd_exporter = importlib.import_module("statsd_exporter.versions")
+spark_connect_client = importlib.import_module("spark-connect-client.versions")
 
 products = [
     {"name": "airflow", "versions": airflow.versions},
@@ -64,6 +65,7 @@
     {"name": "zookeeper", "versions": zookeeper.versions},
     {"name": "tools", "versions": tools.versions},
     {"name": "statsd_exporter", "versions": statsd_exporter.versions},
+    {"name": "spark-connect-client", "versions": spark_connect_client.versions},
 ]
 
 open_shift_projects = {
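
A side note on the registration above: the new package directory name contains hyphens, so a plain `import spark-connect-client.versions` statement would be a SyntaxError. `importlib.import_module` takes the dotted path as a string and has no such restriction. A minimal sketch of the mechanism, assuming it runs from the repository root where the spark-connect-client/ directory lives:

import importlib

# The import machinery resolves "spark-connect-client" against sys.path like
# any other package directory; only the `import` *statement* rejects
# hyphenated names.
spark_connect_client = importlib.import_module("spark-connect-client.versions")

print(spark_connect_client.versions[0]["product"])  # -> "3.5.5"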

spark-connect-client/Dockerfile

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
+# syntax=docker/dockerfile:1.10.0@sha256:865e5dd094beca432e8c0a1d5e1c465db5f998dca4e439981029b3b81fb39ed5
+
+# spark-builder: provides client libs for spark-connect
+FROM stackable/image/spark-k8s AS spark-builder
+
+FROM stackable/image/java-base
+
+ARG PRODUCT
+ARG PYTHON
+ARG RELEASE
+ARG STACKABLE_USER_UID
+
+LABEL name="Stackable Spark Connect Examples" \
+    maintainer="[email protected]" \
+    vendor="Stackable GmbH" \
+    version="${PRODUCT}" \
+    release="${RELEASE}" \
+    summary="Spark Connect Examples" \
+    description="Spark Connect client libraries for Python and the JVM, including some examples."
+
+
+ENV HOME=/stackable
+
+COPY spark-connect-client/stackable/spark-connect-examples /stackable/spark-connect-examples
+COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark/connect /stackable/spark/connect
+
+RUN <<EOF
+microdnf update
+# python{version}-setuptools: needed to build the pyspark[connect] package
+microdnf install --nodocs \
+  "python${PYTHON}" \
+  "python${PYTHON}-pip" \
+  "python${PYTHON}-setuptools"
+microdnf clean all
+rm -rf /var/cache/yum
+
+ln -s /usr/bin/python${PYTHON} /usr/bin/python
+ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip
+
+# Install python libraries for the spark connect client
+# shellcheck disable=SC2102
+pip install --no-cache-dir pyspark[connect]==${PRODUCT}
+
+# All files and folders owned by root group to support running as arbitrary users.
+# This is best practice as all container users will belong to the root group (0).
+chown -R ${STACKABLE_USER_UID}:0 /stackable
+chmod -R g=u /stackable
+EOF
+
+# ----------------------------------------
+# Attention: We are changing the group of all files in /stackable directly above.
+# If you do any file based actions (copying / creating etc.) below this comment you
+# absolutely need to make sure that the correct permissions are applied!
+# chown ${STACKABLE_USER_UID}:0
+# ----------------------------------------
+
+USER ${STACKABLE_USER_UID}
+
+WORKDIR /stackable/spark-connect-examples/python
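
Since the image ships pyspark[connect] plus the Connect jars, the quickest way to exercise it is a tiny client session. A minimal sketch, not part of this commit; sc://localhost:15002 is a placeholder URL using Spark Connect's default port:

from pyspark.sql import SparkSession

# Point this at a reachable Spark Connect server.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

# The query executes on the server, not in this container.
print(spark.range(5).count())  # -> 5

spark.stop()
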
Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
+import sys
+
+from pyspark.sql import SparkSession
+
+if __name__ == "__main__":
+    remote: str = sys.argv[1]
+    spark = (
+        SparkSession.builder.appName("SimpleSparkConnectApp")
+        .remote(remote)
+        .getOrCreate()
+    )
+
+    # See https://issues.apache.org/jira/browse/SPARK-46032
+    spark.addArtifacts("/stackable/spark/connect/spark-connect_2.12-3.5.5.jar")
+
+    logFile = "/stackable/spark/README.md"
+    logData = spark.read.text(logFile).cache()
+
+    numAs = logData.filter(logData.value.contains("a")).count()
+    numBs = logData.filter(logData.value.contains("b")).count()
+
+    print("Lines with a: %i, lines with b: %i" % (numAs, numBs))
+
+    spark.stop()
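
The Connect URL is deliberately read from sys.argv[1] ("make connect app configurable" in the commit message), so the same script runs against any server. Inside the image, whose WORKDIR already points at the examples, an invocation would look roughly like python <app>.py sc://<connect-server>:15002 (hypothetical host name; 15002 is Spark Connect's default port). The addArtifacts() call then uploads the Connect jar from /stackable/spark/connect, the folder populated by the spark-k8s changes below.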

spark-connect-client/versions.py

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+versions = [
+    {
+        "product": "3.5.5",
+        "spark-k8s": "3.5.5",
+        "java-base": "17",
+        "python": "3.11",
+    },
+]

spark-k8s/Dockerfile

Lines changed: 34 additions & 13 deletions
@@ -235,15 +235,27 @@ COPY --from=hbase-builder --chown=${STACKABLE_USER_UID}:0 \
     /stackable/hbase/lib/client-facing-thirdparty/opentelemetry-semconv-*-alpha.jar \
     ./
 
-WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars
+WORKDIR /stackable/spark-${PRODUCT}/dist/connect
+
+# As of version 3.5.5, spark-connect jars are not included in the dist folder.
+# To avoid classpath conflicts with existing spark applications,
+# we create a new dist/connect folder, and copy them here.
+RUN cp /stackable/spark-${PRODUCT}/connector/connect/server/target/spark-connect_*-${PRODUCT}.jar . \
+    && cp /stackable/spark-${PRODUCT}/connector/connect/common/target/spark-connect-common_*-${PRODUCT}.jar . \
+    && cp /stackable/spark-${PRODUCT}/connector/connect/client/jvm/target/spark-connect-client-jvm_2.12-${PRODUCT}.jar .
 
 COPY spark-k8s/stackable/jmx /stackable/jmx
 
+WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars
+
 RUN <<EOF
 # Download jackson-dataformat-xml, stax2-api, and woodstox-core which are required for logging.
-curl --fail https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar
-curl --fail https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar
-curl --fail https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar
+curl --fail https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar \
+    -o /stackable/spark-${PRODUCT}/dist/extra-jars/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar
+curl --fail https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar \
+    -o /stackable/spark-${PRODUCT}/dist/extra-jars/stax2-api-${STAX2_API}.jar
+curl --fail https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar \
+    -o /stackable/spark-${PRODUCT}/dist/extra-jars/woodstox-core-${WOODSTOX_CORE}.jar
 
 # Get the correct `tini` binary for our architecture.
 curl --fail "https://repo.stackable.tech/repository/packages/tini/tini-${TINI}-${TARGETARCH}" \
@@ -255,14 +267,13 @@ curl --fail "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_pr
     -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
 ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
 
-# Symlink example jar, so that we can easily use it in tests
-ln -s /stackable/spark-${PRODUCT}/dist/examples/jars/spark-examples_*.jar /stackable/spark-${PRODUCT}/dist/examples/jars/spark-examples.jar
-
 chmod -R g=u /stackable/spark-${PRODUCT}/dist
 chmod -R g=u /stackable/spark-${PRODUCT}/assembly/target/bom.json
 chmod -R g=u /stackable/jmx
 EOF
 
+# TODO: java-base installs the Adoptium dnf repo and the Temurin jre which is not needed here.
+# To reduce the size of this image, the Adoptium repo could be moved to stackable-base instead.
 FROM stackable/image/java-base AS final
 
 ARG PRODUCT
@@ -282,7 +293,9 @@ LABEL name="Apache Spark" \
 
 ENV HOME=/stackable
 ENV SPARK_HOME=/stackable/spark
-ENV PATH=$SPARK_HOME:$PATH:/bin:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$HOME/.local/bin
+# Override the java-base version of JAVA_HOME to point to the jdk.
+ENV JAVA_HOME="/usr/lib/jvm/temurin-${JAVA_VERSION}-jdk"
+ENV PATH=$SPARK_HOME/bin:$JAVA_HOME/bin:$PATH
 ENV PYSPARK_PYTHON=/usr/bin/python
 ENV PYTHONPATH=$SPARK_HOME/python

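The JAVA_HOME override is the headline fix of this commit: java-base sets JAVA_HOME for its Temurin JRE, while Spark Connect needs tools from the full JDK such as javac (see the install comments in the next hunk). A minimal sanity check one could run inside the final image, not part of the commit, assuming the two ENV lines above:

import os
import shutil
import subprocess

# JAVA_HOME should now point at the JDK, e.g. /usr/lib/jvm/temurin-17-jdk.
java_home = os.environ["JAVA_HOME"]

# $JAVA_HOME/bin is on PATH, so javac must resolve from the Temurin JDK.
javac = shutil.which("javac")
assert javac is not None and javac.startswith(java_home), "javac missing or not from JAVA_HOME"

# JDK 9+ prints the version to stdout, e.g. "javac 17.0.15".
print(subprocess.run([javac, "-version"], capture_output=True, text=True).stdout.strip())
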
@@ -297,24 +310,32 @@ COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/licenses /licenses
 
 RUN <<EOF
 microdnf update
-# procps: required for spark startup scripts
-# java-*-openjdk-devel: This is needed by the Spark UI to display process information using jps and jmap
-# Copying just the binaries from the builder stage failed.
-microdnf install \
+
+# procps:
+#   Required for spark startup scripts.
+# temurin-{version}-jdk:
+#   Needed by the Spark UI to display process information using "jps" and "jmap".
+#   Spark-Connect needs "javac" to compile auto-generated classes on the fly.
+microdnf install --nodocs \
   gzip \
   hostname \
   procps \
   "python${PYTHON}" \
   "python${PYTHON}-pip" \
   zip \
-  "java-${JAVA_VERSION}-openjdk-devel"
+  "temurin-${JAVA_VERSION}-jdk"
 microdnf clean all
 rm -rf /var/cache/yum
 
 ln -s /usr/bin/python${PYTHON} /usr/bin/python
 ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip
+
+# Symlink example jar, so that we can easily use it in tests
+ln -s /stackable/spark/examples/jars/spark-examples_*.jar /stackable/spark/examples/jars/spark-examples.jar
+chown -h ${STACKABLE_USER_UID}:0 /stackable/spark/examples/jars/spark-examples.jar
 EOF
 
+
 # ----------------------------------------
 # Attention:
 # If you do any file based actions (copying / creating etc.) below this comment you
