
Commit df556a5

Merge branch 'main' of https://github.com/stackabletech/docker-images into feat/move-patch-apply-logic-to-patchable

2 parents: e17b727 + 828fcad

File tree: 6 files changed, +164 −51 lines

CHANGELOG.md

Lines changed: 15 additions & 0 deletions

@@ -4,6 +4,21 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+### Added
+
+- spark-connect-client: A new image for Spark Connect tests and demos ([#1034])
+
+### Changed
+
+- spark-k8s: Include spark-connect jars. Replace OpenJDK with Temurin JDK. Cleanup. ([#1034])
+
+### Fixed
+
+- spark-k8s: Reduce Docker image size by removing the recursive chown/chmod calls in the final image ([#1042]).
+
+[#1034]: https://github.com/stackabletech/docker-images/pull/1034
+[#1042]: https://github.com/stackabletech/docker-images/pull/1042
+
 ## [25.3.0] - 2025-03-21
 
 ### Added

conf.py

Lines changed: 2 additions & 0 deletions

@@ -37,6 +37,7 @@
 zookeeper = importlib.import_module("zookeeper.versions")
 tools = importlib.import_module("tools.versions")
 statsd_exporter = importlib.import_module("statsd_exporter.versions")
+spark_connect_client = importlib.import_module("spark-connect-client.versions")
 
 products = [
     {"name": "airflow", "versions": airflow.versions},
@@ -66,6 +67,7 @@
     {"name": "zookeeper", "versions": zookeeper.versions},
     {"name": "tools", "versions": tools.versions},
     {"name": "statsd_exporter", "versions": statsd_exporter.versions},
+    {"name": "spark-connect-client", "versions": spark_connect_client.versions},
 ]
 
 open_shift_projects = {
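Each image directory ships a versions.py module, and conf.py aggregates them into the products list shown above. A minimal sketch of how such a list can be consumed; the helper and the inlined data are illustrative only, not part of this commit:

# Illustrative sketch, not part of the commit: look up the version strings
# that conf.py's `products` list declares for a given image.
products = [
    {"name": "spark-connect-client", "versions": [{"product": "3.5.5"}]},
]

def versions_for(name: str) -> list[str]:
    """Return the declared product versions for one image, or [] if unknown."""
    for product in products:
        if product["name"] == name:
            return [v["product"] for v in product["versions"]]
    return []

print(versions_for("spark-connect-client"))  # ['3.5.5']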

spark-connect-client/Dockerfile

Lines changed: 59 additions & 0 deletions (new file)

# syntax=docker/dockerfile:1.10.0@sha256:865e5dd094beca432e8c0a1d5e1c465db5f998dca4e439981029b3b81fb39ed5

# spark-builder: provides client libs for spark-connect
FROM stackable/image/spark-k8s AS spark-builder

FROM stackable/image/java-base

ARG PRODUCT
ARG PYTHON
ARG RELEASE
ARG STACKABLE_USER_UID

LABEL name="Stackable Spark Connect Examples" \
      maintainer="[email protected]" \
      vendor="Stackable GmbH" \
      version="${PRODUCT}" \
      release="${RELEASE}" \
      summary="Spark Connect Examples" \
      description="Spark Connect client libraries for Python and the JVM, including some examples."

ENV HOME=/stackable

COPY spark-connect-client/stackable/spark-connect-examples /stackable/spark-connect-examples
COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark/connect /stackable/spark/connect

RUN <<EOF
microdnf update
# python{version}-setuptools: needed to build the pyspark[connect] package
microdnf install --nodocs \
    "python${PYTHON}" \
    "python${PYTHON}-pip" \
    "python${PYTHON}-setuptools"
microdnf clean all
rm -rf /var/cache/yum

ln -s /usr/bin/python${PYTHON} /usr/bin/python
ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip

# Install the Python libraries for the Spark Connect client
# shellcheck disable=SC2102
pip install --no-cache-dir pyspark[connect]==${PRODUCT}

# All files and folders are owned by the root group to support running as arbitrary users.
# This is best practice, as all container users belong to the root group (0).
chown -R ${STACKABLE_USER_UID}:0 /stackable
chmod -R g=u /stackable
EOF

# ----------------------------------------
# Attention: We are changing the group of all files in /stackable directly above.
# If you do any file-based actions (copying / creating etc.) below this comment,
# you absolutely need to make sure that the correct permissions are applied!
# chown ${STACKABLE_USER_UID}:0
# ----------------------------------------

USER ${STACKABLE_USER_UID}

WORKDIR /stackable/spark-connect-examples/python
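The image installs pyspark[connect] pinned to ${PRODUCT} and starts in the Python examples directory. A quick sanity check one might run inside the container, sketched under the assumption that the pinned version is 3.5.5; it only touches documented PySpark modules:

# Sketch: verify the Spark Connect client is importable and report its version.
import pyspark
from pyspark.sql.connect.session import SparkSession  # the Connect client session

print("pyspark", pyspark.__version__)          # expected to match the pinned ${PRODUCT}, e.g. 3.5.5
print("client session class:", SparkSession)   # a successful import means pyspark[connect] is usable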
Lines changed: 24 additions & 0 deletions (new file)

import sys

from pyspark.sql import SparkSession

if __name__ == "__main__":
    remote: str = sys.argv[1]
    spark = (
        SparkSession.builder.appName("SimpleSparkConnectApp")
        .remote(remote)
        .getOrCreate()
    )

    # See https://issues.apache.org/jira/browse/SPARK-46032
    spark.addArtifacts("/stackable/spark/connect/spark-connect_2.12-3.5.5.jar")

    logFile = "/stackable/spark/README.md"
    logData = spark.read.text(logFile).cache()

    numAs = logData.filter(logData.value.contains("a")).count()
    numBs = logData.filter(logData.value.contains("b")).count()

    print("Lines with a: %i, lines with b: %i" % (numAs, numBs))

    spark.stop()
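The app takes the Spark Connect endpoint as its first command-line argument, an sc:// URL. A minimal interactive equivalent, assuming a server reachable at sc://localhost:15002 (the address is an assumption; substitute your deployment's endpoint):

# Sketch: connect to a Spark Connect server and run a trivial remote query.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
print(spark.range(5).count())  # executed on the server; prints 5
spark.stop()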

spark-connect-client/versions.py

Lines changed: 8 additions & 0 deletions (new file)

versions = [
    {
        "product": "3.5.5",
        "spark-k8s": "3.5.5",
        "java-base": "17",
        "python": "3.11",
    },
]
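The keys of each entry line up with the build arguments and base-image stages used by the Dockerfile above (PRODUCT and PYTHON, plus the spark-k8s and java-base images). A hypothetical sketch of that mapping; the upper-cased key-to-ARG convention is an assumption for illustration, and the helper is not part of the repository tooling:

# Hypothetical helper: render a versions.py entry as docker --build-arg flags.
# The key-to-ARG convention (upper-case, "-" -> "_") is an assumption.
versions = [
    {
        "product": "3.5.5",
        "spark-k8s": "3.5.5",
        "java-base": "17",
        "python": "3.11",
    },
]

def build_args(entry: dict[str, str]) -> list[str]:
    args: list[str] = []
    for key, value in entry.items():
        args += ["--build-arg", f"{key.upper().replace('-', '_')}={value}"]
    return args

print(build_args(versions[0]))
# ['--build-arg', 'PRODUCT=3.5.5', '--build-arg', 'SPARK_K8S=3.5.5', ...]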

spark-k8s/Dockerfile

Lines changed: 56 additions & 51 deletions

@@ -107,7 +107,7 @@ EOF
 
 
 # spark-builder: Build Spark into /stackable/spark-${PRODUCT}/dist,
-# download additional JARs and perform checks, like log4shell check.
+# download additional JARs and perform checks
 FROM stackable/image/java-devel AS spark-builder
 
 ARG PRODUCT
@@ -139,20 +139,15 @@ COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-source-builder \
 # 134.0 [ERROR] Detected Maven Version: 3.6.3 is not in the allowed range [3.8.8,)
 RUN export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" \
     && ./dev/make-distribution.sh \
-    -Dhadoop.version="$HADOOP" \
-    -Dmaven.test.skip=true \
-    -DskipTests \
-    -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
-    --no-transfer-progress \
-    --batch-mode
+        -Dhadoop.version="$HADOOP" \
+        -Dmaven.test.skip=true \
+        -DskipTests \
+        -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
+        --no-transfer-progress \
+        --batch-mode
 
 # <<< Build spark
 
-# Get the correct `tini` binary for our architecture.
-RUN curl -o /usr/bin/tini "https://repo.stackable.tech/repository/packages/tini/tini-${TINI}-${TARGETARCH}" \
-    && chmod +x /usr/bin/tini
-
-# We download these under dist so that log4shell checks them
 WORKDIR /stackable/spark-${PRODUCT}/dist/jars
 
 # Copy modules required for s3a://
@@ -190,37 +185,45 @@ COPY --from=hbase-builder --chown=${STACKABLE_USER_UID}:0 \
     /stackable/hbase/lib/client-facing-thirdparty/opentelemetry-semconv-*-alpha.jar \
     ./
 
+WORKDIR /stackable/spark-${PRODUCT}/dist/connect
+
+# As of version 3.5.5, spark-connect jars are not included in the dist folder.
+# To avoid classpath conflicts with existing spark applications,
+# we create a new dist/connect folder and copy them here.
+RUN cp /stackable/spark-${PRODUCT}/connector/connect/server/target/spark-connect_*-${PRODUCT}.jar . \
+    && cp /stackable/spark-${PRODUCT}/connector/connect/common/target/spark-connect-common_*-${PRODUCT}.jar . \
+    && cp /stackable/spark-${PRODUCT}/connector/connect/client/jvm/target/spark-connect-client-jvm_2.12-${PRODUCT}.jar .
+
+COPY spark-k8s/stackable/jmx /stackable/jmx
+
 WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars
 
+RUN <<EOF
 # Download jackson-dataformat-xml, stax2-api, and woodstox-core which are required for logging.
-RUN curl -O https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar \
-    && curl -O https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar \
-    && curl -O https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar
+curl --fail https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar \
+    -o /stackable/spark-${PRODUCT}/dist/extra-jars/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar
+curl --fail https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar \
+    -o /stackable/spark-${PRODUCT}/dist/extra-jars/stax2-api-${STAX2_API}.jar
+curl --fail https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar \
+    -o /stackable/spark-${PRODUCT}/dist/extra-jars/woodstox-core-${WOODSTOX_CORE}.jar
 
-WORKDIR /stackable/jmx
+# Get the correct `tini` binary for our architecture.
+curl --fail "https://repo.stackable.tech/repository/packages/tini/tini-${TINI}-${TARGETARCH}" \
+    -o /usr/bin/tini
+chmod +x /usr/bin/tini
 
-RUN curl -O "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
+# JMX Exporter
+curl --fail "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" \
+    -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
+ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
 
-# ===
-# Mitigation for CVE-2021-44228 (Log4Shell)
-#
-# For earlier versions this script removes the .class file that contains the
-# vulnerable code.
-# TODO: This can be restricted to target only versions which do not honor the environment
-# varible that has been set above but this has not currently been implemented
-COPY shared/log4shell.sh /bin
-RUN /bin/log4shell.sh /stackable/spark-${PRODUCT}/dist
-
-# Ensure no vulnerable files are left over
-# This will currently report vulnerable files being present, as it also alerts on
-# SocketNode.class, which we do not remove with our scripts.
-# Further investigation will be needed whether this should also be removed.
-COPY shared/log4shell_1.6.1-log4shell_Linux_x86_64 /bin/log4shell_scanner_x86_64
-COPY shared/log4shell_1.6.1-log4shell_Linux_aarch64 /bin/log4shell_scanner_aarch64
-COPY shared/log4shell_scanner /bin/log4shell_scanner
-RUN /bin/log4shell_scanner s /stackable/spark-${PRODUCT}/dist
-# ===
+chmod -R g=u /stackable/spark-${PRODUCT}/dist
+chmod -R g=u /stackable/spark-${PRODUCT}/assembly/target/bom.json
+chmod -R g=u /stackable/jmx
+EOF
 
+# TODO: java-base installs the Adoptium dnf repo and the Temurin jre, which is not needed here.
+# To reduce the size of this image, the Adoptium repo could be moved to stackable-base instead.
 FROM stackable/image/java-base AS final
 
 ARG PRODUCT
@@ -240,49 +243,51 @@ LABEL name="Apache Spark" \
 
 ENV HOME=/stackable
 ENV SPARK_HOME=/stackable/spark
-ENV PATH=$SPARK_HOME:$PATH:/bin:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$HOME/.local/bin
+# Override the java-base version of JAVA_HOME to point to the jdk.
+ENV JAVA_HOME="/usr/lib/jvm/temurin-${JAVA_VERSION}-jdk"
+ENV PATH=$SPARK_HOME/bin:$JAVA_HOME/bin:$PATH
 ENV PYSPARK_PYTHON=/usr/bin/python
 ENV PYTHONPATH=$SPARK_HOME/python
 
-COPY spark-k8s/stackable /stackable
-COPY spark-k8s/licenses /licenses
 
 COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark-${PRODUCT}/dist /stackable/spark
 COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark-${PRODUCT}/assembly/target/bom.json /stackable/spark/spark-${PRODUCT}.cdx.json
 COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/jmx /stackable/jmx
 COPY --from=spark-builder /usr/bin/tini /usr/bin/tini
 
+COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/stackable/run-spark.sh /stackable/run-spark.sh
+COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/licenses /licenses
+
 RUN <<EOF
 microdnf update
-# procps: required for spark startup scripts
-# java-*-openjdk-devel: This is needed by the Spark UI to display process information using jps and jmap
-# Copying just the binaries from the builder stage failed.
-microdnf install \
+
+# procps:
+#    Required for spark startup scripts.
+# temurin-{version}-jdk:
+#    Needed by the Spark UI to display process information using "jps" and "jmap".
+#    Spark Connect needs "javac" to compile auto-generated classes on the fly.
+microdnf install --nodocs \
    gzip \
    hostname \
    procps \
    "python${PYTHON}" \
    "python${PYTHON}-pip" \
    zip \
-    "java-${JAVA_VERSION}-openjdk-devel"
+    "temurin-${JAVA_VERSION}-jdk"
 microdnf clean all
 rm -rf /var/cache/yum
 
 ln -s /usr/bin/python${PYTHON} /usr/bin/python
 ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip
 
-ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
 # Symlink example jar, so that we can easily use it in tests
 ln -s /stackable/spark/examples/jars/spark-examples_*.jar /stackable/spark/examples/jars/spark-examples.jar
-
-# All files and folders owned by root group to support running as arbitrary users.
-# This is best practice as all container users will belong to the root group (0).
-chown -R ${STACKABLE_USER_UID}:0 /stackable
-chmod -R g=u /stackable
+chown -h ${STACKABLE_USER_UID}:0 /stackable/spark/examples/jars/spark-examples.jar
 EOF
 
+
 # ----------------------------------------
-# Attention: We are changing the group of all files in /stackable directly above
+# Attention:
 # If you do any file based actions (copying / creating etc.) below this comment you
 # absolutely need to make sure that the correct permissions are applied!
 # chown ${STACKABLE_USER_UID}:0
