Skip to content

Commit 9a11be2

Browse files
committed
Merge remote-tracking branch 'origin/main' into fix/nifi-reduce-image-size
2 parents 1741c17 + c654ba8 commit 9a11be2

File tree

9 files changed

+165
-54
lines changed

9 files changed

+165
-54
lines changed

CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,24 @@ All notable changes to this project will be documented in this file.
44

55
## [Unreleased]
66

7+
### Added
8+
9+
- spark-connect-client: A new image for Spark connect tests and demos ([#1034])
10+
11+
### Changed
12+
13+
- spark-k8s: Include spark-connect jars. Replace OpenJDK with Temurin JDK. Cleanup. ([#1034])
14+
715
### Fixed
816

917
- nifi: reduce docker image size by removing the recursive chown/chmods in the final image ([#1027]).
18+
- spark-k8s: reduce docker image size by removing the recursive chown/chmods in the final image ([#1042]).
19+
- Add `--locked` flag to `cargo install` commands for reproducible builds ([#1044]).
1020

1121
[#1027]: https://github.com/stackabletech/docker-images/pull/1027
22+
[#1034]: https://github.com/stackabletech/docker-images/pull/1034
23+
[#1042]: https://github.com/stackabletech/docker-images/pull/1042
24+
[#1044]: https://github.com/stackabletech/docker-images/pull/1044
1225

1326
## [25.3.0] - 2025-03-21
1427

conf.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
zookeeper = importlib.import_module("zookeeper.versions")
3737
tools = importlib.import_module("tools.versions")
3838
statsd_exporter = importlib.import_module("statsd_exporter.versions")
39+
spark_connect_client = importlib.import_module("spark-connect-client.versions")
3940

4041
products = [
4142
{"name": "airflow", "versions": airflow.versions},
@@ -64,6 +65,7 @@
6465
{"name": "zookeeper", "versions": zookeeper.versions},
6566
{"name": "tools", "versions": tools.versions},
6667
{"name": "statsd_exporter", "versions": statsd_exporter.versions},
68+
{"name": "spark-connect-client", "versions": spark_connect_client.versions},
6769
]
6870

6971
open_shift_projects = {

spark-connect-client/Dockerfile

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# syntax=docker/dockerfile:1.10.0@sha256:865e5dd094beca432e8c0a1d5e1c465db5f998dca4e439981029b3b81fb39ed5

# spark-builder: provides client libs for spark-connect
FROM stackable/image/spark-k8s AS spark-builder

FROM stackable/image/java-base

ARG PRODUCT
ARG PYTHON
ARG RELEASE
ARG STACKABLE_USER_UID

LABEL name="Stackable Spark Connect Examples" \
      maintainer="[email protected]" \
      vendor="Stackable GmbH" \
      version="${PRODUCT}" \
      release="${RELEASE}" \
      summary="Spark Connect Examples" \
      description="Spark Connect client libraries for Python and the JVM, including some examples."

ENV HOME=/stackable

# Chown at COPY time so the recursive `chown -R` below only touches files whose
# ownership actually changed, instead of rewriting (and thus duplicating in the
# image layer) everything that was copied — same image-size concern as #1027/#1042.
COPY --chown=${STACKABLE_USER_UID}:0 spark-connect-client/stackable/spark-connect-examples /stackable/spark-connect-examples
COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark/connect /stackable/spark/connect

RUN <<EOF
# Heredoc lines are NOT &&-chained: without `set -e` a failed microdnf/pip call
# would be silently ignored and the build would still succeed.
set -e

microdnf update
# python{version}-setuptools: needed to build the pyspark[connect] package
microdnf install --nodocs \
  "python${PYTHON}" \
  "python${PYTHON}-pip" \
  "python${PYTHON}-setuptools"
microdnf clean all
rm -rf /var/cache/yum

# Make the versioned interpreter/pip available under their conventional names.
ln -s /usr/bin/python${PYTHON} /usr/bin/python
ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip

# Install python libraries for the spark connect client
# shellcheck disable=SC2102
pip install --no-cache-dir pyspark[connect]==${PRODUCT}

# All files and folders owned by root group to support running as arbitrary users.
# This is best practice as all container users will belong to the root group (0).
chown -R ${STACKABLE_USER_UID}:0 /stackable
chmod -R g=u /stackable
EOF

# ----------------------------------------
# Attention: We are changing the group of all files in /stackable directly above
# If you do any file based actions (copying / creating etc.) below this comment you
# absolutely need to make sure that the correct permissions are applied!
# chown ${STACKABLE_USER_UID}:0
# ----------------------------------------

USER ${STACKABLE_USER_UID}

WORKDIR /stackable/spark-connect-examples/python
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
"""Minimal Spark Connect example.

Connects to a Spark Connect server whose URL is passed as the first command
line argument, then counts the lines containing "a" and "b" in the Spark
README (the classic SimpleApp example, run over Spark Connect).
"""

import sys

from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Fail early with a usage message instead of a bare IndexError when the
    # connect URL (e.g. "sc://localhost:15002") is missing.
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <spark-connect-url>", file=sys.stderr)
        sys.exit(1)
    remote: str = sys.argv[1]

    spark = (
        SparkSession.builder.appName("SimpleSparkConnectApp")
        .remote(remote)
        .getOrCreate()
    )

    # Ship the spark-connect jar to the server as an artifact.
    # See https://issues.apache.org/jira/browse/SPARK-46032
    # NOTE(review): the jar version is hard-coded; it must match the Spark
    # version baked into the image — confirm when bumping PRODUCT.
    spark.addArtifacts("/stackable/spark/connect/spark-connect_2.12-3.5.5.jar")

    logFile = "/stackable/spark/README.md"
    logData = spark.read.text(logFile).cache()

    numAs = logData.filter(logData.value.contains("a")).count()
    numBs = logData.filter(logData.value.contains("b")).count()

    print("Lines with a: %i, lines with b: %i" % (numAs, numBs))

    spark.stop()

spark-connect-client/versions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Supported version matrix for the spark-connect-client image.
# Each entry pins the product (pyspark[connect]) version together with the
# versions of the images/tools this image is built from.
_SPARK_VERSION = "3.5.5"

versions = [
    {
        "product": _SPARK_VERSION,   # pyspark[connect] version installed via pip
        "spark-k8s": _SPARK_VERSION, # builder image providing the connect jars
        "java-base": "17",           # runtime base image (Temurin JDK major)
        "python": "3.11",            # interpreter version installed via microdnf
    },
]

spark-k8s/Dockerfile

Lines changed: 56 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ EOF
157157

158158

159159
# spark-builder: Build Spark into /stackable/spark-${PRODUCT}/dist,
160-
# download additional JARs and perform checks, like log4shell check.
160+
# download additional JARs and perform checks
161161
FROM stackable/image/java-devel AS spark-builder
162162

163163
ARG PRODUCT
@@ -189,20 +189,15 @@ COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-source-builder \
189189
# 134.0 [ERROR] Detected Maven Version: 3.6.3 is not in the allowed range [3.8.8,)
190190
RUN export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" \
191191
&& ./dev/make-distribution.sh \
192-
-Dhadoop.version="$HADOOP" \
193-
-Dmaven.test.skip=true \
194-
-DskipTests \
195-
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
196-
--no-transfer-progress \
197-
--batch-mode
192+
-Dhadoop.version="$HADOOP" \
193+
-Dmaven.test.skip=true \
194+
-DskipTests \
195+
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
196+
--no-transfer-progress \
197+
--batch-mode
198198

199199
# <<< Build spark
200200

201-
# Get the correct `tini` binary for our architecture.
202-
RUN curl -o /usr/bin/tini "https://repo.stackable.tech/repository/packages/tini/tini-${TINI}-${TARGETARCH}" \
203-
&& chmod +x /usr/bin/tini
204-
205-
# We download these under dist so that log4shell checks them
206201
WORKDIR /stackable/spark-${PRODUCT}/dist/jars
207202

208203
# Copy modules required for s3a://
@@ -240,37 +235,45 @@ COPY --from=hbase-builder --chown=${STACKABLE_USER_UID}:0 \
240235
/stackable/hbase/lib/client-facing-thirdparty/opentelemetry-semconv-*-alpha.jar \
241236
./
242237

238+
WORKDIR /stackable/spark-${PRODUCT}/dist/connect
239+
240+
# As of version 3.5.5, spark-connect jars are not included in the dist folder.
241+
# To avoid classpath conflicts with existing spark applications,
242+
# we create a new dist/connect folder, and copy them here.
243+
RUN cp /stackable/spark-${PRODUCT}/connector/connect/server/target/spark-connect_*-${PRODUCT}.jar . \
244+
&& cp /stackable/spark-${PRODUCT}/connector/connect/common/target/spark-connect-common_*-${PRODUCT}.jar . \
245+
&& cp /stackable/spark-${PRODUCT}/connector/connect/client/jvm/target/spark-connect-client-jvm_2.12-${PRODUCT}.jar .
246+
247+
COPY spark-k8s/stackable/jmx /stackable/jmx
248+
243249
WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars
244250

251+
RUN <<EOF
245252
# Download jackson-dataformat-xml, stax2-api, and woodstox-core which are required for logging.
246-
RUN curl -O https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar \
247-
&& curl -O https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar \
248-
&& curl -O https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar
253+
curl --fail https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar \
254+
-o /stackable/spark-${PRODUCT}/dist/extra-jars/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar
255+
curl --fail https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar \
256+
-o /stackable/spark-${PRODUCT}/dist/extra-jars/stax2-api-${STAX2_API}.jar
257+
curl --fail https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar \
258+
-o /stackable/spark-${PRODUCT}/dist/extra-jars/woodstox-core-${WOODSTOX_CORE}.jar
249259

250-
WORKDIR /stackable/jmx
260+
# Get the correct `tini` binary for our architecture.
261+
curl --fail "https://repo.stackable.tech/repository/packages/tini/tini-${TINI}-${TARGETARCH}" \
262+
-o /usr/bin/tini
263+
chmod +x /usr/bin/tini
251264

252-
RUN curl -O "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
265+
# JMX Exporter
266+
curl --fail "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" \
267+
-o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
268+
ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
253269

254-
# ===
255-
# Mitigation for CVE-2021-44228 (Log4Shell)
256-
#
257-
# For earlier versions this script removes the .class file that contains the
258-
# vulnerable code.
259-
# TODO: This can be restricted to target only versions which do not honor the environment
260-
# varible that has been set above but this has not currently been implemented
261-
COPY shared/log4shell.sh /bin
262-
RUN /bin/log4shell.sh /stackable/spark-${PRODUCT}/dist
263-
264-
# Ensure no vulnerable files are left over
265-
# This will currently report vulnerable files being present, as it also alerts on
266-
# SocketNode.class, which we do not remove with our scripts.
267-
# Further investigation will be needed whether this should also be removed.
268-
COPY shared/log4shell_1.6.1-log4shell_Linux_x86_64 /bin/log4shell_scanner_x86_64
269-
COPY shared/log4shell_1.6.1-log4shell_Linux_aarch64 /bin/log4shell_scanner_aarch64
270-
COPY shared/log4shell_scanner /bin/log4shell_scanner
271-
RUN /bin/log4shell_scanner s /stackable/spark-${PRODUCT}/dist
272-
# ===
270+
chmod -R g=u /stackable/spark-${PRODUCT}/dist
271+
chmod -R g=u /stackable/spark-${PRODUCT}/assembly/target/bom.json
272+
chmod -R g=u /stackable/jmx
273+
EOF
273274

275+
# TODO: java-base installs the Adoptium dnf repo and the Temurin jre which is not needed here.
276+
# To reduce the size of this image, the Adoptium repo could be moved to stackable-base instead.
274277
FROM stackable/image/java-base AS final
275278

276279
ARG PRODUCT
@@ -290,49 +293,51 @@ LABEL name="Apache Spark" \
290293

291294
ENV HOME=/stackable
292295
ENV SPARK_HOME=/stackable/spark
293-
ENV PATH=$SPARK_HOME:$PATH:/bin:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$HOME/.local/bin
296+
# Override the java-base version of JAVA_HOME to point to the jdk.
297+
ENV JAVA_HOME="/usr/lib/jvm/temurin-${JAVA_VERSION}-jdk"
298+
ENV PATH=$SPARK_HOME/bin:$JAVA_HOME/bin:$PATH
294299
ENV PYSPARK_PYTHON=/usr/bin/python
295300
ENV PYTHONPATH=$SPARK_HOME/python
296301

297-
COPY spark-k8s/stackable /stackable
298-
COPY spark-k8s/licenses /licenses
299302

300303
COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark-${PRODUCT}/dist /stackable/spark
301304
COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark-${PRODUCT}/assembly/target/bom.json /stackable/spark/spark-${PRODUCT}.cdx.json
302305
COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/jmx /stackable/jmx
303306
COPY --from=spark-builder /usr/bin/tini /usr/bin/tini
304307

308+
COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/stackable/run-spark.sh /stackable/run-spark.sh
309+
COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/licenses /licenses
310+
305311
RUN <<EOF
306312
microdnf update
307-
# procps: required for spark startup scripts
308-
# java-*-openjdk-devel: This is needed by the Spark UI to display process information using jps and jmap
309-
# Copying just the binaries from the builder stage failed.
310-
microdnf install \
313+
314+
# procps:
315+
# Required for spark startup scripts.
316+
# temurin-{version}-jdk:
317+
# Needed by the Spark UI to display process information using "jps" and "jmap".
318+
# Spark-Connect needs "javac" to compile auto-generated classes on the fly.
319+
microdnf install --nodocs \
311320
gzip \
312321
hostname \
313322
procps \
314323
"python${PYTHON}" \
315324
"python${PYTHON}-pip" \
316325
zip \
317-
"java-${JAVA_VERSION}-openjdk-devel"
326+
"temurin-${JAVA_VERSION}-jdk"
318327
microdnf clean all
319328
rm -rf /var/cache/yum
320329

321330
ln -s /usr/bin/python${PYTHON} /usr/bin/python
322331
ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip
323332

324-
ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
325333
# Symlink example jar, so that we can easily use it in tests
326334
ln -s /stackable/spark/examples/jars/spark-examples_*.jar /stackable/spark/examples/jars/spark-examples.jar
327-
328-
# All files and folders owned by root group to support running as arbitrary users.
329-
# This is best practice as all container users will belong to the root group (0).
330-
chown -R ${STACKABLE_USER_UID}:0 /stackable
331-
chmod -R g=u /stackable
335+
chown -h ${STACKABLE_USER_UID}:0 /stackable/spark/examples/jars/spark-examples.jar
332336
EOF
333337

338+
334339
# ----------------------------------------
335-
# Attention: We are changing the group of all files in /stackable directly above
340+
# Attention:
336341
# If you do any file based actions (copying / creating etc.) below this comment you
337342
# absolutely need to make sure that the correct permissions are applied!
338343
# chown ${STACKABLE_USER_UID}:0

stackable-base/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ rm -rf /var/cache/yum
3636

3737
# WARNING (@NickLarsenNZ): We should pin the rustup version
3838
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain "$RUST_DEFAULT_TOOLCHAIN_VERSION"
39-
. "$HOME/.cargo/env" && cargo --quiet install cargo-cyclonedx@"$CARGO_CYCLONEDX_CRATE_VERSION" cargo-auditable@"$CARGO_AUDITABLE_CRATE_VERSION"
39+
. "$HOME/.cargo/env" && cargo --quiet install --locked cargo-cyclonedx@"$CARGO_CYCLONEDX_CRATE_VERSION" cargo-auditable@"$CARGO_AUDITABLE_CRATE_VERSION"
4040
EOF
4141

4242
FROM product-utils-builder AS config-utils

ubi8-rust-builder/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ WORKDIR /
8080
RUN <<EOF
8181
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain "$RUST_DEFAULT_TOOLCHAIN_VERSION"
8282
. "$HOME/.cargo/env"
83-
cargo --quiet install "cargo-cyclonedx@$CARGO_CYCLONEDX_CRATE_VERSION" "cargo-auditable@$CARGO_AUDITABLE_CRATE_VERSION"
83+
cargo --quiet install --locked "cargo-cyclonedx@$CARGO_CYCLONEDX_CRATE_VERSION" "cargo-auditable@$CARGO_AUDITABLE_CRATE_VERSION"
8484
EOF
8585

8686
# Build artifacts will be available in /app.

ubi9-rust-builder/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ WORKDIR /
7979
RUN <<EOF
8080
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain "$RUST_DEFAULT_TOOLCHAIN_VERSION"
8181
. "$HOME/.cargo/env"
82-
cargo install --quiet "cargo-cyclonedx@$CARGO_CYCLONEDX_CRATE_VERSION" "cargo-auditable@$CARGO_AUDITABLE_CRATE_VERSION"
82+
cargo install --quiet --locked "cargo-cyclonedx@$CARGO_CYCLONEDX_CRATE_VERSION" "cargo-auditable@$CARGO_AUDITABLE_CRATE_VERSION"
8383
EOF
8484

8585
# Build artifacts will be available in /app.

0 commit comments

Comments
 (0)