

 # spark-builder: Build Spark into /stackable/spark-${PRODUCT}/dist,
-# download additional JARs and perform checks, like log4shell check.
+# download additional JARs and perform checks
 FROM stackable/image/java-devel AS spark-builder

 ARG PRODUCT
@@ -189,20 +189,15 @@ COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-source-builder \
 # 134.0 [ERROR] Detected Maven Version: 3.6.3 is not in the allowed range [3.8.8,)
 RUN export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" \
     && ./dev/make-distribution.sh \
-        -Dhadoop.version="$HADOOP" \
-        -Dmaven.test.skip=true \
-        -DskipTests \
-        -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
-        --no-transfer-progress \
-        --batch-mode
+    -Dhadoop.version="$HADOOP" \
+    -Dmaven.test.skip=true \
+    -DskipTests \
+    -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
+    --no-transfer-progress \
+    --batch-mode

 # <<< Build spark

-# Get the correct `tini` binary for our architecture.
-RUN curl -o /usr/bin/tini "https://repo.stackable.tech/repository/packages/tini/tini-${TINI}-${TARGETARCH}" \
-    && chmod +x /usr/bin/tini
-
-# We download these under dist so that log4shell checks them
 WORKDIR /stackable/spark-${PRODUCT}/dist/jars

 # Copy modules required for s3a://
@@ -240,37 +235,45 @@ COPY --from=hbase-builder --chown=${STACKABLE_USER_UID}:0 \
     /stackable/hbase/lib/client-facing-thirdparty/opentelemetry-semconv-*-alpha.jar \
     ./

+WORKDIR /stackable/spark-${PRODUCT}/dist/connect
+
+# As of version 3.5.5, spark-connect jars are not included in the dist folder.
+# To avoid classpath conflicts with existing spark applications,
+# we create a new dist/connect folder and copy them there.
+RUN cp /stackable/spark-${PRODUCT}/connector/connect/server/target/spark-connect_*-${PRODUCT}.jar . \
+    && cp /stackable/spark-${PRODUCT}/connector/connect/common/target/spark-connect-common_*-${PRODUCT}.jar . \
+    && cp /stackable/spark-${PRODUCT}/connector/connect/client/jvm/target/spark-connect-client-jvm_2.12-${PRODUCT}.jar .
+
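Since the final stage below copies dist/ to /stackable/spark, these jars end up under /stackable/spark/connect. A minimal sketch of how the relocated server jar could then be used at runtime; the submit options, version and Scala suffix are illustrative, not part of this commit:

```shell
# Illustrative: start a Spark Connect server against the relocated jar.
# The exact jar name depends on PRODUCT (e.g. 3.5.5) and the Scala version.
/stackable/spark/sbin/start-connect-server.sh \
  --jars /stackable/spark/connect/spark-connect_2.12-3.5.5.jar
```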
+COPY spark-k8s/stackable/jmx /stackable/jmx
+
 WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars

+RUN <<EOF
 # Download jackson-dataformat-xml, stax2-api, and woodstox-core which are required for logging.
-RUN curl -O https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar \
-    && curl -O https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar \
-    && curl -O https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar
+curl --fail https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar \
+    -o /stackable/spark-${PRODUCT}/dist/extra-jars/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar
+curl --fail https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar \
+    -o /stackable/spark-${PRODUCT}/dist/extra-jars/stax2-api-${STAX2_API}.jar
+curl --fail https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar \
+    -o /stackable/spark-${PRODUCT}/dist/extra-jars/woodstox-core-${WOODSTOX_CORE}.jar

-WORKDIR /stackable/jmx
+# Get the correct `tini` binary for our architecture.
+curl --fail "https://repo.stackable.tech/repository/packages/tini/tini-${TINI}-${TARGETARCH}" \
+    -o /usr/bin/tini
+chmod +x /usr/bin/tini

-RUN curl -O "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
+# JMX Exporter
+curl --fail "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" \
+    -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
+ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar

-# ===
-# Mitigation for CVE-2021-44228 (Log4Shell)
-#
-# For earlier versions this script removes the .class file that contains the
-# vulnerable code.
-# TODO: This can be restricted to target only versions which do not honor the environment
-# varible that has been set above but this has not currently been implemented
-COPY shared/log4shell.sh /bin
-RUN /bin/log4shell.sh /stackable/spark-${PRODUCT}/dist
-
-# Ensure no vulnerable files are left over
-# This will currently report vulnerable files being present, as it also alerts on
-# SocketNode.class, which we do not remove with our scripts.
-# Further investigation will be needed whether this should also be removed.
-COPY shared/log4shell_1.6.1-log4shell_Linux_x86_64 /bin/log4shell_scanner_x86_64
-COPY shared/log4shell_1.6.1-log4shell_Linux_aarch64 /bin/log4shell_scanner_aarch64
-COPY shared/log4shell_scanner /bin/log4shell_scanner
-RUN /bin/log4shell_scanner s /stackable/spark-${PRODUCT}/dist
-# ===
+chmod -R g=u /stackable/spark-${PRODUCT}/dist
+chmod -R g=u /stackable/spark-${PRODUCT}/assembly/target/bom.json
+chmod -R g=u /stackable/jmx
+EOF
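The JMX exporter is a Java agent. A hedged sketch of how it might be attached to a driver JVM, assuming port 8090 and a config.yaml shipped under /stackable/jmx (neither is specified in this diff):

```shell
# Sketch: expose Spark driver metrics for Prometheus via the agent.
# Port 8090 and the config file path are assumptions for illustration.
spark-submit \
  --conf "spark.driver.extraJavaOptions=-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=8090:/stackable/jmx/config.yaml" \
  --class org.apache.spark.examples.SparkPi \
  /stackable/spark/examples/jars/spark-examples.jar
```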

+# TODO: java-base installs the Adoptium dnf repo and the Temurin JRE, which is not needed here.
+# To reduce the size of this image, the Adoptium repo could be moved to stackable-base instead.
 FROM stackable/image/java-base AS final

 ARG PRODUCT
@@ -290,49 +293,51 @@ LABEL name="Apache Spark" \

 ENV HOME=/stackable
 ENV SPARK_HOME=/stackable/spark
-ENV PATH=$SPARK_HOME:$PATH:/bin:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$HOME/.local/bin
+# Override the java-base version of JAVA_HOME to point to the jdk.
+ENV JAVA_HOME="/usr/lib/jvm/temurin-${JAVA_VERSION}-jdk"
+ENV PATH=$SPARK_HOME/bin:$JAVA_HOME/bin:$PATH
 ENV PYSPARK_PYTHON=/usr/bin/python
 ENV PYTHONPATH=$SPARK_HOME/python

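A quick smoke test to confirm the JAVA_HOME override took effect, since jps and javac only exist in a JDK (the image tag is hypothetical):

```shell
# Hypothetical image tag; verifies JAVA_HOME points at the Temurin JDK.
docker run --rm --entrypoint sh spark-k8s:dev -c 'echo "$JAVA_HOME"; javac -version; jps'
```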
-COPY spark-k8s/stackable /stackable
-COPY spark-k8s/licenses /licenses

 COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark-${PRODUCT}/dist /stackable/spark
 COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark-${PRODUCT}/assembly/target/bom.json /stackable/spark/spark-${PRODUCT}.cdx.json
 COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/jmx /stackable/jmx
 COPY --from=spark-builder /usr/bin/tini /usr/bin/tini

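tini is copied in so it can serve as PID 1, forwarding signals to the JVM and reaping zombie processes. The entrypoint wiring is not part of this hunk, but a sketch would look like:

```dockerfile
# Sketch only: run the container command under tini so SIGTERM reaches
# the JVM and defunct child processes are reaped. The CMD is illustrative.
ENTRYPOINT ["/usr/bin/tini", "--"]
CMD ["/stackable/run-spark.sh"]
```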
+COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/stackable/run-spark.sh /stackable/run-spark.sh
+COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/licenses /licenses
+
 RUN <<EOF
 microdnf update
-# procps: required for spark startup scripts
-# java-*-openjdk-devel: This is needed by the Spark UI to display process information using jps and jmap
-# Copying just the binaries from the builder stage failed.
-microdnf install \
+
+# procps:
+#   Required for spark startup scripts.
+# temurin-{version}-jdk:
+#   Needed by the Spark UI to display process information using "jps" and "jmap".
+#   Spark-Connect needs "javac" to compile auto-generated classes on the fly.
+microdnf install --nodocs \
     gzip \
     hostname \
     procps \
     "python${PYTHON}" \
     "python${PYTHON}-pip" \
     zip \
-    "java-${JAVA_VERSION}-openjdk-devel"
+    "temurin-${JAVA_VERSION}-jdk"
 microdnf clean all
 rm -rf /var/cache/yum

 ln -s /usr/bin/python${PYTHON} /usr/bin/python
 ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip

-ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
 # Symlink example jar, so that we can easily use it in tests
 ln -s /stackable/spark/examples/jars/spark-examples_*.jar /stackable/spark/examples/jars/spark-examples.jar
-
-# All files and folders owned by root group to support running as arbitrary users.
-# This is best practice as all container users will belong to the root group (0).
-chown -R ${STACKABLE_USER_UID}:0 /stackable
-chmod -R g=u /stackable
+chown -h ${STACKABLE_USER_UID}:0 /stackable/spark/examples/jars/spark-examples.jar
 EOF
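The unversioned spark-examples.jar symlink lets tests avoid hardcoding the Spark version; for instance (master URL and argument illustrative):

```shell
# Illustrative smoke test using the version-independent symlink.
spark-submit \
  --master 'local[2]' \
  --class org.apache.spark.examples.SparkPi \
  /stackable/spark/examples/jars/spark-examples.jar 100
```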

+
 # ----------------------------------------
-# Attention: We are changing the group of all files in /stackable directly above
+# Attention:
 # If you do any file based actions (copying / creating etc.) below this comment you
 # absolutely need to make sure that the correct permissions are applied!
 # chown ${STACKABLE_USER_UID}:0
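A hedged sketch of what a compliant later addition would look like, now that the recursive chown/chmod over /stackable is gone; the script name is hypothetical:

```dockerfile
# Hypothetical example only: any file added below the comment above must
# get explicit ownership and group permissions applied.
COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/stackable/extra-tool.sh /stackable/extra-tool.sh
RUN chmod g=u /stackable/extra-tool.sh
```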