107107
108108
109109# spark-builder: Build Spark into /stackable/spark-${PRODUCT}/dist,
110- # download additional JARs and perform checks, like log4shell check.
110+ # download additional JARs and perform checks
111111FROM stackable/image/java-devel AS spark-builder
112112
113113ARG PRODUCT
@@ -139,20 +139,15 @@ COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-source-builder \
139139# 134.0 [ERROR] Detected Maven Version: 3.6.3 is not in the allowed range [3.8.8,)
140140RUN export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" \
141141 && ./dev/make-distribution.sh \
142- -Dhadoop.version="$HADOOP" \
143- -Dmaven.test.skip=true \
144- -DskipTests \
145- -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
146- --no-transfer-progress \
147- --batch-mode
142+ -Dhadoop.version="$HADOOP" \
143+ -Dmaven.test.skip=true \
144+ -DskipTests \
145+ -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
146+ --no-transfer-progress \
147+ --batch-mode
148148
149149# <<< Build spark
150150
151- # Get the correct `tini` binary for our architecture.
152- RUN curl -o /usr/bin/tini "https://repo.stackable.tech/repository/packages/tini/tini-${TINI}-${TARGETARCH}" \
153- && chmod +x /usr/bin/tini
154-
155- # We download these under dist so that log4shell checks them
156151WORKDIR /stackable/spark-${PRODUCT}/dist/jars
157152
158153# Copy modules required for s3a://
@@ -190,37 +185,45 @@ COPY --from=hbase-builder --chown=${STACKABLE_USER_UID}:0 \
190185 /stackable/hbase/lib/client-facing-thirdparty/opentelemetry-semconv-*-alpha.jar \
191186 ./
192187
188+ WORKDIR /stackable/spark-${PRODUCT}/dist/connect
189+
190+ # As of version 3.5.5, spark-connect jars are not included in the dist folder.
191+ # To avoid classpath conflicts with existing spark applications,
192+ # we create a new dist/connect folder, and copy them here.
193+ RUN cp /stackable/spark-${PRODUCT}/connector/connect/server/target/spark-connect_*-${PRODUCT}.jar . \
194+ && cp /stackable/spark-${PRODUCT}/connector/connect/common/target/spark-connect-common_*-${PRODUCT}.jar . \
195+ && cp /stackable/spark-${PRODUCT}/connector/connect/client/jvm/target/spark-connect-client-jvm_2.12-${PRODUCT}.jar .
196+
197+ COPY spark-k8s/stackable/jmx /stackable/jmx
198+
193199WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars
194200
201+ RUN <<EOF
195202# Download jackson-dataformat-xml, stax2-api, and woodstox-core which are required for logging.
196- RUN curl -O https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar \
197- && curl -O https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar \
198- && curl -O https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar
203+ curl --fail https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar \
204+ -o /stackable/spark-${PRODUCT}/dist/extra-jars/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar
205+ curl --fail https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar \
206+ -o /stackable/spark-${PRODUCT}/dist/extra-jars/stax2-api-${STAX2_API}.jar
207+ curl --fail https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar \
208+ -o /stackable/spark-${PRODUCT}/dist/extra-jars/woodstox-core-${WOODSTOX_CORE}.jar
199209
200- WORKDIR /stackable/jmx
210+ # Get the correct `tini` binary for our architecture.
211+ curl --fail "https://repo.stackable.tech/repository/packages/tini/tini-${TINI}-${TARGETARCH}" \
212+ -o /usr/bin/tini
213+ chmod +x /usr/bin/tini
201214
202- RUN curl -O "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
215+ # JMX Exporter
216+ curl --fail "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" \
217+ -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
218+ ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
203219
204- # ===
205- # Mitigation for CVE-2021-44228 (Log4Shell)
206- #
207- # For earlier versions this script removes the .class file that contains the
208- # vulnerable code.
209- # TODO: This can be restricted to target only versions which do not honor the environment
210- # varible that has been set above but this has not currently been implemented
211- COPY shared/log4shell.sh /bin
212- RUN /bin/log4shell.sh /stackable/spark-${PRODUCT}/dist
213-
214- # Ensure no vulnerable files are left over
215- # This will currently report vulnerable files being present, as it also alerts on
216- # SocketNode.class, which we do not remove with our scripts.
217- # Further investigation will be needed whether this should also be removed.
218- COPY shared/log4shell_1.6.1-log4shell_Linux_x86_64 /bin/log4shell_scanner_x86_64
219- COPY shared/log4shell_1.6.1-log4shell_Linux_aarch64 /bin/log4shell_scanner_aarch64
220- COPY shared/log4shell_scanner /bin/log4shell_scanner
221- RUN /bin/log4shell_scanner s /stackable/spark-${PRODUCT}/dist
222- # ===
220+ chmod -R g=u /stackable/spark-${PRODUCT}/dist
221+ chmod -R g=u /stackable/spark-${PRODUCT}/assembly/target/bom.json
222+ chmod -R g=u /stackable/jmx
223+ EOF
223224
225+ # TODO: java-base installs the Adoptium dnf repo and the Temurin jre which is not needed here.
226+ # To reduce the size of this image, the Adoptium repo could be moved to stackable-base instead.
224227FROM stackable/image/java-base AS final
225228
226229ARG PRODUCT
@@ -240,49 +243,51 @@ LABEL name="Apache Spark" \
240243
241244ENV HOME=/stackable
242245ENV SPARK_HOME=/stackable/spark
243- ENV PATH=$SPARK_HOME:$PATH:/bin:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$HOME/.local/bin
246+ # Override the java-base version of JAVA_HOME to point to the jdk.
247+ ENV JAVA_HOME="/usr/lib/jvm/temurin-${JAVA_VERSION}-jdk"
248+ ENV PATH=$SPARK_HOME/bin:$JAVA_HOME/bin:$PATH
244249ENV PYSPARK_PYTHON=/usr/bin/python
245250ENV PYTHONPATH=$SPARK_HOME/python
246251
247- COPY spark-k8s/stackable /stackable
248- COPY spark-k8s/licenses /licenses
249252
250253COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark-${PRODUCT}/dist /stackable/spark
251254COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark-${PRODUCT}/assembly/target/bom.json /stackable/spark/spark-${PRODUCT}.cdx.json
252255COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/jmx /stackable/jmx
253256COPY --from=spark-builder /usr/bin/tini /usr/bin/tini
254257
258+ COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/stackable/run-spark.sh /stackable/run-spark.sh
259+ COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/licenses /licenses
260+
255261RUN <<EOF
256262microdnf update
257- # procps: required for spark startup scripts
258- # java-*-openjdk-devel: This is needed by the Spark UI to display process information using jps and jmap
259- # Copying just the binaries from the builder stage failed.
260- microdnf install \
263+
264+ # procps:
265+ # Required for spark startup scripts.
266+ # temurin-{version}-jdk:
267+ # Needed by the Spark UI to display process information using "jps" and "jmap".
268+ # Spark-Connect needs "javac" to compile auto-generated classes on the fly.
269+ microdnf install --nodocs \
261270 gzip \
262271 hostname \
263272 procps \
264273 "python${PYTHON}" \
265274 "python${PYTHON}-pip" \
266275 zip \
267- "java-${JAVA_VERSION}-openjdk-devel"
276+ "temurin-${JAVA_VERSION}-jdk"
268277microdnf clean all
269278rm -rf /var/cache/yum
270279
271280ln -s /usr/bin/python${PYTHON} /usr/bin/python
272281ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip
273282
274- ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
275283# Symlink example jar, so that we can easily use it in tests
276284ln -s /stackable/spark/examples/jars/spark-examples_*.jar /stackable/spark/examples/jars/spark-examples.jar
277-
278- # All files and folders owned by root group to support running as arbitrary users.
279- # This is best practice as all container users will belong to the root group (0).
280- chown -R ${STACKABLE_USER_UID}:0 /stackable
281- chmod -R g=u /stackable
285+ chown -h ${STACKABLE_USER_UID}:0 /stackable/spark/examples/jars/spark-examples.jar
282286EOF
283287
288+
284289# ----------------------------------------
285- # Attention: We are changing the group of all files in /stackable directly above
290+ # Attention:
286291# If you do any file based actions (copying / creating etc.) below this comment you
287292# absolutely need to make sure that the correct permissions are applied!
288293# chown ${STACKABLE_USER_UID}:0
0 commit comments