@@ -11,82 +11,100 @@ ARG TARGETARCH
 ARG TARGETOS
 ARG STACKABLE_USER_UID

-WORKDIR /stackable/jmx
-
-# The symlink from JMX Exporter 0.16.1 to the versionless link exists because old HDFS Operators (up until and including 23.7) used to hardcode
-# the version of JMX Exporter like this: "-javaagent:/stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar"
-# This is a TEMPORARY fix which means that we can keep the hardcoded path in HDFS operator FOR NOW as it will still point to a newer version of JMX Exporter, despite the "0.16.1" in the name.
-# At the same time a new HDFS Operator will still work with older images which do not have the symlink to the versionless jar.
-# After one of our next releases (23.11 or 24.x) we should update the operator to point at the non-versioned symlink (jmx_prometheus_javaagent.jar)
-# And then we can also remove the symlink to 0.16.1 from this Dockerfile.
-RUN curl "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \
-    chmod -x "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \
-    ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar && \
-    ln -s /stackable/jmx/jmx_prometheus_javaagent.jar /stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar
-
-WORKDIR /stackable
-
-RUN ARCH="${TARGETARCH/amd64/x64}" && \
-    curl "https://repo.stackable.tech/repository/packages/async-profiler/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}.tar.gz" | tar -xzC . && \
-    ln -s "/stackable/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}" /stackable/async-profiler
-
 # This Protobuf version is the exact version as used in the Hadoop Dockerfile
 # See https://github.com/apache/hadoop/blob/trunk/dev-support/docker/pkg-resolver/install-protobuf.sh
 # (this was hardcoded in the Dockerfile in earlier versions of Hadoop, make sure to look at the exact version on GitHub)
 WORKDIR /opt/protobuf-src
-RUN curl https://repo.stackable.tech/repository/packages/protobuf/protobuf-java-${PROTOBUF}.tar.gz -o /opt/protobuf.tar.gz && \
-    tar xzf /opt/protobuf.tar.gz --strip-components 1 --no-same-owner && \
-    ./configure --prefix=/opt/protobuf && \
-    make "-j$(nproc)" && \
-    make install && \
+RUN <<EOF
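+# Note: a heredoc RUN executes as a single shell script; unlike the `&& \` chain it
+# replaces, a failing intermediate command is not guaranteed to abort the build unless
+# the shell runs with `set -e` (behaviour depends on the builder's default shell).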
+curl https://repo.stackable.tech/repository/packages/protobuf/protobuf-java-${PROTOBUF}.tar.gz -o /opt/protobuf.tar.gz
+tar xzf /opt/protobuf.tar.gz --strip-components 1 --no-same-owner
+./configure --prefix=/opt/protobuf
+make "-j$(nproc)"
+make install
 rm -rf /opt/protobuf-src
+EOF

 ENV PROTOBUF_HOME=/opt/protobuf
 ENV PATH="${PATH}:/opt/protobuf/bin"

-RUN rpm --install --replacepkgs https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
-RUN microdnf update && \
-    microdnf install \
-    # boost is a build dependency starting in Hadoop 3.4.0 if compiling native code
-    boost1.78-devel && \
-    microdnf clean all && \
-    rm -rf /var/cache/yum
+RUN <<EOF
+rpm --install --replacepkgs https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
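+# EPEL is needed here because the boost development packages installed below are not shipped in the UBI repositories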
+microdnf update
+# boost is a build dependency starting in Hadoop 3.4.0 if compiling native code
+microdnf install boost1.78-devel
+microdnf clean all
+rm -rf /var/cache/yum
+EOF

 WORKDIR /stackable
+RUN <<EOF
+# async-profiler
+ARCH="${TARGETARCH/amd64/x64}"
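+# (Docker's TARGETARCH reports "amd64"/"arm64", while the async-profiler archives are named "x64"/"arm64", hence the substitution)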
+curl "https://repo.stackable.tech/repository/packages/async-profiler/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}.tar.gz" | tar -xzC .
+ln -s "/stackable/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}" /stackable/async-profiler
+
+# JMX Exporter
+mkdir /stackable/jmx
+curl "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
+chmod -x "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
+ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
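+# The versionless symlink gives operators a stable agent path, e.g.
+# -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=<port>:<config> (arguments illustrative)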
+EOF

-COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/patches /stackable/patches
-
+WORKDIR /build
+COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/fuse_dfs_wrapper /build
+COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/patches /build/patches
+COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/jmx /stackable/jmx
 # Hadoop Pipes requires libtirpc to build, whose headers are not packaged in RedHat UBI, so skip building this module
 # Build from source to enable FUSE module, and to apply custom patches.
 # Also skip building the yarn, mapreduce and minicluster modules: this will result in the modules being excluded but not all
 # jar files will be stripped if they are needed elsewhere e.g. share/hadoop/yarn will not be part of the build, but yarn jars
 # will still exist in share/hadoop/tools as they would be needed by the resource estimator tool. Such jars are removed in a later step.
-RUN curl "https://repo.stackable.tech/repository/packages/hadoop/hadoop-${PRODUCT}-src.tar.gz" | tar -xzC . && \
-    patches/apply_patches.sh ${PRODUCT} && \
-    cd hadoop-${PRODUCT}-src && \
-    mvn --batch-mode --no-transfer-progress clean package -Pdist,native -pl '!hadoop-tools/hadoop-pipes,!hadoop-yarn-project,!hadoop-mapreduce-project,!hadoop-minicluster' -Drequire.fuse=true -DskipTests -Dmaven.javadoc.skip=true && \
-    cp -r hadoop-dist/target/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT} && \
-    mv hadoop-dist/target/bom.json /stackable/hadoop-${PRODUCT}/hadoop-${PRODUCT}.cdx.json && \
-    # HDFS fuse-dfs is not part of the regular dist output, so we need to copy it in ourselves
-    cp hadoop-hdfs-project/hadoop-hdfs-native-client/target/main/native/fuse-dfs/fuse_dfs /stackable/hadoop-${PRODUCT}/bin && \
-    rm -rf /stackable/hadoop-${PRODUCT}-src
-
-# For earlier versions this script removes the .class file that contains the
-# vulnerable code.
-# TODO: This can be restricted to target only versions which do not honor the environment
-# varible that has been set above but this has not currently been implemented
-COPY shared/log4shell.sh /bin
-RUN /bin/log4shell.sh "/stackable/hadoop-${PRODUCT}"
-
-# Ensure no vulnerable files are left over
-# This will currently report vulnerable files being present, as it also alerts on
-# SocketNode.class, which we do not remove with our scripts.
-# Further investigation will be needed whether this should also be removed.
-COPY shared/log4shell_1.6.1-log4shell_Linux_x86_64 /bin/log4shell_scanner_x86_64
-COPY shared/log4shell_1.6.1-log4shell_Linux_aarch64 /bin/log4shell_scanner_aarch64
-COPY shared/log4shell_scanner /bin/log4shell_scanner
-RUN /bin/log4shell_scanner s "/stackable/hadoop-${PRODUCT}"
-# ===
+RUN <<EOF
+curl "https://repo.stackable.tech/repository/packages/hadoop/hadoop-${PRODUCT}-src.tar.gz" | tar -xzC .
+
+patches/apply_patches.sh ${PRODUCT}
+cd hadoop-${PRODUCT}-src
+
+mvn \
+  --batch-mode \
+  --no-transfer-progress \
+  clean package \
+  -Pdist,native \
+  -pl '!hadoop-tools/hadoop-pipes,!hadoop-yarn-project,!hadoop-mapreduce-project,!hadoop-minicluster' \
+  -Drequire.fuse=true \
+  -DskipTests \
+  -Dmaven.javadoc.skip=true
+
+cp -r hadoop-dist/target/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT}
+mv hadoop-dist/target/bom.json /stackable/hadoop-${PRODUCT}/hadoop-${PRODUCT}.cdx.json
+
+# HDFS fuse-dfs is not part of the regular dist output, so we need to copy it in ourselves
+cp hadoop-hdfs-project/hadoop-hdfs-native-client/target/main/native/fuse-dfs/fuse_dfs /stackable/hadoop-${PRODUCT}/bin
+rm -rf /build/hadoop-${PRODUCT}-src
+
+ln -s /stackable/hadoop-${PRODUCT} /stackable/hadoop
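+# the versionless /stackable/hadoop symlink gives later steps and the operators a path that is stable across Hadoop upgrades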
+
+mv /build/fuse_dfs_wrapper /stackable/hadoop/bin
+
+# Remove unneeded binaries:
+#  - code sources
+#  - mapreduce/yarn binaries that were built as cross-project dependencies
+#  - minicluster (only used for testing) and test .jars
+#  - json-io: this is a transitive dependency pulled in by cedarsoft/java-utils/json-io and is excluded in 3.4.0. See CVE-2023-34610.
+rm -rf /stackable/hadoop/share/hadoop/common/sources/
+rm -rf /stackable/hadoop/share/hadoop/hdfs/sources/
+rm -rf /stackable/hadoop/share/hadoop/tools/sources/
+rm -rf /stackable/hadoop/share/hadoop/tools/lib/json-io-*.jar
+rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-mapreduce-client-*.jar
+rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-yarn-server*.jar
+find /stackable -name 'hadoop-minicluster-*.jar' -type f -delete
+find /stackable -name 'hadoop-client-minicluster-*.jar' -type f -delete
+find /stackable -name 'hadoop-*tests.jar' -type f -delete
+rm -rf /stackable/.m2
+
+# Set correct groups; make sure only required artifacts for the final image are located in /stackable
+chmod -R g=u /stackable
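+# (g=u gives the root group (GID 0) the same permissions as the owner, so the image can run with an arbitrary UID that belongs to the root group)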
+EOF

 FROM stackable/image/java-devel AS hdfs-utils-builder

@@ -99,26 +117,40 @@ ARG STACKABLE_USER_UID
 # Dockerfile, which needs Java 11. So we need to also use the java-devel image in version 11 and
 # install Java 17 ourselves.
 # The Adoptium yum repo is already added by the java-devel Dockerfile.
-RUN microdnf update && \
-    microdnf install -y temurin-17-jdk && \
-    microdnf clean all && \
-    rm -rf /var/cache/yum
+RUN <<EOF
+microdnf update
+microdnf install -y temurin-17-jdk
+microdnf clean all
+rm -rf /var/cache/yum
+EOF
+
 ENV JAVA_HOME="/usr/lib/jvm/temurin-17-jdk"

 USER ${STACKABLE_USER_UID}
-WORKDIR /stackable
-
+WORKDIR /build
 # The Stackable HDFS utils contain an OPA authorizer, group mapper & topology provider.
 # The topology provider provides rack awareness functionality for HDFS by allowing users to specify Kubernetes
 # labels to build a rackID from.
 # Starting with hdfs-utils version 0.3.0 the topology provider is no longer a standalone jar and is included in hdfs-utils.
-
-RUN curl "https://github.com/stackabletech/hdfs-utils/archive/refs/tags/v${HDFS_UTILS}.tar.gz" | tar -xzC . && \
-    cd hdfs-utils-${HDFS_UTILS} && \
-    mvn --batch-mode --no-transfer-progress clean package -P hadoop-${PRODUCT} -DskipTests -Dmaven.javadoc.skip=true && \
-    mkdir -p /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib && \
-    cp target/hdfs-utils-$HDFS_UTILS.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar && \
-    rm -rf /stackable/hdfs-utils-main
+RUN <<EOF
+curl "https://github.com/stackabletech/hdfs-utils/archive/refs/tags/v${HDFS_UTILS}.tar.gz" | tar -xzC .
+cd hdfs-utils-${HDFS_UTILS}
+
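+# -P hadoop-${PRODUCT} selects the Maven profile matching the Hadoop version the utils are compiled against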
+mvn \
+  --batch-mode \
+  --no-transfer-progress \
+  clean package \
+  -P hadoop-${PRODUCT} \
+  -DskipTests \
+  -Dmaven.javadoc.skip=true
+
+mkdir -p /stackable
+cp target/hdfs-utils-${HDFS_UTILS}.jar /stackable/hdfs-utils-${HDFS_UTILS}.jar
+cd ..
+rm -rf hdfs-utils-${HDFS_UTILS}
+
+# Set correct groups
+chmod g=u /stackable/hdfs-utils-${HDFS_UTILS}.jar
+EOF

 FROM stackable/image/java-base AS final

@@ -127,21 +159,19 @@ ARG RELEASE
 ARG HDFS_UTILS
 ARG STACKABLE_USER_UID

-LABEL name="Apache Hadoop" \
-      vendor="Stackable GmbH" \
-      version="${PRODUCT}" \
-      release="${RELEASE}" \
-      summary="The Stackable image for Apache Hadoop." \
-      description="This image is deployed by the Stackable Operator for Apache Hadoop / HDFS."
-
-COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT}/
-COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/jmx /stackable/jmx/
-COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/async-profiler /stackable/async-profiler/
-COPY --chown=${STACKABLE_USER_UID}:0 --from=hdfs-utils-builder /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar
-COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/fuse_dfs_wrapper /stackable/
-COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/jmx /stackable/jmx
+LABEL \
+  name="Apache Hadoop" \
+  vendor="Stackable GmbH" \
+  version="${PRODUCT}" \
+  release="${RELEASE}" \
+  summary="The Stackable image for Apache Hadoop." \
+  description="This image is deployed by the Stackable Operator for Apache Hadoop / HDFS."
+
+COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable /stackable
+COPY --chown=${STACKABLE_USER_UID}:0 --from=hdfs-utils-builder /stackable/hdfs-utils-${HDFS_UTILS}.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar

+COPY --chown=${STACKABLE_USER_UID}:0 hadoop/licenses /licenses

 # fuse is required for fusermount (called by fuse_dfs)
 # fuse-libs is required for fuse_dfs (not included in fuse)
@@ -156,44 +186,31 @@ microdnf install \
   fuse-libs \
   tar
 microdnf clean all
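+# record all installed RPMs so the contents of the final image can be audited and scanned later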
+rpm -qa --qf "%{NAME}-%{VERSION}-%{RELEASE}\n" | sort > /stackable/package_manifest.txt
+chown ${STACKABLE_USER_UID}:0 /stackable/package_manifest.txt
+chmod g=u /stackable/package_manifest.txt
 rm -rf /var/cache/yum

-ln -s /stackable/hadoop-${PRODUCT} /stackable/hadoop
-mv /stackable/fuse_dfs_wrapper /stackable/hadoop/bin
-
-# Remove unneeded binaries:
-#  - code sources
-#  - mapreduce/yarn binaries that were built as cross-project dependencies
-#  - minicluster (only used for testing) and test .jars
-#  - json-io: this is a transitive dependency pulled in by cedarsoft/java-utils/json-io and is excluded in 3.4.0. See CVE-2023-34610.
-rm -rf /stackable/hadoop/share/hadoop/common/sources/
-rm -rf /stackable/hadoop/share/hadoop/hdfs/sources/
-rm -rf /stackable/hadoop/share/hadoop/tools/sources/
-rm -rf /stackable/hadoop/share/hadoop/tools/lib/json-io-*.jar
-rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-mapreduce-client-*.jar
-rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-yarn-server*.jar
-find . -name 'hadoop-minicluster-*.jar' -type f -delete
-find . -name 'hadoop-client-minicluster-*.jar' -type f -delete
-find . -name 'hadoop-*tests.jar' -type f -delete
-
 # Without this fuse_dfs does not work
 # It is so non-root users (as we are) can mount a FUSE device and let other users access it
 echo "user_allow_other" > /etc/fuse.conf
-
-# All files and folders owned by root group to support running as arbitrary users.
-# This is best practice as all container users will belong to the root group (0).
-chown -R ${STACKABLE_USER_UID}:0 /stackable
-chmod -R g=u /stackable
 EOF

-COPY hadoop/licenses /licenses
-
 # ----------------------------------------
-# Attention: We are changing the group of all files in /stackable directly above
-# If you do any file based actions (copying / creating etc.) below this comment you
-# absolutely need to make sure that the correct permissions are applied!
-# chown ${STACKABLE_USER_UID}:0
+# Checks
+# This section runs final checks to ensure the created image
+# adheres to several minimal requirements, such as:
+# - correct file permissions and ownership
+# ----------------------------------------
+
+# Check that permissions and ownership in /stackable are set correctly.
+# This will fail and stop the build if any mismatches are found.
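+# (the check script is assumed to be provided by the base image; its arguments appear to be <directory> <uid> <gid>)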
+RUN <<EOF
+/bin/check-permissions-ownership.sh /stackable ${STACKABLE_USER_UID} 0
+EOF
+
 # ----------------------------------------
+# Attention: Do not perform any file based actions (copying / creating etc.) below this comment because the permissions would not be checked.

 USER ${STACKABLE_USER_UID}
