From d2aaa3e9f45baef366b99073f08c4b0004050348 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Fri, 14 Mar 2025 17:31:54 +0100 Subject: [PATCH 1/7] reduce size --- hadoop/Dockerfile | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/hadoop/Dockerfile b/hadoop/Dockerfile index acb4e0947..d2f1adc2c 100644 --- a/hadoop/Dockerfile +++ b/hadoop/Dockerfile @@ -69,7 +69,9 @@ RUN curl "https://repo.stackable.tech/repository/packages/hadoop/hadoop-${PRODUC mv hadoop-dist/target/bom.json /stackable/hadoop-${PRODUCT}/hadoop-${PRODUCT}.cdx.json && \ # HDFS fuse-dfs is not part of the regular dist output, so we need to copy it in ourselves cp hadoop-hdfs-project/hadoop-hdfs-native-client/target/main/native/fuse-dfs/fuse_dfs /stackable/hadoop-${PRODUCT}/bin && \ - rm -rf /stackable/hadoop-${PRODUCT}-src + rm -rf /stackable/hadoop-${PRODUCT}-src && \ + # Set correct group + chmod -R g=u /stackable # For earlier versions this script removes the .class file that contains the # vulnerable code. @@ -118,7 +120,9 @@ RUN curl "https://github.com/stackabletech/hdfs-utils/archive/refs/tags/v${HDFS_ mvn --batch-mode --no-transfer-progress clean package -P hadoop-${PRODUCT} -DskipTests -Dmaven.javadoc.skip=true && \ mkdir -p /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib && \ cp target/hdfs-utils-$HDFS_UTILS.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar && \ - rm -rf /stackable/hdfs-utils-main + rm -rf /stackable/hdfs-utils-main && \ + # Set correct group + chmod -R g=u /stackable FROM stackable/image/java-base AS final @@ -128,12 +132,12 @@ ARG HDFS_UTILS ARG STACKABLE_USER_UID LABEL name="Apache Hadoop" \ - maintainer="info@stackable.tech" \ - vendor="Stackable GmbH" \ - version="${PRODUCT}" \ - release="${RELEASE}" \ - summary="The Stackable image for Apache Hadoop." \ - description="This image is deployed by the Stackable Operator for Apache Hadoop / HDFS." + maintainer="info@stackable.tech" \ + vendor="Stackable GmbH" \ + version="${PRODUCT}" \ + release="${RELEASE}" \ + summary="The Stackable image for Apache Hadoop." \ + description="This image is deployed by the Stackable Operator for Apache Hadoop / HDFS." COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT}/ COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/jmx /stackable/jmx/ @@ -159,6 +163,9 @@ microdnf clean all rm -rf /var/cache/yum ln -s /stackable/hadoop-${PRODUCT} /stackable/hadoop + +# Set correct group +chmod -R g=u /stackable/fuse_dfs_wrapper mv /stackable/fuse_dfs_wrapper /stackable/hadoop/bin # Remove unneeded binaries: @@ -179,11 +186,6 @@ find . -name 'hadoop-*tests.jar' -type f -delete # Without this fuse_dfs does not work # It is so non-root users (as we are) can mount a FUSE device and let other users access it echo "user_allow_other" > /etc/fuse.conf - -# All files and folders owned by root group to support running as arbitrary users. -# This is best practice as all container users will belong to the root group (0). -chown -R ${STACKABLE_USER_UID}:0 /stackable -chmod -R g=u /stackable EOF COPY hadoop/licenses /licenses From c4ef1a2aba2db30329b3924605b7318da4136519 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 26 Mar 2025 15:09:47 +0100 Subject: [PATCH 2/7] adapt changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2897fdb86..61ea22b92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +### Fixed + +- hadoop: reduce docker image size by removing the recursive chown/chmods in the final image ([#1029]). + +[#1029]: https://github.com/stackabletech/docker-images/pull/1029 + ## [25.3.0] - 2025-03-21 ### Added From 1ca3c9795d7a8f8cd532b74eadbd5719fbf9c873 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 26 Mar 2025 15:26:33 +0100 Subject: [PATCH 3/7] remove log4shell & jmx softlink --- hadoop/Dockerfile | 32 ++++---------------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/hadoop/Dockerfile b/hadoop/Dockerfile index d2f1adc2c..b5dfb9ef2 100644 --- a/hadoop/Dockerfile +++ b/hadoop/Dockerfile @@ -13,16 +13,9 @@ ARG STACKABLE_USER_UID WORKDIR /stackable/jmx -# The symlink from JMX Exporter 0.16.1 to the versionless link exists because old HDFS Operators (up until and including 23.7) used to hardcode -# the version of JMX Exporter like this: "-javaagent:/stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar" -# This is a TEMPORARY fix which means that we can keep the hardcoded path in HDFS operator FOR NOW as it will still point to a newer version of JMX Exporter, despite the "0.16.1" in the name. -# At the same time a new HDFS Operator will still work with older images which do not have the symlink to the versionless jar. -# After one of our next releases (23.11 or 24.x) we should update the operator to point at the non-versioned symlink (jmx_prometheus_javaagent.jar) -# And then we can also remove the symlink to 0.16.1 from this Dockerfile. RUN curl "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \ chmod -x "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \ - ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar && \ - ln -s /stackable/jmx/jmx_prometheus_javaagent.jar /stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar + ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar WORKDIR /stackable @@ -73,23 +66,6 @@ RUN curl "https://repo.stackable.tech/repository/packages/hadoop/hadoop-${PRODUC # Set correct group chmod -R g=u /stackable -# For earlier versions this script removes the .class file that contains the -# vulnerable code. -# TODO: This can be restricted to target only versions which do not honor the environment -# varible that has been set above but this has not currently been implemented -COPY shared/log4shell.sh /bin -RUN /bin/log4shell.sh "/stackable/hadoop-${PRODUCT}" - -# Ensure no vulnerable files are left over -# This will currently report vulnerable files being present, as it also alerts on -# SocketNode.class, which we do not remove with our scripts. -# Further investigation will be needed whether this should also be removed. -COPY shared/log4shell_1.6.1-log4shell_Linux_x86_64 /bin/log4shell_scanner_x86_64 -COPY shared/log4shell_1.6.1-log4shell_Linux_aarch64 /bin/log4shell_scanner_aarch64 -COPY shared/log4shell_scanner /bin/log4shell_scanner -RUN /bin/log4shell_scanner s "/stackable/hadoop-${PRODUCT}" -# === - FROM stackable/image/java-devel AS hdfs-utils-builder ARG HDFS_UTILS @@ -143,8 +119,10 @@ COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/hadoop-${P COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/jmx /stackable/jmx/ COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/async-profiler /stackable/async-profiler/ COPY --chown=${STACKABLE_USER_UID}:0 --from=hdfs-utils-builder /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar + COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/fuse_dfs_wrapper /stackable/ COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/jmx /stackable/jmx +COPY --chown=${STACKABLE_USER_UID}:0 hadoop/licenses /licenses # fuse is required for fusermount (called by fuse_dfs) @@ -188,10 +166,8 @@ find . -name 'hadoop-*tests.jar' -type f -delete echo "user_allow_other" > /etc/fuse.conf EOF -COPY hadoop/licenses /licenses - # ---------------------------------------- -# Attention: We are changing the group of all files in /stackable directly above +# Attention: # If you do any file based actions (copying / creating etc.) below this comment you # absolutely need to make sure that the correct permissions are applied! # chown ${STACKABLE_USER_UID}:0 From 7d3529a8d58ccf394be9ce0a9741e3023871b537 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 26 Mar 2025 16:52:36 +0100 Subject: [PATCH 4/7] fixes & cleanup --- hadoop/Dockerfile | 189 ++++++++++++++++++++++++++-------------------- 1 file changed, 108 insertions(+), 81 deletions(-) diff --git a/hadoop/Dockerfile b/hadoop/Dockerfile index b5dfb9ef2..a8d918e28 100644 --- a/hadoop/Dockerfile +++ b/hadoop/Dockerfile @@ -11,60 +11,100 @@ ARG TARGETARCH ARG TARGETOS ARG STACKABLE_USER_UID -WORKDIR /stackable/jmx - -RUN curl "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \ - chmod -x "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \ - ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar - -WORKDIR /stackable - -RUN ARCH="${TARGETARCH/amd64/x64}" && \ - curl "https://repo.stackable.tech/repository/packages/async-profiler/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}.tar.gz" | tar -xzC . && \ - ln -s "/stackable/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}" /stackable/async-profiler - # This Protobuf version is the exact version as used in the Hadoop Dockerfile # See https://github.com/apache/hadoop/blob/trunk/dev-support/docker/pkg-resolver/install-protobuf.sh # (this was hardcoded in the Dockerfile in earlier versions of Hadoop, make sure to look at the exact version in Github) WORKDIR /opt/protobuf-src -RUN curl https://repo.stackable.tech/repository/packages/protobuf/protobuf-java-${PROTOBUF}.tar.gz -o /opt/protobuf.tar.gz && \ - tar xzf /opt/protobuf.tar.gz --strip-components 1 --no-same-owner && \ - ./configure --prefix=/opt/protobuf && \ - make "-j$(nproc)" && \ - make install && \ +RUN < not sure @@ -140,27 +188,6 @@ microdnf install \ microdnf clean all rm -rf /var/cache/yum -ln -s /stackable/hadoop-${PRODUCT} /stackable/hadoop - -# Set correct group -chmod -R g=u /stackable/fuse_dfs_wrapper -mv /stackable/fuse_dfs_wrapper /stackable/hadoop/bin - -# Remove unneeded binaries: -# - code sources -# - mapreduce/yarn binaries that were built as cross-project dependencies -# - minicluster (only used for testing) and test .jars -# - json-io: this is a transitive dependency pulled in by cedarsoft/java-utils/json-io and is excluded in 3.4.0. See CVE-2023-34610. -rm -rf /stackable/hadoop/share/hadoop/common/sources/ -rm -rf /stackable/hadoop/share/hadoop/hdfs/sources/ -rm -rf /stackable/hadoop/share/hadoop/tools/sources/ -rm -rf /stackable/hadoop/share/hadoop/tools/lib/json-io-*.jar -rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-mapreduce-client-*.jar -rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-yarn-server*.jar -find . -name 'hadoop-minicluster-*.jar' -type f -delete -find . -name 'hadoop-client-minicluster-*.jar' -type f -delete -find . -name 'hadoop-*tests.jar' -type f -delete - # Without this fuse_dfs does not work # It is so non-root users (as we are) can mount a FUSE device and let other users access it echo "user_allow_other" > /etc/fuse.conf From 37941265f6cc82b640356d4f0b77f257eaa2d2ee Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 26 Mar 2025 16:54:03 +0100 Subject: [PATCH 5/7] fix linter --- hadoop/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop/Dockerfile b/hadoop/Dockerfile index a8d918e28..ab575f736 100644 --- a/hadoop/Dockerfile +++ b/hadoop/Dockerfile @@ -31,7 +31,7 @@ RUN < Date: Wed, 26 Mar 2025 17:10:51 +0100 Subject: [PATCH 6/7] do not find in root --- hadoop/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hadoop/Dockerfile b/hadoop/Dockerfile index ab575f736..582533196 100644 --- a/hadoop/Dockerfile +++ b/hadoop/Dockerfile @@ -97,9 +97,9 @@ rm -rf /stackable/hadoop/share/hadoop/tools/sources/ rm -rf /stackable/hadoop/share/hadoop/tools/lib/json-io-*.jar rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-mapreduce-client-*.jar rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-yarn-server*.jar -find / -name 'hadoop-minicluster-*.jar' -type f -delete -find / -name 'hadoop-client-minicluster-*.jar' -type f -delete -find / -name 'hadoop-*tests.jar' -type f -delete +find /stackable -name 'hadoop-minicluster-*.jar' -type f -delete +find /stackable -name 'hadoop-client-minicluster-*.jar' -type f -delete +find /stackable -name 'hadoop-*tests.jar' -type f -delete rm -rf /stackable/.m2 # Set correct groups; make sure only required artifacts for the final image are located in /stackable From a91b84d0bbbfd9e132ca75eab52777fa4f421579 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Thu, 3 Apr 2025 22:04:57 +0200 Subject: [PATCH 7/7] add check permissions script --- hadoop/Dockerfile | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/hadoop/Dockerfile b/hadoop/Dockerfile index 582533196..30f129944 100644 --- a/hadoop/Dockerfile +++ b/hadoop/Dockerfile @@ -186,6 +186,9 @@ microdnf install \ fuse-libs \ tar microdnf clean all +rpm -qa --qf "%{NAME}-%{VERSION}-%{RELEASE}\n" | sort > /stackable/package_manifest.txt +chown ${STACKABLE_USER_UID}:0 /stackable/package_manifest.txt +chmod g=u /stackable/package_manifest.txt rm -rf /var/cache/yum # Without this fuse_dfs does not work @@ -194,12 +197,21 @@ echo "user_allow_other" > /etc/fuse.conf EOF # ---------------------------------------- -# Attention: -# If you do any file based actions (copying / creating etc.) below this comment you -# absolutely need to make sure that the correct permissions are applied! -# chown ${STACKABLE_USER_UID}:0 +# Checks +# This section is to run final checks to ensure the created final images +# adhere to several minimal requirements like: +# - check file permissions and ownerships # ---------------------------------------- +# Check that permissions and ownership in /stackable are set correctly +# This will fail and stop the build if any mismatches are found. +RUN <