4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,8 @@ All notable changes to this project will be documented in this file.

### Added

- hadoop: check for correct permissions and ownerships in /stackable folder via
`check-permissions-ownership.sh` provided in stackable-base image ([#1029]).
- spark-connect-client: A new image for Spark Connect tests and demos ([#1034]).
- nifi: check for correct permissions and ownerships in /stackable folder via
`check-permissions-ownership.sh` provided in stackable-base image ([#1027]).
@@ -16,11 +18,13 @@ All notable changes to this project will be documented in this file.

### Fixed

- hadoop: reduce docker image size by removing the recursive chown/chmods in the final image ([#1029]).
- nifi: reduce docker image size by removing the recursive chown/chmods in the final image ([#1027]).
- spark-k8s: reduce docker image size by removing the recursive chown/chmods in the final image ([#1042]).
- Add `--locked` flag to `cargo install` commands for reproducible builds ([#1044]); see the sketch after the link list below.

[#1027]: https://github.com/stackabletech/docker-images/pull/1027
[#1029]: https://github.com/stackabletech/docker-images/pull/1029
[#1034]: https://github.com/stackabletech/docker-images/pull/1034
[#1042]: https://github.com/stackabletech/docker-images/pull/1042
[#1044]: https://github.com/stackabletech/docker-images/pull/1044
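
For context, a minimal sketch of what the `--locked` flag does (the crate name below is hypothetical, purely for illustration): without it, `cargo install` re-resolves dependency versions at install time; with it, Cargo installs the exact versions recorded in the crate's `Cargo.lock`, keeping builds reproducible.

```sh
# Hypothetical crate; --locked enforces the versions pinned in its Cargo.lock
cargo install --locked example-tool --version 1.2.3
```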
247 changes: 132 additions & 115 deletions hadoop/Dockerfile
@@ -11,82 +11,100 @@ ARG TARGETARCH
ARG TARGETOS
ARG STACKABLE_USER_UID

WORKDIR /stackable/jmx

# The symlink from JMX Exporter 0.16.1 to the versionless link exists because old HDFS Operators (up until and including 23.7) used to hardcode
# the version of JMX Exporter like this: "-javaagent:/stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar"
# This is a TEMPORARY fix which means that we can keep the hardcoded path in HDFS operator FOR NOW as it will still point to a newer version of JMX Exporter, despite the "0.16.1" in the name.
# At the same time a new HDFS Operator will still work with older images which do not have the symlink to the versionless jar.
# After one of our next releases (23.11 or 24.x) we should update the operator to point at the non-versioned symlink (jmx_prometheus_javaagent.jar)
# And then we can also remove the symlink to 0.16.1 from this Dockerfile.
RUN curl "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \
chmod -x "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \
ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar && \
ln -s /stackable/jmx/jmx_prometheus_javaagent.jar /stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar

WORKDIR /stackable

RUN ARCH="${TARGETARCH/amd64/x64}" && \
curl "https://repo.stackable.tech/repository/packages/async-profiler/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}.tar.gz" | tar -xzC . && \
ln -s "/stackable/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}" /stackable/async-profiler

# This Protobuf version is the exact version as used in the Hadoop Dockerfile
# See https://github.com/apache/hadoop/blob/trunk/dev-support/docker/pkg-resolver/install-protobuf.sh
# (this was hardcoded in the Dockerfile in earlier versions of Hadoop, make sure to look at the exact version in Github)
WORKDIR /opt/protobuf-src
RUN curl https://repo.stackable.tech/repository/packages/protobuf/protobuf-java-${PROTOBUF}.tar.gz -o /opt/protobuf.tar.gz && \
tar xzf /opt/protobuf.tar.gz --strip-components 1 --no-same-owner && \
./configure --prefix=/opt/protobuf && \
make "-j$(nproc)" && \
make install && \
RUN <<EOF
curl https://repo.stackable.tech/repository/packages/protobuf/protobuf-java-${PROTOBUF}.tar.gz -o /opt/protobuf.tar.gz
tar xzf /opt/protobuf.tar.gz --strip-components 1 --no-same-owner
./configure --prefix=/opt/protobuf
make "-j$(nproc)"
make install
rm -rf /opt/protobuf-src
EOF

ENV PROTOBUF_HOME=/opt/protobuf
ENV PATH="${PATH}:/opt/protobuf/bin"
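# Optional sanity check (a sketch, not part of the upstream Dockerfile): fail the
# build early if the protoc now on the PATH does not report the pinned version.
# This assumes PROTOBUF holds a bare version string such as "3.7.1", which
# "protoc --version" prints as "libprotoc 3.7.1".
# RUN protoc --version | grep -F "${PROTOBUF}"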

RUN rpm --install --replacepkgs https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
RUN microdnf update && \
microdnf install \
# boost is a build dependency starting in Hadoop 3.4.0 if compiling native code
boost1.78-devel && \
microdnf clean all && \
rm -rf /var/cache/yum
RUN <<EOF
rpm --install --replacepkgs https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
microdnf update
# boost is a build dependency starting in Hadoop 3.4.0 if compiling native code
microdnf install boost1.78-devel
microdnf clean all
rm -rf /var/cache/yum
EOF

WORKDIR /stackable
RUN <<EOF
# async-profiler
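# Docker buildx sets TARGETARCH to "amd64"/"arm64"; the async-profiler archives
# use "x64" instead of "amd64", so map it here ("arm64" passes through unchanged).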
ARCH="${TARGETARCH/amd64/x64}"
curl "https://repo.stackable.tech/repository/packages/async-profiler/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}.tar.gz" | tar -xzC .
ln -s "/stackable/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}" /stackable/async-profiler

# JMX Exporter
mkdir /stackable/jmx
curl "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
chmod -x "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
EOF

COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/patches /stackable/patches

WORKDIR /build
COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/fuse_dfs_wrapper /build
COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/patches /build/patches
COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/jmx /stackable/jmx
# Hadoop Pipes requires libtirpc to build, whose headers are not packaged in RedHat UBI, so skip building this module
# Build from source to enable FUSE module, and to apply custom patches.
# Also skip building the yarn, mapreduce and minicluster modules: the modules themselves are excluded from the build, but
# jars that other modules need are not stripped; e.g. share/hadoop/yarn will not be part of the build, yet yarn jars will
# still exist in share/hadoop/tools because the resource estimator tool needs them. Such jars are removed in a later step.
RUN curl "https://repo.stackable.tech/repository/packages/hadoop/hadoop-${PRODUCT}-src.tar.gz" | tar -xzC . && \
patches/apply_patches.sh ${PRODUCT} && \
cd hadoop-${PRODUCT}-src && \
mvn --batch-mode --no-transfer-progress clean package -Pdist,native -pl '!hadoop-tools/hadoop-pipes,!hadoop-yarn-project,!hadoop-mapreduce-project,!hadoop-minicluster' -Drequire.fuse=true -DskipTests -Dmaven.javadoc.skip=true && \
cp -r hadoop-dist/target/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT} && \
mv hadoop-dist/target/bom.json /stackable/hadoop-${PRODUCT}/hadoop-${PRODUCT}.cdx.json && \
# HDFS fuse-dfs is not part of the regular dist output, so we need to copy it in ourselves
cp hadoop-hdfs-project/hadoop-hdfs-native-client/target/main/native/fuse-dfs/fuse_dfs /stackable/hadoop-${PRODUCT}/bin && \
rm -rf /stackable/hadoop-${PRODUCT}-src

# For earlier versions this script removes the .class file that contains the
# vulnerable code.
# TODO: This could be restricted to target only versions which do not honor the
# environment variable set above, but that has not been implemented yet.
COPY shared/log4shell.sh /bin
RUN /bin/log4shell.sh "/stackable/hadoop-${PRODUCT}"

# Ensure no vulnerable files are left over.
# This will currently report vulnerable files as present, because it also alerts on
# SocketNode.class, which our scripts do not remove.
# Further investigation is needed to determine whether that file should be removed as well.
COPY shared/log4shell_1.6.1-log4shell_Linux_x86_64 /bin/log4shell_scanner_x86_64
COPY shared/log4shell_1.6.1-log4shell_Linux_aarch64 /bin/log4shell_scanner_aarch64
COPY shared/log4shell_scanner /bin/log4shell_scanner
RUN /bin/log4shell_scanner s "/stackable/hadoop-${PRODUCT}"
# ===
RUN <<EOF
curl "https://repo.stackable.tech/repository/packages/hadoop/hadoop-${PRODUCT}-src.tar.gz" | tar -xzC .

patches/apply_patches.sh ${PRODUCT}
cd hadoop-${PRODUCT}-src

mvn \
--batch-mode \
--no-transfer-progress \
clean package \
-Pdist,native \
-pl '!hadoop-tools/hadoop-pipes,!hadoop-yarn-project,!hadoop-mapreduce-project,!hadoop-minicluster' \
-Drequire.fuse=true \
-DskipTests \
-Dmaven.javadoc.skip=true

cp -r hadoop-dist/target/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT}
mv hadoop-dist/target/bom.json /stackable/hadoop-${PRODUCT}/hadoop-${PRODUCT}.cdx.json

# HDFS fuse-dfs is not part of the regular dist output, so we need to copy it in ourselves
cp hadoop-hdfs-project/hadoop-hdfs-native-client/target/main/native/fuse-dfs/fuse_dfs /stackable/hadoop-${PRODUCT}/bin
rm -rf /build/hadoop-${PRODUCT}-src

ln -s /stackable/hadoop-${PRODUCT} /stackable/hadoop

mv /build/fuse_dfs_wrapper /stackable/hadoop/bin

# Remove unneeded binaries:
# - code sources
# - mapreduce/yarn binaries that were built as cross-project dependencies
# - minicluster (only used for testing) and test .jars
# - json-io: this is a transitive dependency pulled in by cedarsoft/java-utils/json-io and is excluded in 3.4.0. See CVE-2023-34610.
rm -rf /stackable/hadoop/share/hadoop/common/sources/
rm -rf /stackable/hadoop/share/hadoop/hdfs/sources/
rm -rf /stackable/hadoop/share/hadoop/tools/sources/
rm -rf /stackable/hadoop/share/hadoop/tools/lib/json-io-*.jar
rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-mapreduce-client-*.jar
rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-yarn-server*.jar
find /stackable -name 'hadoop-minicluster-*.jar' -type f -delete
find /stackable -name 'hadoop-client-minicluster-*.jar' -type f -delete
find /stackable -name 'hadoop-*tests.jar' -type f -delete
rm -rf /stackable/.m2

# Set correct groups; make sure only required artifacts for the final image are located in /stackable
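# For illustration: "g=u" copies the user permission bits to the group, so e.g. a
# file with mode 750 owned by ${STACKABLE_USER_UID}:0 becomes 770, and any UID
# running with group 0 can use it.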
chmod -R g=u /stackable
EOF

FROM stackable/image/java-devel AS hdfs-utils-builder

@@ -99,26 +117,40 @@ ARG STACKABLE_USER_UID
# Dockerfile, which needs Java 11. So we need to also use the java-devel image in version 11 and
# install Java 17 ourselves.
# The Adoptium yum repo is already added by the java-devel Dockerfile.
RUN microdnf update && \
microdnf install -y temurin-17-jdk && \
microdnf clean all && \
rm -rf /var/cache/yum
RUN <<EOF
microdnf update
microdnf install -y temurin-17-jdk
microdnf clean all
rm -rf /var/cache/yum
EOF

ENV JAVA_HOME="/usr/lib/jvm/temurin-17-jdk"

USER ${STACKABLE_USER_UID}
WORKDIR /stackable

WORKDIR /build
# The Stackable HDFS utils contain an OPA authorizer, group mapper & topology provider.
# The topology provider provides rack awareness functionality for HDFS by allowing users to specify Kubernetes
# labels to build a rackID from.
# Starting with hdfs-utils version 0.3.0 the topology provider is not a standalone jar anymore and included in hdfs-utils.
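# For illustration only (the class name is assumed from the hdfs-utils project;
# verify against its README): the provider is typically wired into hdfs-site.xml
# via Hadoop's standard topology hook, roughly:
#
#   <property>
#     <name>net.topology.node.switch.mapping.impl</name>
#     <value>tech.stackable.hadoop.StackableTopologyProvider</value>
#   </property>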

RUN curl "https://github.com/stackabletech/hdfs-utils/archive/refs/tags/v${HDFS_UTILS}.tar.gz" | tar -xzC . && \
cd hdfs-utils-${HDFS_UTILS} && \
mvn --batch-mode --no-transfer-progress clean package -P hadoop-${PRODUCT} -DskipTests -Dmaven.javadoc.skip=true && \
mkdir -p /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib && \
cp target/hdfs-utils-$HDFS_UTILS.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar && \
rm -rf /stackable/hdfs-utils-main
RUN <<EOF
curl "https://github.com/stackabletech/hdfs-utils/archive/refs/tags/v${HDFS_UTILS}.tar.gz" | tar -xzC .
cd hdfs-utils-${HDFS_UTILS}

mvn \
--batch-mode \
--no-transfer-progress \
clean package \
-P hadoop-${PRODUCT} \
-DskipTests \
-Dmaven.javadoc.skip=true

mkdir -p /stackable
cp target/hdfs-utils-$HDFS_UTILS.jar /stackable/hdfs-utils-${HDFS_UTILS}.jar
rm -rf hdfs-utils-main

# Set correct groups
chmod g=u /stackable/hdfs-utils-${HDFS_UTILS}.jar
EOF

FROM stackable/image/java-base AS final

@@ -127,21 +159,19 @@ ARG RELEASE
ARG HDFS_UTILS
ARG STACKABLE_USER_UID

LABEL name="Apache Hadoop" \
maintainer="[email protected]" \
vendor="Stackable GmbH" \
version="${PRODUCT}" \
release="${RELEASE}" \
summary="The Stackable image for Apache Hadoop." \
description="This image is deployed by the Stackable Operator for Apache Hadoop / HDFS."

COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT}/
COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/jmx /stackable/jmx/
COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/async-profiler /stackable/async-profiler/
COPY --chown=${STACKABLE_USER_UID}:0 --from=hdfs-utils-builder /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar
COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/fuse_dfs_wrapper /stackable/
COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/jmx /stackable/jmx
LABEL \
name="Apache Hadoop" \
maintainer="[email protected]" \
vendor="Stackable GmbH" \
version="${PRODUCT}" \
release="${RELEASE}" \
summary="The Stackable image for Apache Hadoop." \
description="This image is deployed by the Stackable Operator for Apache Hadoop / HDFS."

COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable /stackable
COPY --chown=${STACKABLE_USER_UID}:0 --from=hdfs-utils-builder /stackable/hdfs-utils-${HDFS_UTILS}.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar

COPY --chown=${STACKABLE_USER_UID}:0 hadoop/licenses /licenses

# fuse is required for fusermount (called by fuse_dfs)
# fuse-libs is required for fuse_dfs (not included in fuse)
@@ -156,44 +186,31 @@ microdnf install \
fuse-libs \
tar
microdnf clean all
rpm -qa --qf "%{NAME}-%{VERSION}-%{RELEASE}\n" | sort > /stackable/package_manifest.txt
chown ${STACKABLE_USER_UID}:0 /stackable/package_manifest.txt
chmod g=u /stackable/package_manifest.txt
rm -rf /var/cache/yum

ln -s /stackable/hadoop-${PRODUCT} /stackable/hadoop
mv /stackable/fuse_dfs_wrapper /stackable/hadoop/bin

# Remove unneeded binaries:
# - code sources
# - mapreduce/yarn binaries that were built as cross-project dependencies
# - minicluster (only used for testing) and test .jars
# - json-io: this is a transitive dependency pulled in by cedarsoft/java-utils/json-io and is excluded in 3.4.0. See CVE-2023-34610.
rm -rf /stackable/hadoop/share/hadoop/common/sources/
rm -rf /stackable/hadoop/share/hadoop/hdfs/sources/
rm -rf /stackable/hadoop/share/hadoop/tools/sources/
rm -rf /stackable/hadoop/share/hadoop/tools/lib/json-io-*.jar
rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-mapreduce-client-*.jar
rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-yarn-server*.jar
find . -name 'hadoop-minicluster-*.jar' -type f -delete
find . -name 'hadoop-client-minicluster-*.jar' -type f -delete
find . -name 'hadoop-*tests.jar' -type f -delete

# Without this fuse_dfs does not work
# It is so non-root users (as we are) can mount a FUSE device and let other users access it
echo "user_allow_other" > /etc/fuse.conf

# All files and folders owned by root group to support running as arbitrary users.
# This is best practice as all container users will belong to the root group (0).
chown -R ${STACKABLE_USER_UID}:0 /stackable
chmod -R g=u /stackable
EOF

COPY hadoop/licenses /licenses

# ----------------------------------------
# Attention: We are changing the group of all files in /stackable directly above.
# If you do any file based actions (copying / creating etc.) below this comment
# you absolutely need to make sure that the correct permissions are applied!
# chown ${STACKABLE_USER_UID}:0
# ----------------------------------------

# ----------------------------------------
# Checks
# This section runs final checks to ensure the created final images
# adhere to several minimal requirements, like:
# - check file permissions and ownerships
# ----------------------------------------

# Check that permissions and ownership in /stackable are set correctly
# This will fail and stop the build if any mismatches are found.
RUN <<EOF
/bin/check-permissions-ownership.sh /stackable ${STACKABLE_USER_UID} 0
EOF
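
# For context, a rough sketch of the kind of ownership check the script performs
# (the real script ships in the stackable-base image and may differ):
#
#   find /stackable \( ! -user "${STACKABLE_USER_UID}" -o ! -group 0 \) -print \
#     | grep -q . && { echo "ownership/permission mismatch"; exit 1; }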

# ----------------------------------------
# Attention: Do not perform any file based actions (copying/creating etc.) below this comment because the permissions would not be checked.

USER ${STACKABLE_USER_UID}
