Skip to content

Commit 2e31788

Browse files
lfrancke and sbernauer authored
Rework Hive image (#774)
* Rework Hive image
  - Move postgresql and logging dependencies into build of Hive itself
  - Backport all patches that have been backported on branch-3.1 upstream as well which affect the metastore
  - Update patch dependencies
  - Update Hadoop dependency to 3.3.6
* Remove postgres upload script
* Update hive/Dockerfile
  Co-authored-by: Sebastian Bernauer <[email protected]>
* Add comment
* Update hive/Dockerfile
  Co-authored-by: Sebastian Bernauer <[email protected]>
* README explaining the process the patches were created.
* fix Python lint on comment
* fix Python lint on comment
---------
Co-authored-by: Sebastian Bernauer <[email protected]>
Co-authored-by: Sebastian Bernauer <[email protected]>
1 parent 9ad2a28 commit 2e31788

16 files changed

+2265
-199
lines changed

hive/Dockerfile

Lines changed: 82 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -2,124 +2,120 @@
22

33
FROM stackable/image/hadoop AS hadoop-builder
44

5-
FROM stackable/image/java-devel AS builder
5+
FROM stackable/image/java-devel AS hive-builder
66

7-
# Apache Hive up t0 4.x(!) officially requires Java 8 (there is no distincion between building and running). As of
8-
# 2024-04-15 we for sure need Java 8 for building, but we used a Java 11 runtime for months now without any problems.
7+
# Apache Hive up to 4.0.x(!) officially requires Java 8 (there is no distinction between building and running).
8+
# As of 2024-04-15 we definitely need Java 8 for building, but we have used a Java 11 runtime for months now without any problems.
99
# As we got weird TLS errors (https://stackable-workspace.slack.com/archives/C031A5BEFS7/p1713185172557459) with a
10-
# Java 8 runtime we bumped the Runtime to Java 11 again. As we can only select a single version from the java-base
11-
# image, we pick 11 (which is used in the final image), and install Java 8 here.
10+
# Java 8 runtime we bumped the Runtime to Java 11 again.
1211

1312
ARG PRODUCT
1413
ARG HADOOP
1514
ARG JMX_EXPORTER
16-
ARG JACKSON_DATAFORMAT_XML
17-
ARG JACKSON_JAXB_ANNOTATIONS
18-
ARG POSTGRES_DRIVER
19-
ARG AWS_JAVA_SDK_BUNDLE
20-
ARG AZURE_STORAGE
21-
ARG AZURE_KEYVAULT_CORE
15+
16+
# Setting this to anything other than "true" will keep the cache folders around (e.g. for Maven, NPM etc.)
17+
# This can be used to speed up builds when disk space is of no concern.
18+
ARG DELETE_CACHES="true"
2219

2320
COPY --chown=stackable:stackable hive/stackable /stackable
2421

2522
USER stackable
2623
WORKDIR /stackable
2724

28-
RUN curl --fail -L "https://repo.stackable.tech/repository/packages/hive/apache-hive-${PRODUCT}-src.tar.gz" | tar -xzC . && \
29-
patches/apply_patches.sh ${PRODUCT} && \
30-
cd /stackable/apache-hive-${PRODUCT}-src/ && \
31-
mvn clean package -DskipTests --projects standalone-metastore && \
32-
mv standalone-metastore/target/apache-hive-metastore-${PRODUCT}-bin/apache-hive-metastore-${PRODUCT}-bin /stackable && \
33-
ln -s /stackable/apache-hive-metastore-${PRODUCT}-bin/ /stackable/hive-metastore && \
34-
cp /stackable/hive-metastore/bin/start-metastore /stackable/hive-metastore/bin/start-metastore.bak && \
35-
cp /stackable/bin/start-metastore /stackable/hive-metastore/bin && \
36-
rm -rf /stackable/apache-hive-${PRODUCT}-src
25+
# Cache mounts are owned by root by default
26+
# We need to explicitly pass the uid to use, which is hardcoded to "1000" in stackable-base
27+
RUN --mount=type=cache,id=maven-hive,uid=1000,target=/stackable/.m2/repository <<EOF
28+
curl --fail -L "https://repo.stackable.tech/repository/packages/hive/apache-hive-${PRODUCT}-src.tar.gz" | tar -xzC .
3729

38-
COPY --chown=stackable:stackable --from=hadoop-builder /stackable/hadoop /stackable/hadoop
30+
patches/apply_patches.sh ${PRODUCT}
3931

40-
# Add a PostgreSQL driver, as this is the primary used persistence
41-
RUN curl --fail -L https://repo.stackable.tech/repository/packages/pgjdbc/postgresql-${POSTGRES_DRIVER}.jar -o /stackable/hive-metastore/lib/postgresql-${POSTGRES_DRIVER}.jar
32+
cd /stackable/apache-hive-${PRODUCT}-src/
33+
mvn --batch-mode --no-transfer-progress clean package -DskipTests --projects standalone-metastore
34+
mv standalone-metastore/target/apache-hive-metastore-${PRODUCT}-bin/apache-hive-metastore-${PRODUCT}-bin /stackable
4235

43-
# The next two sections for S3 and Azure use hardcoded version numbers on purpose instead of wildcards
44-
# This way the build will fail should one of the files not be available anymore in a later Hadoop version!
36+
ln -s /stackable/apache-hive-metastore-${PRODUCT}-bin/ /stackable/hive-metastore
37+
cp /stackable/bin/start-metastore /stackable/hive-metastore/bin
38+
rm -rf /stackable/apache-hive-${PRODUCT}-src
4539

46-
# Add S3 Support for Hive (support for s3a://)
47-
RUN cp /stackable/hadoop/share/hadoop/tools/lib/hadoop-aws-${HADOOP}.jar /stackable/hive-metastore/lib/
48-
RUN cp /stackable/hadoop/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE}.jar /stackable/hive-metastore/lib/
40+
curl --fail -L "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
41+
ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
4942

50-
# Add Azure ABFS support (support for abfs://)
51-
RUN cp /stackable/hadoop/share/hadoop/tools/lib/hadoop-azure-${HADOOP}.jar /stackable/hive-metastore/lib/
52-
RUN cp /stackable/hadoop/share/hadoop/tools/lib/azure-storage-${AZURE_STORAGE}.jar /stackable/hive-metastore/lib/
53-
RUN cp /stackable/hadoop/share/hadoop/tools/lib/azure-keyvault-core-${AZURE_KEYVAULT_CORE}.jar /stackable/hive-metastore/lib/
43+
# We're removing these to make the intermediate layer smaller
44+
# This can be necessary even though it's only a builder image because the GitHub Action Runners only have very limited space available
45+
# and we are sometimes running into errors because we're out of space.
46+
# Therefore, we try to clean up all layers as much as possible.
47+
if [ "${DELETE_CACHES}" = "true" ] ; then
48+
rm -rf /stackable/.m2/repository/*
49+
rm -rf /stackable/.npm/*
50+
rm -rf /stackable/.cache/*
51+
fi
52+
EOF
5453

55-
# The symlink from JMX Exporter 0.16.1 to the versionless link exists because old HDFS Operators (up until and including 23.7) used to hardcode
56-
# the version of JMX Exporter like this: "-javaagent:/stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar"
57-
# This is a TEMPORARY fix which means that we can keep the hardcoded path in HDFS operator FOR NOW as it will still point to a newer version of JMX Exporter, despite the "0.16.1" in the name.
58-
# At the same time a new HDFS Operator will still work with older images which do not have the symlink to the versionless jar.
59-
# After one of our next releases (23.11 or 24.x) we should update the operator to point at the non-versioned symlink (jmx_prometheus_javaagent.jar)
60-
# And then we can also remove the symlink to 0.16.1 from this Dockerfile.
61-
RUN curl --fail -L "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \
62-
ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar && \
63-
ln -s /stackable/jmx/jmx_prometheus_javaagent.jar /stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar
64-
65-
# Logging.
66-
# jackson-module-jaxb-annotations: this is no longer bundled with the hadoop-yarn/mapreduce libraries (excluded from the hadoop build).
67-
RUN rm /stackable/hive-metastore/lib/log4j-slf4j-impl* && \
68-
curl --fail -L https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar -o /stackable/hive-metastore/lib/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar && \
69-
curl --fail -L https://repo.stackable.tech/repository/packages/jackson-module-jaxb-annotations/jackson-module-jaxb-annotations-${JACKSON_JAXB_ANNOTATIONS}.jar -o /stackable/hive-metastore/lib/jackson-module-jaxb-annotations-${JACKSON_JAXB_ANNOTATIONS}.jar
70-
71-
# ===
72-
# For earlier versions this script removes the .class file that contains the
73-
# vulnerable code.
74-
# TODO: This can be restricted to target only versions which do not honor the environment
75-
# varible that has been set above but this has not currently been implemented
76-
COPY shared/log4shell.sh /bin
77-
RUN /bin/log4shell.sh /stackable/apache-hive-metastore-${PRODUCT}-bin/
78-
79-
# Ensure no vulnerable files are left over
80-
# This will currently report vulnerable files being present, as it also alerts on
81-
# SocketNode.class, which we do not remove with our scripts.
82-
# Further investigation will be needed whether this should also be removed.
83-
COPY shared/log4shell_1.6.1-log4shell_Linux_x86_64 /bin/log4shell_scanner_x86_64
84-
COPY shared/log4shell_1.6.1-log4shell_Linux_aarch64 /bin/log4shell_scanner_aarch64
85-
COPY shared/log4shell_scanner /bin/log4shell_scanner
86-
# log4shell_scanner does not work on symlinks!
87-
RUN /bin/log4shell_scanner s /stackable/apache-hive-metastore-${PRODUCT}-bin/
88-
# ===
89-
90-
# syntax=docker/dockerfile:1@sha256:ac85f380a63b13dfcefa89046420e1781752bab202122f8f50032edf31be0021
91-
FROM stackable/image/java-base
54+
55+
FROM stackable/image/java-base AS final
9256

9357
ARG PRODUCT
9458
ARG HADOOP
9559
ARG RELEASE
60+
ARG AWS_JAVA_SDK_BUNDLE
61+
ARG AZURE_STORAGE
62+
ARG AZURE_KEYVAULT_CORE
9663

97-
LABEL name="Apache Hive metastore" \
98-
maintainer="[email protected]" \
99-
vendor="Stackable GmbH" \
100-
version="${PRODUCT}" \
101-
release="${RELEASE}" \
102-
summary="The Stackable image for Apache Hive metastore." \
103-
description="This image is deployed by the Stackable Operator for Apache Hive."
10464

105-
RUN microdnf update && \
106-
microdnf clean all && \
107-
rpm -qa --qf "%{NAME}-%{VERSION}-%{RELEASE}\n" | sort > /stackable/package_manifest.txt && \
108-
rm -rf /var/cache/yum
65+
ARG NAME="Apache Hive metastore"
66+
ARG DESCRIPTION="This image is deployed by the Stackable Operator for Apache Hive."
67+
68+
LABEL name="Apache Hive metastore"
69+
LABEL version="${PRODUCT}"
70+
LABEL release="${RELEASE}"
71+
LABEL summary="The Stackable image for Apache Hive metastore."
72+
LABEL description="${DESCRIPTION}"
73+
74+
# https://github.com/opencontainers/image-spec/blob/036563a4a268d7c08b51a08f05a02a0fe74c7268/annotations.md#annotations
75+
LABEL org.opencontainers.image.documentation="https://docs.stackable.tech/home/stable/hive/"
76+
LABEL org.opencontainers.image.version="${PRODUCT}"
77+
LABEL org.opencontainers.image.revision="${RELEASE}"
78+
LABEL org.opencontainers.image.title="${NAME}"
79+
LABEL org.opencontainers.image.description="${DESCRIPTION}"
80+
81+
# https://docs.openshift.com/container-platform/4.16/openshift_images/create-images.html#defining-image-metadata
82+
# https://github.com/projectatomic/ContainerApplicationGenericLabels/blob/master/vendor/redhat/labels.md
83+
LABEL io.openshift.tags="ubi9,stackable,hive,sdp"
84+
LABEL io.k8s.description="${DESCRIPTION}"
85+
LABEL io.k8s.display-name="${NAME}"
86+
87+
RUN <<EOF
88+
microdnf update
89+
microdnf clean all
90+
rpm -qa --qf "%{NAME}-%{VERSION}-%{RELEASE}\n" | sort > /stackable/package_manifest.txt
91+
rm -rf /var/cache/yum
92+
EOF
10993

11094
USER stackable
11195
WORKDIR /stackable
11296

113-
# TODO: Try to use --link here, as it should be faster
114-
COPY --chown=stackable:stackable --from=builder /stackable/apache-hive-metastore-${PRODUCT}-bin /stackable/apache-hive-metastore-${PRODUCT}-bin
115-
RUN ln -s /stackable/apache-hive-metastore-${PRODUCT}-bin/ /stackable/hive-metastore
97+
COPY --chown=stackable:stackable --from=hive-builder /stackable/apache-hive-metastore-${PRODUCT}-bin /stackable/apache-hive-metastore-${PRODUCT}-bin
98+
RUN ln -s /stackable/apache-hive-metastore-${PRODUCT}-bin /stackable/hive-metastore
11699

117100
# It is useful to see which version of Hadoop is used at a glance
118101
# Therefore the use of the full name here
119-
COPY --chown=stackable:stackable --from=builder /stackable/hadoop /stackable/hadoop-${HADOOP}
120-
RUN ln -s /stackable/hadoop-${HADOOP}/ /stackable/hadoop
102+
# TODO: Do we really need all of Hadoop in here?
103+
COPY --chown=stackable:stackable --from=hadoop-builder /stackable/hadoop /stackable/hadoop-${HADOOP}
104+
RUN ln -s /stackable/hadoop-${HADOOP} /stackable/hadoop
105+
106+
# The next two sections for S3 and Azure use hardcoded version numbers on purpose instead of wildcards
107+
# This way the build will fail should one of the files not be available anymore in a later Hadoop version!
108+
109+
# Add S3 Support for Hive (support for s3a://)
110+
RUN cp /stackable/hadoop/share/hadoop/tools/lib/hadoop-aws-${HADOOP}.jar /stackable/hive-metastore/lib/
111+
RUN cp /stackable/hadoop/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE}.jar /stackable/hive-metastore/lib/
112+
113+
# Add Azure ABFS support (support for abfs://)
114+
RUN cp /stackable/hadoop/share/hadoop/tools/lib/hadoop-azure-${HADOOP}.jar /stackable/hive-metastore/lib/
115+
RUN cp /stackable/hadoop/share/hadoop/tools/lib/azure-storage-${AZURE_STORAGE}.jar /stackable/hive-metastore/lib/
116+
RUN cp /stackable/hadoop/share/hadoop/tools/lib/azure-keyvault-core-${AZURE_KEYVAULT_CORE}.jar /stackable/hive-metastore/lib/
121117

122-
COPY --chown=stackable:stackable --from=builder /stackable/jmx /stackable/jmx
118+
COPY --chown=stackable:stackable --from=hive-builder /stackable/jmx /stackable/jmx
123119
COPY hive/licenses /licenses
124120

125121
ENV HADOOP_HOME=/stackable/hadoop

hive/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Stackable Docker image including Apache Hive
2+
3+
This is our Dockerfile for the Apache Hive metastore.
4+
5+
It builds Hive from source.
6+
We use the officially released source tarballs and apply patches on top.
7+
These patches can also be seen in our [fork](https://github.com/stackabletech/hive) of Hive.
8+
9+
Look for the `stackable/` branches.
10+
11+
The patches were created using [Stacked Git](https://stacked-git.github.io/) but that, unfortunately, does not allow sharing the state remotely.
12+
We do not have a good solution for this yet.
13+
14+
The command used was `stg export --dir patches/3.1.3 -p -n`, but this requires you to first get your local fork into a state where it recognizes all _our_ commits as patches.

hive/stackable/bin/start-metastore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# --hive-bin-dir <path>
1111
#
1212
# Checks if the metastore database schema is initialized. If so it starts the metastore,
13-
# otherwise it tries to initialize the schma first.
13+
# otherwise it tries to initialize the schema first.
1414
#
1515

1616
#set -x

hive/stackable/patches/3.1.3/001-HIVE-26905-3.1.3.patch renamed to hive/stackable/patches/3.1.3/01-HIVE-26905.patch

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
1-
From 5e9b99cdcac99bcce86fc040b066fdfbc319060d Mon Sep 17 00:00:00 2001
2-
From: Chris Nauroth <[email protected]>
3-
Date: Thu, 5 Jan 2023 05:19:41 +0000
4-
Subject: [PATCH] HIVE-26905: Backport HIVE-25173 to 3.2.0: Exclude
5-
pentaho-aggdesigner-algorithm from upgrade-acid build.
1+
HIVE-26905
62

3+
From: Lars Francke <[email protected]>
4+
5+
Backport HIVE-25173 to 3.2.0: Exclude pentaho-aggdesigner-algorithm from upgrade-acid build.
76
---
8-
upgrade-acid/pom.xml | 6 ++++++
7+
upgrade-acid/pom.xml | 6 ++++++
98
1 file changed, 6 insertions(+)
109

1110
diff --git a/upgrade-acid/pom.xml b/upgrade-acid/pom.xml
12-
index 5002fc6f6317..131f86fc1f7d 100644
11+
index f95117e07d..b25b332b34 100644
1312
--- a/upgrade-acid/pom.xml
1413
+++ b/upgrade-acid/pom.xml
1514
@@ -80,6 +80,12 @@

hive/stackable/patches/3.1.3/002-HIVE-21939-3.1.3.patch renamed to hive/stackable/patches/3.1.3/02-HIVE-21939.patch

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,10 @@
1-
From d0fc55929e2fb00dbd84fb092771bf558dd20f16 Mon Sep 17 00:00:00 2001
2-
From: Sebastian Bernauer <[email protected]>
3-
Date: Thu, 11 Apr 2024 15:41:05 +0200
4-
Subject: [PATCH] HIVE-21939: protoc:2.5.0 dependence has broken building on
5-
aarch64
1+
HIVE-21939
62

7-
Cherry-picked from 2baf21bb55fcf33d8522444c78a8d8cab60e7415
3+
From: Lars Francke <[email protected]>
84

9-
Co-authored-by: Chinna Rao L <[email protected]>
5+
protoc:2.5.0 dependence has broken building on aarch64
106
---
11-
standalone-metastore/pom.xml | 21 +++++++++++++++++++--
7+
standalone-metastore/pom.xml | 21 +++++++++++++++++++--
128
1 file changed, 19 insertions(+), 2 deletions(-)
139

1410
diff --git a/standalone-metastore/pom.xml b/standalone-metastore/pom.xml
@@ -26,7 +22,7 @@ index e36f1e64f0..6007b7961b 100644
2622
+ <protobuf-exc.version>2.6.1</protobuf-exc.version>
2723
<sqlline.version>1.3.0</sqlline.version>
2824
<storage-api.version>2.7.0</storage-api.version>
29-
25+
3026
@@ -443,6 +446,20 @@
3127
</plugins>
3228
</build>
@@ -57,6 +53,3 @@ index e36f1e64f0..6007b7961b 100644
5753
<addSources>none</addSources>
5854
<inputDirectories>
5955
<include>${basedir}/src/main/protobuf/org/apache/hadoop/hive/metastore</include>
60-
--
61-
2.43.0
62-

0 commit comments

Comments
 (0)