2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -21,6 +21,7 @@ All notable changes to this project will be documented in this file.
- hbase: Add hadoop-azure.jar to the lib directory to support the Azure Blob Filesystem and
the Azure Data Lake Storage ([#853]).
- kafka: Add cyrus-sasl-gssapi package for kerberos ([#874]).
- spark: Add HBase connector ([#878]).

### Changed

@@ -68,6 +69,7 @@ All notable changes to this project will be documented in this file.
[#868]: https://github.com/stackabletech/docker-images/pull/868
[#874]: https://github.com/stackabletech/docker-images/pull/874
[#877]: https://github.com/stackabletech/docker-images/pull/877
[#878]: https://github.com/stackabletech/docker-images/pull/878

## [24.7.0] - 2024-07-24

211 changes: 180 additions & 31 deletions spark-k8s/Dockerfile
@@ -1,18 +1,16 @@
# syntax=docker/dockerfile:1.8.1@sha256:e87caa74dcb7d46cd820352bfea12591f3dba3ddc4285e19c7dcd13359f7cefd

FROM stackable/image/java-devel as builder
# hadoop-builder: Provides Hadoop libraries
FROM stackable/image/hadoop AS hadoop-builder

# hbase-builder: Provides HBase libraries
FROM stackable/image/hbase AS hbase-builder

# spark-source-builder: Download the Spark source code into
# /stackable/spark and apply the patches
FROM stackable/image/java-devel as spark-source-builder

ARG PRODUCT
ARG HADOOP_LONG_VERSION
ARG AWS_JAVA_SDK_BUNDLE
ARG AZURE_STORAGE
ARG AZURE_KEYVAULT_CORE
ARG JACKSON_DATAFORMAT_XML
ARG STAX2_API
ARG WOODSTOX_CORE
ARG JMX_EXPORTER
ARG TARGETARCH
ARG TINI

RUN <<EOF
microdnf update
@@ -27,8 +25,133 @@ EOF

WORKDIR /stackable

COPY --chown=stackable:stackable spark-k8s/stackable/patches/apply_patches.sh /stackable/spark-${PRODUCT}/patches/apply_patches.sh
COPY --chown=stackable:stackable spark-k8s/stackable/patches/${PRODUCT} /stackable/spark-${PRODUCT}/patches/${PRODUCT}
RUN <<EOF
curl https://repo.stackable.tech/repository/packages/spark/spark-${PRODUCT}.tgz \
| tar xz
ln -s spark-${PRODUCT} spark
EOF

WORKDIR /stackable/spark

COPY --chown=stackable:stackable \
spark-k8s/stackable/patches/apply_patches.sh \
patches/apply_patches.sh
COPY --chown=stackable:stackable \
spark-k8s/stackable/patches/${PRODUCT} \
patches/${PRODUCT}

RUN patches/apply_patches.sh ${PRODUCT}


# hbase-connectors-builder: Build the Spark HBase connector and copy
# required JARs into /stackable/spark/jars
FROM stackable/image/java-devel as hbase-connectors-builder

ARG PRODUCT
ARG HADOOP
ARG HBASE
ARG HBASE_CONNECTOR

WORKDIR /stackable

# Download the hbase-connectors source code
RUN <<EOF
curl https://repo.stackable.tech/repository/packages/hbase-connectors/hbase-connectors_${HBASE_CONNECTOR}.tar.gz \
| tar xz
ln -s hbase-connectors-rel-${HBASE_CONNECTOR} hbase-connectors
EOF

# Copy the pom.xml file from the patched Spark source code to read the
# versions used by Spark. The pom.xml defines child modules which are
# not required and therefore not copied, so mvn must be called with the
# parameter --non-recursive.
COPY --chown=stackable:stackable --from=spark-source-builder \
/stackable/spark/pom.xml \
spark/

WORKDIR /stackable/hbase-connectors/spark

RUN <<EOF
# Building the hbase-connectors with JDK 17 is not yet supported, see
# https://github.com/apache/hbase-connectors/pull/132.
# As there are no JDK profiles, access to the non-public elements must
# be enabled with --add-opens, see https://openjdk.org/jeps/403 and
# https://openjdk.org/jeps/261#Breaking-encapsulation.
export JDK_JAVA_OPTIONS="\
--add-opens java.base/java.lang=ALL-UNNAMED \
--add-opens java.base/java.util=ALL-UNNAMED"

# Get the Scala version used by Spark
SCALA_VERSION=$( \
mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
-DforceStdout \
-Dexpression='project.properties(scala.version)')

# Get the Scala binary version used by Spark
SCALA_BINARY_VERSION=$( \
mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
-DforceStdout \
-Dexpression='project.properties(scala.binary.version)')

# Build the Spark HBase connector
# Skip the tests because the MiniHBaseCluster does not become ready,
# for reasons unknown:
# Caused by: java.lang.RuntimeException: Master not active after 30000ms
# at org.apache.hadoop.hbase.util.JVMClusterUtil.waitForEvent(JVMClusterUtil.java:221)
# at org.apache.hadoop.hbase.util.JVMClusterUtil.startup(JVMClusterUtil.java:177)
# at org.apache.hadoop.hbase.LocalHBaseCluster.startup(LocalHBaseCluster.java:407)
# at org.apache.hadoop.hbase.MiniHBaseCluster.init(MiniHBaseCluster.java:250)
mvn \
--define spark.version="${PRODUCT}" \
--define scala.version="${SCALA_VERSION}" \
--define scala.binary.version="${SCALA_BINARY_VERSION}" \
--define hadoop-three.version="${HADOOP}" \
--define hbase.version="${HBASE}" \
--define skipTests \
clean package
EOF

WORKDIR /stackable/spark/jars

RUN <<EOF
ln -s /stackable/hbase-connectors/spark/hbase-spark/target/hbase-spark-${HBASE_CONNECTOR}.jar

# Download log4j-slf4j-impl-x.x.x.jar containing the StaticLoggerBinder
# which is required by the connector.
# Spark contains only log4j-slf4j2-impl-x.x.x.jar but not
# log4j-slf4j-impl-x.x.x.jar. It is okay to have both JARs in the
# classpath as long as they have the same version.
mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
dependency:copy \
-Dartifact=org.apache.logging.log4j:log4j-slf4j-impl:'${log4j.version}' \
-DoutputDirectory=.
EOF


# spark-builder: Build Spark into /stackable/spark-${PRODUCT}/dist,
# download additional JARs and perform checks, such as the log4shell check.
FROM stackable/image/java-devel as spark-builder

ARG PRODUCT
ARG HADOOP
ARG HBASE
ARG AWS_JAVA_SDK_BUNDLE
ARG AZURE_STORAGE
ARG AZURE_KEYVAULT_CORE
ARG JACKSON_DATAFORMAT_XML
ARG STAX2_API
ARG WOODSTOX_CORE
ARG JMX_EXPORTER
ARG TARGETARCH
ARG TINI

WORKDIR /stackable/spark-${PRODUCT}

COPY --chown=stackable:stackable --from=spark-source-builder \
/stackable/spark/ \
./

# >>> Build spark
# Compiling the tests takes a lot of time, so we skip them
@@ -37,12 +160,9 @@ COPY --chown=stackable:stackable spark-k8s/stackable/patches/${PRODUCT} /stackab
#
# This will download its own version of Maven because the UBI version is too old:
# 134.0 [ERROR] Detected Maven Version: 3.6.3 is not in the allowed range [3.8.8,)
RUN curl https://repo.stackable.tech/repository/packages/spark/spark-${PRODUCT}.tgz | tar -xzf - \
&& cd spark-${PRODUCT} \
&& ./patches/apply_patches.sh ${PRODUCT} \
&& export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" \
RUN export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" \
&& ./dev/make-distribution.sh \
-Dhadoop.version="$HADOOP_LONG_VERSION" \
-Dhadoop.version="$HADOOP" \
-Dmaven.test.skip=true \
-DskipTests \
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver
@@ -55,12 +175,40 @@ RUN curl -o /usr/bin/tini "https://repo.stackable.tech/repository/packages/tini/
# We download these under dist so that log4shell checks them
WORKDIR /stackable/spark-${PRODUCT}/dist/jars

# Download various modules for Hadoop (e.g. support for s3a:// and abfs://)
RUN curl -O https://repo.stackable.tech/repository/packages/aws/hadoop-aws-${HADOOP_LONG_VERSION}.jar \
&& curl -O https://repo.stackable.tech/repository/packages/aws/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE}.jar \
&& curl -O https://repo.stackable.tech/repository/packages/azure/hadoop-azure-${HADOOP_LONG_VERSION}.jar \
&& curl -O https://repo.stackable.tech/repository/packages/azure/azure-storage-${AZURE_STORAGE}.jar \
&& curl -O https://repo.stackable.tech/repository/packages/azure/azure-keyvault-core-${AZURE_KEYVAULT_CORE}.jar
# Copy modules required for s3a://
COPY --from=hadoop-builder --chown=stackable:stackable \
/stackable/hadoop/share/hadoop/tools/lib/hadoop-aws-${HADOOP}.jar \
/stackable/hadoop/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE}.jar \
./

# Copy modules required for abfs://
COPY --from=hadoop-builder --chown=stackable:stackable \
/stackable/hadoop/share/hadoop/tools/lib/hadoop-azure-${HADOOP}.jar \
/stackable/hadoop/share/hadoop/tools/lib/azure-storage-${AZURE_STORAGE}.jar \
/stackable/hadoop/share/hadoop/tools/lib/azure-keyvault-core-${AZURE_KEYVAULT_CORE}.jar \
./

# Copy the HBase connector including required modules
COPY --from=hbase-connectors-builder --chown=stackable:stackable \
/stackable/spark/jars/* \
./

# Copy modules required to access HBase
COPY --from=hbase-builder --chown=stackable:stackable \
/stackable/hbase/lib/shaded-clients/hbase-shaded-client-byo-hadoop-${HBASE}.jar \
/stackable/hbase/lib/shaded-clients/hbase-shaded-mapreduce-${HBASE}.jar \
./
# Copy modules required to access HBase if $HBASE == 2.4.x
COPY --from=hbase-builder --chown=stackable:stackable \
/stackable/hbase/lib/client-facing-thirdparty/htrace-core4-*-incubating.jar \
/stackable/hbase/lib/client-facing-thirdparty/slf4j-reload4j-*.jar \
./
# Copy modules required to access HBase if $HBASE == 2.6.x
COPY --from=hbase-builder --chown=stackable:stackable \
/stackable/hbase/lib/client-facing-thirdparty/opentelemetry-api-*.jar \
/stackable/hbase/lib/client-facing-thirdparty/opentelemetry-context-*.jar \
/stackable/hbase/lib/client-facing-thirdparty/opentelemetry-semconv-*-alpha.jar \
./

WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars

@@ -93,6 +241,7 @@ COPY shared/log4shell_scanner /bin/log4shell_scanner
RUN /bin/log4shell_scanner s /stackable/spark-${PRODUCT}/dist
# ===


FROM stackable/image/java-base as final

ARG PRODUCT
@@ -115,12 +264,12 @@ RUN microdnf update && \
hostname \
# required for spark startup scripts
procps \
python${PYTHON} \
python${PYTHON}-pip \
"python${PYTHON}" \
"python${PYTHON}-pip" \
zip \
# This is needed by the Spark UI to display process information using jps and jmap
# Copying the binaries from the builder stage failed.
java-${JAVA_VERSION}-openjdk-devel \
"java-${JAVA_VERSION}-openjdk-devel" \
&& microdnf clean all \
&& rm -rf /var/cache/yum

@@ -134,10 +283,10 @@ ENV PATH=$SPARK_HOME:$PATH:/bin:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$HOME/.local/b
ENV PYSPARK_PYTHON=/usr/bin/python
ENV PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH

COPY --chown=stackable:stackable --from=builder /stackable/spark-${PRODUCT}/dist /stackable/spark
COPY --chown=stackable:stackable --from=builder /stackable/spark-${PRODUCT}/assembly/target/bom.json /stackable/spark/spark-${PRODUCT}.cdx.json
COPY --chown=stackable:stackable --from=builder /stackable/jmx /stackable/jmx
COPY --from=builder /usr/bin/tini /usr/bin/tini
COPY --chown=stackable:stackable --from=spark-builder /stackable/spark-${PRODUCT}/dist /stackable/spark
COPY --chown=stackable:stackable --from=spark-builder /stackable/spark-${PRODUCT}/assembly/target/bom.json /stackable/spark/spark-${PRODUCT}.cdx.json
COPY --chown=stackable:stackable --from=spark-builder /stackable/jmx /stackable/jmx
COPY --from=spark-builder /usr/bin/tini /usr/bin/tini

RUN ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar \
# Symlink example jar, so that we can easily use it in tests
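To illustrate what the connector added in this Dockerfile enables, here is a minimal PySpark sketch of reading an HBase table through the hbase-spark data source. The table name, column family, and column mapping are illustrative assumptions, and the option keys follow the upstream hbase-connectors documentation rather than anything defined in this PR:

```python
# Minimal sketch, assuming an HBase table "books" with a column family "cf"
# and an hbase-site.xml on the classpath that points at the cluster.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("hbase-connector-demo").getOrCreate()

df = (
    spark.read.format("org.apache.hadoop.hbase.spark")
    # Map the HBase row key and the cf:title column to DataFrame columns.
    .option("hbase.columns.mapping", "id STRING :key, title STRING cf:title")
    .option("hbase.table", "books")
    .load()
)
df.show()
```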
51 changes: 51 additions & 0 deletions spark-k8s/upload_new_hbase-connector_version.sh
@@ -0,0 +1,51 @@
#!/usr/bin/env bash

set -euo pipefail

VERSION=${1:?"Missing version number argument (arg 1)"}
NEXUS_USER=${2:?"Missing Nexus username argument (arg 2)"}

read -r -s -p "Nexus Password: " NEXUS_PASSWORD
echo ""

# https://stackoverflow.com/questions/4632028/how-to-create-a-temporary-directory
# Find the directory name of the script
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# the temp directory used, within $DIR
WORK_DIR=$(mktemp -d -p "$DIR")

# check if tmp dir was created
if [[ ! "$WORK_DIR" || ! -d "$WORK_DIR" ]]; then
echo "Could not create temp dir"
exit 1
fi

# deletes the temp directory
function cleanup {
rm -rf "$WORK_DIR"
}

# register the cleanup function to be called on the EXIT signal
trap cleanup EXIT

cd "$WORK_DIR" || exit

download_url="https://github.com/apache/hbase-connectors/archive/refs/tags/rel/${VERSION}.tar.gz"

tar_gz_file="hbase-connectors_${VERSION}.tar.gz"

echo "Downloading hbase-connectors source from ${download_url}"
curl --fail -L -o "${tar_gz_file}" "${download_url}"

echo "Uploading hbase-connectors source to Nexus"
EXIT_STATUS=0
curl --fail -u "$NEXUS_USER:$NEXUS_PASSWORD" --upload-file "${tar_gz_file}" 'https://repo.stackable.tech/repository/packages/hbase-connectors/' || EXIT_STATUS=$?

if [ $EXIT_STATUS -ne 0 ]; then
echo "ERROR: Upload failed"
exit 1
fi

echo "Successfully uploaded version $VERSION of hbase-connectors to Nexus"
echo "https://repo.stackable.tech/service/rest/repository/browse/packages/hbase-connectors/"
8 changes: 6 additions & 2 deletions spark-k8s/versions.py
@@ -4,7 +4,8 @@
"java-base": "17",
"java-devel": "17",
"python": "3.11",
"hadoop_long_version": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125
"hadoop": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125
"hbase": "2.4.18", # current Stackable LTS version
"aws_java_sdk_bundle": "1.12.262", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4
"azure_storage": "7.0.1", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure/3.3.4
"azure_keyvault_core": "1.0.0", # https://mvnrepository.com/artifact/com.microsoft.azure/azure-storage/7.0.1
@@ -14,13 +15,15 @@
"vector": "0.41.1",
"jmx_exporter": "1.0.1",
"tini": "0.19.0",
"hbase_connector": "1.0.1",
},
{
"product": "3.5.2",
"java-base": "17",
"java-devel": "17",
"python": "3.11",
"hadoop_long_version": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125
"hadoop": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125
"hbase": "2.4.18", # current Stackable LTS version
"aws_java_sdk_bundle": "1.12.262", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4
"azure_storage": "7.0.1", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure/3.3.4
"azure_keyvault_core": "1.0.0", # https://mvnrepository.com/artifact/com.microsoft.azure/azure-storage/7.0.1
@@ -30,5 +33,6 @@
"vector": "0.41.1",
"jmx_exporter": "1.0.1",
"tini": "0.19.0",
"hbase_connector": "1.0.1",
},
]
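The renamed `hadoop` key and the new `hbase` and `hbase_connector` keys line up with the ARG names in the Dockerfile stages above. The following sketch shows the assumed relationship; how the Stackable image tooling actually derives build arguments from versions.py is an assumption here, not something this diff shows:

```python
# Sketch: uppercasing a versions.py key yields the matching Dockerfile ARG
# name (product -> PRODUCT, hbase_connector -> HBASE_CONNECTOR). This is an
# assumption about the build tooling, inferred from the names in this diff.
entry = {
    "product": "3.5.2",
    "hadoop": "3.3.4",
    "hbase": "2.4.18",
    "hbase_connector": "1.0.1",
}
build_args = {key.upper(): value for key, value in entry.items()}
assert build_args["HBASE_CONNECTOR"] == "1.0.1"
```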