2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -21,6 +21,7 @@ All notable changes to this project will be documented in this file.
- hbase: Add hadoop-azure.jar to the lib directory to support the Azure Blob Filesystem and
the Azure Data Lake Storage ([#853]).
- kafka: Add cyrus-sasl-gssapi package for kerberos ([#874]).
- spark: Add HBase connector ([#878]).

### Changed

@@ -68,6 +69,7 @@ All notable changes to this project will be documented in this file.
[#868]: https://github.com/stackabletech/docker-images/pull/868
[#874]: https://github.com/stackabletech/docker-images/pull/874
[#877]: https://github.com/stackabletech/docker-images/pull/877
[#878]: https://github.com/stackabletech/docker-images/pull/878
[#879]: https://github.com/stackabletech/docker-images/pull/879

## [24.7.0] - 2024-07-24
213 changes: 182 additions & 31 deletions spark-k8s/Dockerfile
@@ -1,18 +1,16 @@
# syntax=docker/dockerfile:1.8.1@sha256:e87caa74dcb7d46cd820352bfea12591f3dba3ddc4285e19c7dcd13359f7cefd

FROM stackable/image/java-devel as builder
# hadoop-builder: Provides Hadoop libraries
FROM stackable/image/hadoop AS hadoop-builder

# hbase-builder: Provides HBase libraries
FROM stackable/image/hbase AS hbase-builder

# spark-source-builder: Download the Spark source code into
# /stackable/spark and apply the patches
FROM stackable/image/java-devel as spark-source-builder

ARG PRODUCT
ARG HADOOP_LONG_VERSION
ARG AWS_JAVA_SDK_BUNDLE
ARG AZURE_STORAGE
ARG AZURE_KEYVAULT_CORE
ARG JACKSON_DATAFORMAT_XML
ARG STAX2_API
ARG WOODSTOX_CORE
ARG JMX_EXPORTER
ARG TARGETARCH
ARG TINI

RUN <<EOF
microdnf update
@@ -27,8 +25,135 @@ EOF

WORKDIR /stackable

COPY --chown=stackable:stackable spark-k8s/stackable/patches/apply_patches.sh /stackable/spark-${PRODUCT}/patches/apply_patches.sh
COPY --chown=stackable:stackable spark-k8s/stackable/patches/${PRODUCT} /stackable/spark-${PRODUCT}/patches/${PRODUCT}
RUN <<EOF
curl https://repo.stackable.tech/repository/packages/spark/spark-${PRODUCT}.tgz \
| tar xz
ln -s spark-${PRODUCT} spark
EOF

WORKDIR /stackable/spark

COPY --chown=stackable:stackable \
spark-k8s/stackable/patches/apply_patches.sh \
patches/apply_patches.sh
COPY --chown=stackable:stackable \
spark-k8s/stackable/patches/${PRODUCT} \
patches/${PRODUCT}

RUN patches/apply_patches.sh ${PRODUCT}
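
The patch script itself is not part of this diff; the following is a minimal sketch of what such a script typically does, assuming the patches are `*.patch` files under `patches/<version>/` that apply with `patch -p1` from the source root. The real script may differ:

```bash
#!/usr/bin/env bash
# Hypothetical sketch of apply_patches.sh, not the actual script.
set -euo pipefail

VERSION=${1:?"Missing version number argument (arg 1)"}

# Resolve the patches directory to an absolute path so the cd below is safe.
PATCH_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/${VERSION}"

# Apply all patches for this version in lexical order, from the source root
# (the grandparent of the per-version patch directory).
cd "${PATCH_DIR}/../.."
for patch_file in "${PATCH_DIR}"/*.patch; do
    echo "Applying ${patch_file}"
    patch -p1 < "${patch_file}"
done
```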


# hbase-connectors-builder: Build the Spark HBase connector and copy
# required JARs into /stackable/spark/jars
FROM stackable/image/java-devel as hbase-connectors-builder

ARG PRODUCT
ARG HADOOP
ARG HBASE
ARG HBASE_CONNECTOR

WORKDIR /stackable

# Download the hbase-connectors source code
RUN <<EOF
curl https://repo.stackable.tech/repository/packages/hbase-connectors/hbase-connectors_${HBASE_CONNECTOR}.tar.gz \
| tar xz
ln -s hbase-connectors-rel-${HBASE_CONNECTOR} hbase-connectors
EOF

# Copy the pom.xml file from the patched Spark source code to read the
# versions used by Spark. The pom.xml defines child modules which are
# neither required nor copied here, so mvn must be invoked with the
# --non-recursive option.
COPY --chown=stackable:stackable --from=spark-source-builder \
/stackable/spark/pom.xml \
spark/

WORKDIR /stackable/hbase-connectors/spark

RUN <<EOF
# Building the hbase-connectors with JDK 17 is not yet supported, see
# https://github.com/apache/hbase-connectors/pull/132.
# As there are no JDK profiles, access to the non-public elements must
# be enabled with --add-opens, see https://openjdk.org/jeps/403 and
# https://openjdk.org/jeps/261#Breaking-encapsulation.
export JDK_JAVA_OPTIONS="\
--add-opens java.base/java.lang=ALL-UNNAMED \
--add-opens java.base/java.util=ALL-UNNAMED"

# Get the Scala version used by Spark
SCALA_VERSION=$( \
mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
-DforceStdout \
-Dexpression='project.properties(scala.version)')

# Get the Scala binary version used by Spark
SCALA_BINARY_VERSION=$( \
mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
-DforceStdout \
-Dexpression='project.properties(scala.binary.version)')

# Build the Spark HBase connector
# Skip the tests because the MiniHBaseCluster does not get ready for
# whatever reason:
# Caused by: java.lang.RuntimeException: Master not active after 30000ms
# at org.apache.hadoop.hbase.util.JVMClusterUtil.waitForEvent(JVMClusterUtil.java:221)
# at org.apache.hadoop.hbase.util.JVMClusterUtil.startup(JVMClusterUtil.java:177)
# at org.apache.hadoop.hbase.LocalHBaseCluster.startup(LocalHBaseCluster.java:407)
# at org.apache.hadoop.hbase.MiniHBaseCluster.init(MiniHBaseCluster.java:250)
mvn \
--batch-mode \
--no-transfer-progress \
--define spark.version="${PRODUCT}" \
--define scala.version="${SCALA_VERSION}" \
--define scala.binary.version="${SCALA_BINARY_VERSION}" \
--define hadoop-three.version="${HADOOP}" \
--define hbase.version="${HBASE}" \
--define skipTests \
clean package
EOF
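
Run by hand against a Spark checkout, the version lookup in the step above behaves as shown below; the printed value is illustrative for a Spark 3.5.x pom, not guaranteed, and the `scala.binary.version` query works the same way:

```bash
# Illustrative standalone invocation of the version lookup used above;
# the output depends entirely on the pom being queried.
mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
  org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
  -DforceStdout \
  -Dexpression='project.properties(scala.version)'
# prints e.g.: 2.12.18
```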

WORKDIR /stackable/spark/jars

RUN <<EOF
ln -s /stackable/hbase-connectors/spark/hbase-spark/target/hbase-spark-${HBASE_CONNECTOR}.jar

# Download log4j-slf4j-impl-x.x.x.jar containing the StaticLoggerBinder
# which is required by the connector.
# Spark contains only log4j-slf4j2-impl-x.x.x.jar but not
# log4j-slf4j-impl-x.x.x.jar. It is okay to have both JARs in the
# classpath as long as they have the same version.
mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
dependency:copy \
-Dartifact=org.apache.logging.log4j:log4j-slf4j-impl:'${log4j.version}' \
-DoutputDirectory=.
EOF
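
Note the single quotes around `${log4j.version}`: they stop the shell from expanding the placeholder, so Maven receives the literal property reference and resolves it against the Spark pom. Resolved by hand, the call would look like this (the version shown is only illustrative):

```bash
# Equivalent call with the property resolved by hand; 2.20.0 is an
# illustrative value, the real one comes from the pom's log4j.version.
mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
  dependency:copy \
  -Dartifact=org.apache.logging.log4j:log4j-slf4j-impl:2.20.0 \
  -DoutputDirectory=.
```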


# spark-builder: Build Spark into /stackable/spark-${PRODUCT}/dist,
# download additional JARs and perform checks, such as the log4shell check.
FROM stackable/image/java-devel as spark-builder

ARG PRODUCT
ARG HADOOP
ARG HBASE
ARG AWS_JAVA_SDK_BUNDLE
ARG AZURE_STORAGE
ARG AZURE_KEYVAULT_CORE
ARG JACKSON_DATAFORMAT_XML
ARG STAX2_API
ARG WOODSTOX_CORE
ARG JMX_EXPORTER
ARG TARGETARCH
ARG TINI

WORKDIR /stackable/spark-${PRODUCT}

COPY --chown=stackable:stackable --from=spark-source-builder \
/stackable/spark/ \
./

# >>> Build spark
# Compiling the tests takes a lot of time, so we skip them
@@ -37,12 +162,9 @@ COPY --chown=stackable:stackable spark-k8s/stackable/patches/${PRODUCT} /stackab
#
# This will download its own version of Maven because the UBI version is too old:
# 134.0 [ERROR] Detected Maven Version: 3.6.3 is not in the allowed range [3.8.8,)
RUN curl https://repo.stackable.tech/repository/packages/spark/spark-${PRODUCT}.tgz | tar -xzf - \
&& cd spark-${PRODUCT} \
&& ./patches/apply_patches.sh ${PRODUCT} \
&& export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" \
RUN export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" \
&& ./dev/make-distribution.sh \
-Dhadoop.version="$HADOOP_LONG_VERSION" \
-Dhadoop.version="$HADOOP" \
-Dmaven.test.skip=true \
-DskipTests \
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver
@@ -55,12 +177,40 @@ RUN curl -o /usr/bin/tini "https://repo.stackable.tech/repository/packages/tini/
# We download these under dist so that log4shell checks them
WORKDIR /stackable/spark-${PRODUCT}/dist/jars

# Download various modules for Hadoop (e.g. support for s3a:// and abfs://)
RUN curl -O https://repo.stackable.tech/repository/packages/aws/hadoop-aws-${HADOOP_LONG_VERSION}.jar \
&& curl -O https://repo.stackable.tech/repository/packages/aws/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE}.jar \
&& curl -O https://repo.stackable.tech/repository/packages/azure/hadoop-azure-${HADOOP_LONG_VERSION}.jar \
&& curl -O https://repo.stackable.tech/repository/packages/azure/azure-storage-${AZURE_STORAGE}.jar \
&& curl -O https://repo.stackable.tech/repository/packages/azure/azure-keyvault-core-${AZURE_KEYVAULT_CORE}.jar
# Copy modules required for s3a://
COPY --from=hadoop-builder --chown=stackable:stackable \
/stackable/hadoop/share/hadoop/tools/lib/hadoop-aws-${HADOOP}.jar \
/stackable/hadoop/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE}.jar \
./

# Copy modules required for abfs://
COPY --from=hadoop-builder --chown=stackable:stackable \
/stackable/hadoop/share/hadoop/tools/lib/hadoop-azure-${HADOOP}.jar \
/stackable/hadoop/share/hadoop/tools/lib/azure-storage-${AZURE_STORAGE}.jar \
/stackable/hadoop/share/hadoop/tools/lib/azure-keyvault-core-${AZURE_KEYVAULT_CORE}.jar \
./
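
With these connectors on the classpath, Spark jobs can address object stores directly via `s3a://` and `abfs://` URLs. A hedged example of an S3A-backed submission, where endpoint, credentials, and paths are all placeholders:

```bash
# Illustrative only: endpoint, credentials and bucket names are placeholders.
spark-submit \
  --conf spark.hadoop.fs.s3a.endpoint="https://s3.example.com" \
  --conf spark.hadoop.fs.s3a.access.key="${AWS_ACCESS_KEY_ID}" \
  --conf spark.hadoop.fs.s3a.secret.key="${AWS_SECRET_ACCESS_KEY}" \
  --conf spark.hadoop.fs.s3a.path.style.access=true \
  my_job.py  # e.g. spark.read.parquet("s3a://my-bucket/data/") inside
```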

# Copy the HBase connector including required modules
COPY --from=hbase-connectors-builder --chown=stackable:stackable \
/stackable/spark/jars/* \
./

# Copy modules required to access HBase
COPY --from=hbase-builder --chown=stackable:stackable \
/stackable/hbase/lib/shaded-clients/hbase-shaded-client-byo-hadoop-${HBASE}.jar \
/stackable/hbase/lib/shaded-clients/hbase-shaded-mapreduce-${HBASE}.jar \
./
# Copy modules required to access HBase if $HBASE == 2.4.x
COPY --from=hbase-builder --chown=stackable:stackable \
/stackable/hbase/lib/client-facing-thirdparty/htrace-core4-*-incubating.jar \
/stackable/hbase/lib/client-facing-thirdparty/slf4j-reload4j-*.jar \
./
# Copy modules required to access HBase if $HBASE == 2.6.x
COPY --from=hbase-builder --chown=stackable:stackable \
/stackable/hbase/lib/client-facing-thirdparty/opentelemetry-api-*.jar \
/stackable/hbase/lib/client-facing-thirdparty/opentelemetry-context-*.jar \
/stackable/hbase/lib/client-facing-thirdparty/opentelemetry-semconv-*-alpha.jar \
./
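
A quick smoke test that the connector and its companion JARs actually land in the final image; the image tag below is an assumption, use whatever tag the local build produced:

```bash
# Hypothetical tag; the final image keeps the distribution under /stackable/spark.
docker run --rm stackable/spark-k8s:3.5.2 \
  bash -c 'ls /stackable/spark/jars | grep -E "hbase-(spark|shaded)"'
```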

WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars

Expand Down Expand Up @@ -93,6 +243,7 @@ COPY shared/log4shell_scanner /bin/log4shell_scanner
RUN /bin/log4shell_scanner s /stackable/spark-${PRODUCT}/dist
# ===


FROM stackable/image/java-base as final

ARG PRODUCT
@@ -115,12 +266,12 @@ RUN microdnf update && \
hostname \
# required for spark startup scripts
procps \
python${PYTHON} \
python${PYTHON}-pip \
"python${PYTHON}" \
"python${PYTHON}-pip" \
zip \
# This is needed by the Spark UI to display process information using jps and jmap
# Copying the binaries from the builder stage failed.
java-${JAVA_VERSION}-openjdk-devel \
"java-${JAVA_VERSION}-openjdk-devel" \
&& microdnf clean all \
&& rm -rf /var/cache/yum

@@ -134,10 +285,10 @@ ENV PATH=$SPARK_HOME:$PATH:/bin:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$HOME/.local/b
ENV PYSPARK_PYTHON=/usr/bin/python
ENV PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH

COPY --chown=stackable:stackable --from=builder /stackable/spark-${PRODUCT}/dist /stackable/spark
COPY --chown=stackable:stackable --from=builder /stackable/spark-${PRODUCT}/assembly/target/bom.json /stackable/spark/spark-${PRODUCT}.cdx.json
COPY --chown=stackable:stackable --from=builder /stackable/jmx /stackable/jmx
COPY --from=builder /usr/bin/tini /usr/bin/tini
COPY --chown=stackable:stackable --from=spark-builder /stackable/spark-${PRODUCT}/dist /stackable/spark
COPY --chown=stackable:stackable --from=spark-builder /stackable/spark-${PRODUCT}/assembly/target/bom.json /stackable/spark/spark-${PRODUCT}.cdx.json
COPY --chown=stackable:stackable --from=spark-builder /stackable/jmx /stackable/jmx
COPY --from=spark-builder /usr/bin/tini /usr/bin/tini

RUN ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar \
# Symlink example jar, so that we can easily use it in tests
51 changes: 51 additions & 0 deletions spark-k8s/upload_new_hbase-connector_version.sh
@@ -0,0 +1,51 @@
#!/usr/bin/env bash

set -euo pipefail

VERSION=${1:?"Missing version number argument (arg 1)"}
NEXUS_USER=${2:?"Missing Nexus username argument (arg 2)"}

read -r -s -p "Nexus Password: " NEXUS_PASSWORD
echo ""

# https://stackoverflow.com/questions/4632028/how-to-create-a-temporary-directory
# Find the directory name of the script
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# the temp directory used, within $DIR
WORK_DIR=$(mktemp -d -p "$DIR")

# check if tmp dir was created
if [[ ! "$WORK_DIR" || ! -d "$WORK_DIR" ]]; then
echo "Could not create temp dir"
exit 1
fi

# deletes the temp directory
function cleanup {
rm -rf "$WORK_DIR"
}

# register the cleanup function to be called on the EXIT signal
trap cleanup EXIT

cd "$WORK_DIR" || exit

download_url="https://github.com/apache/hbase-connectors/archive/refs/tags/rel/${VERSION}.tar.gz"

tar_gz_file="hbase-connectors_${VERSION}.tar.gz"

echo "Downloading hbase-connectors source from ${download_url}"
curl --fail -L -o "${tar_gz_file}" "${download_url}"

echo "Uploading hbase-connectors source to Nexus"
EXIT_STATUS=0
curl --fail -u "$NEXUS_USER:$NEXUS_PASSWORD" --upload-file "${tar_gz_file}" 'https://repo.stackable.tech/repository/packages/hbase-connectors/' || EXIT_STATUS=$?

if [ $EXIT_STATUS -ne 0 ]; then
echo "ERROR: Upload failed"
exit 1
fi

echo "Successfully uploaded version $VERSION of hbase-connectors to Nexus"
echo "https://repo.stackable.tech/service/rest/repository/browse/packages/hbase-connectors/"
8 changes: 6 additions & 2 deletions spark-k8s/versions.py
@@ -4,7 +4,8 @@
"java-base": "17",
"java-devel": "17",
"python": "3.11",
"hadoop_long_version": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125
"hadoop": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125
"hbase": "2.4.18", # current Stackable LTS version
"aws_java_sdk_bundle": "1.12.262", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4
"azure_storage": "7.0.1", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure/3.3.4
"azure_keyvault_core": "1.0.0", # https://mvnrepository.com/artifact/com.microsoft.azure/azure-storage/7.0.1
@@ -14,13 +15,15 @@
"vector": "0.41.1",
"jmx_exporter": "1.0.1",
"tini": "0.19.0",
"hbase_connector": "1.0.1",
},
{
"product": "3.5.2",
"java-base": "17",
"java-devel": "17",
"python": "3.11",
"hadoop_long_version": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125
"hadoop": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125
"hbase": "2.4.18", # current Stackable LTS version
"aws_java_sdk_bundle": "1.12.262", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4
"azure_storage": "7.0.1", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure/3.3.4
"azure_keyvault_core": "1.0.0", # https://mvnrepository.com/artifact/com.microsoft.azure/azure-storage/7.0.1
@@ -30,5 +33,6 @@
"vector": "0.41.1",
"jmx_exporter": "1.0.1",
"tini": "0.19.0",
"hbase_connector": "1.0.1",
},
]
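
Each versions.py key is expected to surface as the upper-cased ARG of the same name in the Dockerfile (product becomes PRODUCT, hbase_connector becomes HBASE_CONNECTOR, and so on). A sketch of a manual build under that assumption; in practice the repository's image-build tooling injects these values and provides the stackable/image/* base images:

```bash
# Sketch only: assumes the stackable/image/* base images are already available
# locally; normally the repo's build tooling derives these flags from versions.py.
docker build --file spark-k8s/Dockerfile \
  --build-arg PRODUCT=3.5.2 \
  --build-arg HADOOP=3.3.4 \
  --build-arg HBASE=2.4.18 \
  --build-arg HBASE_CONNECTOR=1.0.1 \
  --tag spark-k8s:3.5.2 .
```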