diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b67a2477..1e8fdb9ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ All notable changes to this project will be documented in this file. - hbase: Add hadoop-azure.jar to the lib directory to support the Azure Blob Filesystem and the Azure Data Lake Storage ([#853]). - kafka: Add cyrus-sasl-gssapi package for kerberos ([#874]). +- spark: Add HBase connector ([#878]). ### Changed @@ -68,6 +69,7 @@ All notable changes to this project will be documented in this file. [#868]: https://github.com/stackabletech/docker-images/pull/868 [#874]: https://github.com/stackabletech/docker-images/pull/874 [#877]: https://github.com/stackabletech/docker-images/pull/877 +[#878]: https://github.com/stackabletech/docker-images/pull/878 [#879]: https://github.com/stackabletech/docker-images/pull/879 ## [24.7.0] - 2024-07-24 diff --git a/spark-k8s/Dockerfile b/spark-k8s/Dockerfile index c166d6e54..b62fc2ada 100644 --- a/spark-k8s/Dockerfile +++ b/spark-k8s/Dockerfile @@ -1,18 +1,16 @@ # syntax=docker/dockerfile:1.8.1@sha256:e87caa74dcb7d46cd820352bfea12591f3dba3ddc4285e19c7dcd13359f7cefd -FROM stackable/image/java-devel as builder +# hadoop-builder: Provides Hadoop libraries +FROM stackable/image/hadoop AS hadoop-builder + +# hbase-builder: Provides HBase libraries +FROM stackable/image/hbase AS hbase-builder + +# spark-source-builder: Download the Spark source code into +# /stackable/spark and apply the patches +FROM stackable/image/java-devel as spark-source-builder ARG PRODUCT -ARG HADOOP_LONG_VERSION -ARG AWS_JAVA_SDK_BUNDLE -ARG AZURE_STORAGE -ARG AZURE_KEYVAULT_CORE -ARG JACKSON_DATAFORMAT_XML -ARG STAX2_API -ARG WOODSTOX_CORE -ARG JMX_EXPORTER -ARG TARGETARCH -ARG TINI RUN <>> Build spark # Compiling the tests takes a lot of time, so we skip them @@ -37,12 +162,9 @@ COPY --chown=stackable:stackable spark-k8s/stackable/patches/${PRODUCT} /stackab # # This will download it's own version of maven because the UBI version is too old: # 134.0 [ERROR] Detected Maven Version: 3.6.3 is not in the allowed range [3.8.8,) -RUN curl https://repo.stackable.tech/repository/packages/spark/spark-${PRODUCT}.tgz | tar -xzf - \ - && cd spark-${PRODUCT} \ - && ./patches/apply_patches.sh ${PRODUCT} \ - && export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" \ +RUN export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" \ && ./dev/make-distribution.sh \ - -Dhadoop.version="$HADOOP_LONG_VERSION" \ + -Dhadoop.version="$HADOOP" \ -Dmaven.test.skip=true \ -DskipTests \ -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver @@ -55,12 +177,40 @@ RUN curl -o /usr/bin/tini "https://repo.stackable.tech/repository/packages/tini/ # We download these under dist so that log4shell checks them WORKDIR /stackable/spark-${PRODUCT}/dist/jars -# Download various modules for Hadoop (e.g. support for s3a:// and abfs://) -RUN curl -O https://repo.stackable.tech/repository/packages/aws/hadoop-aws-${HADOOP_LONG_VERSION}.jar \ - && curl -O https://repo.stackable.tech/repository/packages/aws/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE}.jar \ - && curl -O https://repo.stackable.tech/repository/packages/azure/hadoop-azure-${HADOOP_LONG_VERSION}.jar \ - && curl -O https://repo.stackable.tech/repository/packages/azure/azure-storage-${AZURE_STORAGE}.jar \ - && curl -O https://repo.stackable.tech/repository/packages/azure/azure-keyvault-core-${AZURE_KEYVAULT_CORE}.jar +# Copy modules required for s3a:// +COPY --from=hadoop-builder --chown=stackable:stackable \ + /stackable/hadoop/share/hadoop/tools/lib/hadoop-aws-${HADOOP}.jar \ + /stackable/hadoop/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE}.jar \ + ./ + +# Copy modules required for abfs:// +COPY --from=hadoop-builder --chown=stackable:stackable \ + /stackable/hadoop/share/hadoop/tools/lib/hadoop-azure-${HADOOP}.jar \ + /stackable/hadoop/share/hadoop/tools/lib/azure-storage-${AZURE_STORAGE}.jar \ + /stackable/hadoop/share/hadoop/tools/lib/azure-keyvault-core-${AZURE_KEYVAULT_CORE}.jar \ + ./ + +# Copy the HBase connector including required modules +COPY --from=hbase-connectors-builder --chown=stackable:stackable \ + /stackable/spark/jars/* \ + ./ + +# Copy modules required to access HBase +COPY --from=hbase-builder --chown=stackable:stackable \ + /stackable/hbase/lib/shaded-clients/hbase-shaded-client-byo-hadoop-${HBASE}.jar \ + /stackable/hbase/lib/shaded-clients/hbase-shaded-mapreduce-${HBASE}.jar \ + ./ +# Copy modules required to access HBase if $HBASE == 2.4.x +COPY --from=hbase-builder --chown=stackable:stackable \ + /stackable/hbase/lib/client-facing-thirdparty/htrace-core4-*-incubating.jar \ + /stackable/hbase/lib/client-facing-thirdparty/slf4j-reload4j-*.jar \ + ./ +# Copy modules required to access HBase if $HBASE == 2.6.x +COPY --from=hbase-builder --chown=stackable:stackable \ + /stackable/hbase/lib/client-facing-thirdparty/opentelemetry-api-*.jar \ + /stackable/hbase/lib/client-facing-thirdparty/opentelemetry-context-*.jar \ + /stackable/hbase/lib/client-facing-thirdparty/opentelemetry-semconv-*-alpha.jar \ + ./ WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars @@ -93,6 +243,7 @@ COPY shared/log4shell_scanner /bin/log4shell_scanner RUN /bin/log4shell_scanner s /stackable/spark-${PRODUCT}/dist # === + FROM stackable/image/java-base as final ARG PRODUCT @@ -115,12 +266,12 @@ RUN microdnf update && \ hostname \ # required for spark startup scripts procps \ - python${PYTHON} \ - python${PYTHON}-pip \ + "python${PYTHON}" \ + "python${PYTHON}-pip" \ zip \ # This is needed by the Spark UI to display process information using jps and jmap # Copying the binaries from the builder stage failed. - java-${JAVA_VERSION}-openjdk-devel \ + "java-${JAVA_VERSION}-openjdk-devel" \ && microdnf clean all \ && rm -rf /var/cache/yum @@ -134,10 +285,10 @@ ENV PATH=$SPARK_HOME:$PATH:/bin:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$HOME/.local/b ENV PYSPARK_PYTHON=/usr/bin/python ENV PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH -COPY --chown=stackable:stackable --from=builder /stackable/spark-${PRODUCT}/dist /stackable/spark -COPY --chown=stackable:stackable --from=builder /stackable/spark-${PRODUCT}/assembly/target/bom.json /stackable/spark/spark-${PRODUCT}.cdx.json -COPY --chown=stackable:stackable --from=builder /stackable/jmx /stackable/jmx -COPY --from=builder /usr/bin/tini /usr/bin/tini +COPY --chown=stackable:stackable --from=spark-builder /stackable/spark-${PRODUCT}/dist /stackable/spark +COPY --chown=stackable:stackable --from=spark-builder /stackable/spark-${PRODUCT}/assembly/target/bom.json /stackable/spark/spark-${PRODUCT}.cdx.json +COPY --chown=stackable:stackable --from=spark-builder /stackable/jmx /stackable/jmx +COPY --from=spark-builder /usr/bin/tini /usr/bin/tini RUN ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar \ # Symlink example jar, so that we can easily use it in tests diff --git a/spark-k8s/upload_new_hbase-connector_version.sh b/spark-k8s/upload_new_hbase-connector_version.sh new file mode 100755 index 000000000..6cc74533a --- /dev/null +++ b/spark-k8s/upload_new_hbase-connector_version.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash + +set -euo pipefail + +VERSION=${1:?"Missing version number argument (arg 1)"} +NEXUS_USER=${2:?"Missing Nexus username argument (arg 2)"} + +read -r -s -p "Nexus Password: " NEXUS_PASSWORD +echo "" + +# https://stackoverflow.com/questions/4632028/how-to-create-a-temporary-directory +# Find the directory name of the script +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +# the temp directory used, within $DIR +WORK_DIR=$(mktemp -d -p "$DIR") + +# check if tmp dir was created +if [[ ! "$WORK_DIR" || ! -d "$WORK_DIR" ]]; then + echo "Could not create temp dir" + exit 1 +fi + +# deletes the temp directory +function cleanup { + rm -rf "$WORK_DIR" +} + +# register the cleanup function to be called on the EXIT signal +trap cleanup EXIT + +cd "$WORK_DIR" || exit + +download_url="https://github.com/apache/hbase-connectors/archive/refs/tags/rel/${VERSION}.tar.gz" + +tar_gz_file="hbase-connectors_${VERSION}.tar.gz" + +echo "Downloading hbase-connectors source from ${download_url}" +curl --fail -L -o "${tar_gz_file}" "${download_url}" + +echo "Uploading hbase-connectors source to Nexus" +EXIT_STATUS=0 +curl --fail -u "$NEXUS_USER:$NEXUS_PASSWORD" --upload-file "${tar_gz_file}" 'https://repo.stackable.tech/repository/packages/hbase-connectors/' || EXIT_STATUS=$? + +if [ $EXIT_STATUS -ne 0 ]; then + echo "ERROR: Upload failed" + exit 1 +fi + +echo "Successfully uploaded version $VERSION of hbase-connectors to Nexus" +echo "https://repo.stackable.tech/service/rest/repository/browse/packages/hbase-connectors/" diff --git a/spark-k8s/versions.py b/spark-k8s/versions.py index bb9ad5b72..f586a2775 100644 --- a/spark-k8s/versions.py +++ b/spark-k8s/versions.py @@ -4,7 +4,8 @@ "java-base": "17", "java-devel": "17", "python": "3.11", - "hadoop_long_version": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125 + "hadoop": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125 + "hbase": "2.4.18", # current Stackable LTS version "aws_java_sdk_bundle": "1.12.262", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4 "azure_storage": "7.0.1", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure/3.3.4 "azure_keyvault_core": "1.0.0", # https://mvnrepository.com/artifact/com.microsoft.azure/azure-storage/7.0.1 @@ -14,13 +15,15 @@ "vector": "0.41.1", "jmx_exporter": "1.0.1", "tini": "0.19.0", + "hbase_connector": "1.0.1", }, { "product": "3.5.2", "java-base": "17", "java-devel": "17", "python": "3.11", - "hadoop_long_version": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125 + "hadoop": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125 + "hbase": "2.4.18", # current Stackable LTS version "aws_java_sdk_bundle": "1.12.262", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4 "azure_storage": "7.0.1", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure/3.3.4 "azure_keyvault_core": "1.0.0", # https://mvnrepository.com/artifact/com.microsoft.azure/azure-storage/7.0.1 @@ -30,5 +33,6 @@ "vector": "0.41.1", "jmx_exporter": "1.0.1", "tini": "0.19.0", + "hbase_connector": "1.0.1", }, ]