70 changes: 70 additions & 0 deletions .github/workflows/spark-base-image-build-push.yaml
@@ -0,0 +1,70 @@
name: Build and Push Docker Image

on:
  push:
    branches:
      - '**'
    paths:
      - 'spark-base-image/**'
  workflow_dispatch:

jobs:
  build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            nauedu/nau-analytics-base-spark
          tags: |
            type=sha

      - name: Build Docker image (no push yet)
        uses: docker/build-push-action@v6
        with:
          context: ./
          file: ./spark-base-image/Dockerfile
          load: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

      - name: Login to DockerHub
        if: ${{ github.event_name != 'pull_request' }}
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Push Docker image (branch-specific)
        if: ${{ github.event_name != 'pull_request' }}
        run: |
          set -eux
          IMAGE_NAME="nauedu/nau-analytics-base-spark"
          SHA_TAG=$(echo "${GITHUB_SHA}" | head -c7)

          if [ "${GITHUB_REF_NAME}" = "main" ]; then
            # Tag and push with 'latest' and SHA
            docker tag $IMAGE_NAME:sha-$SHA_TAG $IMAGE_NAME:latest
            docker tag $IMAGE_NAME:sha-$SHA_TAG $IMAGE_NAME:$SHA_TAG

            docker push $IMAGE_NAME:latest
            docker push $IMAGE_NAME:$SHA_TAG

            echo "✅ Pushed tags 'latest' and '$SHA_TAG' for main branch to https://hub.docker.com/r/$IMAGE_NAME/tags"
          else
            # Tag and push with branch name
            BRANCH_TAG=$(echo "${GITHUB_REF_NAME}" | tr '/' '-')
            docker tag $IMAGE_NAME:sha-$SHA_TAG $IMAGE_NAME:$BRANCH_TAG
            docker push $IMAGE_NAME:$BRANCH_TAG

            echo "✅ Pushed branch tag '$BRANCH_TAG' to https://hub.docker.com/r/$IMAGE_NAME/tags"
          fi
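Note: a quick way to confirm what this workflow published is to pull the tags locally (a minimal sketch, assuming the DockerHub repository is publicly readable; the SHA tag is the first 7 characters of the pushed commit):

    docker pull nauedu/nau-analytics-base-spark:latest
    docker pull nauedu/nau-analytics-base-spark:$(git rev-parse --short=7 HEAD)   # SHA tag, pushed from main only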
70 changes: 70 additions & 0 deletions .github/workflows/spark-shell-build-push.yaml
@@ -0,0 +1,70 @@
name: Build and Push Docker Image

on:
  push:
    branches:
      - '**'
    paths:
      - 'spark-shell/**'
  workflow_dispatch:

jobs:
  build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            nauedu/nau-analytics-spark-shell
          tags: |
            type=sha

      - name: Build Docker image (no push yet)
        uses: docker/build-push-action@v6
        with:
          context: ./
          file: ./spark-shell/Dockerfile
          load: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

      - name: Login to DockerHub
        if: ${{ github.event_name != 'pull_request' }}
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Push Docker image (branch-specific)
        if: ${{ github.event_name != 'pull_request' }}
        run: |
          set -eux
          IMAGE_NAME="nauedu/nau-analytics-spark-shell"
          SHA_TAG=$(echo "${GITHUB_SHA}" | head -c7)

          if [ "${GITHUB_REF_NAME}" = "main" ]; then
            # Tag and push with 'latest' and SHA
            docker tag $IMAGE_NAME:sha-$SHA_TAG $IMAGE_NAME:latest
            docker tag $IMAGE_NAME:sha-$SHA_TAG $IMAGE_NAME:$SHA_TAG

            docker push $IMAGE_NAME:latest
            docker push $IMAGE_NAME:$SHA_TAG

            echo "✅ Pushed tags 'latest' and '$SHA_TAG' for main branch to https://hub.docker.com/r/$IMAGE_NAME/tags"
          else
            # Tag and push with branch name
            BRANCH_TAG=$(echo "${GITHUB_REF_NAME}" | tr '/' '-')
            docker tag $IMAGE_NAME:sha-$SHA_TAG $IMAGE_NAME:$BRANCH_TAG
            docker push $IMAGE_NAME:$BRANCH_TAG

            echo "✅ Pushed branch tag '$BRANCH_TAG' to https://hub.docker.com/r/$IMAGE_NAME/tags"
          fi
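For non-main branches the image lands under a slugged branch tag (slashes replaced with dashes by the tr call above), so a hypothetical branch named feature/delta-upgrade would be pulled as:

    docker pull nauedu/nau-analytics-spark-shell:feature-delta-upgrade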
2 changes: 2 additions & 0 deletions spark-base-image/.dockerignore
@@ -0,0 +1,2 @@
**
!./*
119 changes: 119 additions & 0 deletions spark-base-image/Dockerfile
@@ -0,0 +1,119 @@
# syntax=docker/dockerfile:1
###########################################
# Stage 1: Build Python 3.11.6 from source
###########################################
FROM ubuntu:22.04 AS python-build
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHON_VERSION=3.11.6
ENV PREFIX=/usr/local
RUN apt-get update && apt-get install -y \
    build-essential \
    wget \
    zlib1g-dev \
    libncurses5-dev \
    libgdbm-dev \
    libnss3-dev \
    libssl-dev \
    libreadline-dev \
    libffi-dev \
    libsqlite3-dev \
    libbz2-dev \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /usr/src
RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
    && tar -xzf Python-${PYTHON_VERSION}.tgz
WORKDIR /usr/src/Python-${PYTHON_VERSION}
RUN ./configure --enable-optimizations --prefix=${PREFIX} \
    && make -j"$(nproc)" \
    && make altinstall
RUN ln -sf ${PREFIX}/bin/python3.11 /usr/local/bin/python \
    && ln -sf ${PREFIX}/bin/pip3.11 /usr/local/bin/pip

###########################################
# Stage 2: Get entrypoint from official Spark
###########################################
FROM apache/spark:3.5.7 AS spark-official

###########################################
# Stage 3: Spark + Delta + Cloud connectors
###########################################
FROM ubuntu:22.04 AS spark-base
ARG SPARK_VERSION=3.5.7
ARG HADOOP_VERSION=3
ARG DELTA_VERSION=3.2.1
ENV DEBIAN_FRONTEND=noninteractive
ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$PATH

# Install Java + basic utilities
RUN apt-get update && apt-get install -y \
    openjdk-11-jdk \
    curl \
    wget \
    bash \
    tini \
    ca-certificates \
    procps \
    && rm -rf /var/lib/apt/lists/*

# Copy compiled Python
COPY --from=python-build /usr/local /usr/local

# Copy entrypoint script from official Spark image
COPY --from=spark-official /opt/entrypoint.sh /opt/entrypoint.sh
COPY --from=spark-official /opt/decom.sh /opt/decom.sh
RUN chmod +x /opt/entrypoint.sh /opt/decom.sh

# Download Apache Spark prebuilt for Hadoop 3
WORKDIR /opt
RUN wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && tar -xzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \
    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz

# Add useful connectors (Delta, AWS, Azure, MySQL)
WORKDIR $SPARK_HOME/jars
RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
    wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.375/aws-java-sdk-bundle-1.12.375.jar && \
    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar && \
    wget https://repo1.maven.org/maven2/com/microsoft/azure/azure-storage/8.6.6/azure-storage-8.6.6.jar && \
    wget https://repo1.maven.org/maven2/com/azure/azure-storage-blob/12.24.0/azure-storage-blob-12.24.0.jar && \
    wget https://repo1.maven.org/maven2/com/azure/azure-identity/1.7.0/azure-identity-1.7.0.jar && \
    wget https://repo1.maven.org/maven2/com/azure/azure-core/1.42.0/azure-core-1.42.0.jar && \
    wget https://repo1.maven.org/maven2/io/delta/delta-spark_2.12/${DELTA_VERSION}/delta-spark_2.12-${DELTA_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/io/delta/delta-storage/${DELTA_VERSION}/delta-storage-${DELTA_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/io/delta/delta-kernel-api/${DELTA_VERSION}/delta-kernel-api-${DELTA_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.3.0/mysql-connector-j-8.3.0.jar

###########################################
# Stage 4: Final runtime image for K8s
###########################################
FROM spark-base AS final

# Set environment variables for PySpark
ENV PYSPARK_PYTHON=/usr/local/bin/python3.11
ENV PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11
ENV PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip"

# Install matching PySpark version and dependencies
RUN pip install --no-cache-dir \
    pyspark==3.5.7 \
    pandas \
    numpy

# Create non-root user for running Spark (matches official image)
RUN groupadd -r -g 185 spark && \
    useradd -r -u 185 -g 185 spark

# Create directory for Spark logs & local storage
RUN mkdir -p /opt/spark/work-dir && \
    chown -R spark:spark /opt/spark

# Switch to non-root user
USER 185

WORKDIR /opt/spark/work-dir
RUN mkdir src

ENTRYPOINT ["/opt/entrypoint.sh"]
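Since the Delta and cloud connector jars are baked into $SPARK_HOME/jars, a container from this image can run Delta-enabled PySpark without any --packages download at startup. A minimal local smoke test might look like the following (the tag is whichever one the workflow pushed; the official entrypoint falls through to pass-through mode for non driver/executor commands; the two --conf keys are the standard Delta Lake session settings):

    docker run --rm -it nauedu/nau-analytics-base-spark:latest \
      /opt/spark/bin/pyspark \
        --master 'local[*]' \
        --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension \
        --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog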
2 changes: 2 additions & 0 deletions spark-shell/.dockerignore
@@ -0,0 +1,2 @@
**
!./*
25 changes: 25 additions & 0 deletions spark-shell/Dockerfile
@@ -0,0 +1,25 @@
FROM ubuntu:22.04
# Set Spark and Hadoop versions
ENV SPARK_VERSION=3.5.7
ENV HADOOP_VERSION=3
# Install dependencies
RUN apt-get update && apt-get install -y \
    openjdk-11-jdk \
    curl \
    wget \
    bash \
    tini \
    ca-certificates \
    procps \
    && rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /opt
# Download and extract Spark
RUN wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && tar -xzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \
    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
# Set Spark bin in PATH
ENV PATH="/opt/spark/bin:${PATH}"
# Set entrypoint
ENTRYPOINT ["/usr/bin/tini", "--", "/bin/bash"]
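With tini as PID 1 and bash as the default command, running this image drops straight into an interactive shell where the Spark CLIs are already on PATH. A rough usage sketch (the master URL is a placeholder for a real cluster, not something defined in this PR):

    docker run --rm -it nauedu/nau-analytics-spark-shell:latest
    # inside the container:
    spark-shell --master spark://spark-master.example.com:7077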