
Commit 816b869

Merge pull request #1 from fccn/featurespark-shell-docker-image
Featurespark shell docker image
2 parents e947879 + 8752aea commit 816b869

6 files changed: 288 additions & 0 deletions

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
name: Build and Push Docker Image

on:
  push:
    branches:
      - '**'
    paths:
      - 'spark-base-image/**'
  workflow_dispatch:

jobs:
  build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            nauedu/nau-analytics-base-spark
          tags: |
            type=sha

      - name: Build Docker image (no push yet)
        uses: docker/build-push-action@v6
        with:
          context: ./
          file: ./spark-base-image/Dockerfile
          load: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

      - name: Login to DockerHub
        if: ${{ github.event_name != 'pull_request' }}
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Push Docker image (branch-specific)
        if: ${{ github.event_name != 'pull_request' }}
        run: |
          set -eux
          IMAGE_NAME="nauedu/nau-analytics-base-spark"
          SHA_TAG=$(echo "${GITHUB_SHA}" | head -c7)

          if [ "${GITHUB_REF_NAME}" = "main" ]; then
            # Tag and push with 'latest' and SHA
            docker tag $IMAGE_NAME:sha-$SHA_TAG $IMAGE_NAME:latest
            docker tag $IMAGE_NAME:sha-$SHA_TAG $IMAGE_NAME:$SHA_TAG

            docker push $IMAGE_NAME:latest
            docker push $IMAGE_NAME:$SHA_TAG

            echo "✅ Pushed tags 'latest' and '$SHA_TAG' for main branch to https://hub.docker.com/r/$IMAGE_NAME/tags"
          else
            # Tag and push with branch name
            BRANCH_TAG=$(echo "${GITHUB_REF_NAME}" | tr '/' '-')
            docker tag $IMAGE_NAME:sha-$SHA_TAG $IMAGE_NAME:$BRANCH_TAG
            docker push $IMAGE_NAME:$BRANCH_TAG

            echo "✅ Pushed branch tag '$BRANCH_TAG' to https://hub.docker.com/r/$IMAGE_NAME/tags"
          fi
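
How the publish step resolves tags: a push to main publishes both latest and a bare seven-character SHA tag, while a push to any other branch publishes a single tag derived from the branch name with '/' replaced by '-'. A minimal sketch of the resulting pulls, using this merge commit's SHA as an illustrative value:

# After a push to main at commit 816b869...:
docker pull nauedu/nau-analytics-base-spark:latest
docker pull nauedu/nau-analytics-base-spark:816b869

# After a push to a hypothetical branch named feature/tuning:
docker pull nauedu/nau-analytics-base-spark:feature-tuning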
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
name: Build and Push Docker Image

on:
  push:
    branches:
      - '**'
    paths:
      - 'spark-shell/**'
  workflow_dispatch:

jobs:
  build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            nauedu/nau-analytics-spark-shell
          tags: |
            type=sha

      - name: Build Docker image (no push yet)
        uses: docker/build-push-action@v6
        with:
          context: ./
          file: ./spark-shell/Dockerfile
          load: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

      - name: Login to DockerHub
        if: ${{ github.event_name != 'pull_request' }}
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Push Docker image (branch-specific)
        if: ${{ github.event_name != 'pull_request' }}
        run: |
          set -eux
          IMAGE_NAME="nauedu/nau-analytics-spark-shell"
          SHA_TAG=$(echo "${GITHUB_SHA}" | head -c7)

          if [ "${GITHUB_REF_NAME}" = "main" ]; then
            # Tag and push with 'latest' and SHA
            docker tag $IMAGE_NAME:sha-$SHA_TAG $IMAGE_NAME:latest
            docker tag $IMAGE_NAME:sha-$SHA_TAG $IMAGE_NAME:$SHA_TAG

            docker push $IMAGE_NAME:latest
            docker push $IMAGE_NAME:$SHA_TAG

            echo "✅ Pushed tags 'latest' and '$SHA_TAG' for main branch to https://hub.docker.com/r/$IMAGE_NAME/tags"
          else
            # Tag and push with branch name
            BRANCH_TAG=$(echo "${GITHUB_REF_NAME}" | tr '/' '-')
            docker tag $IMAGE_NAME:sha-$SHA_TAG $IMAGE_NAME:$BRANCH_TAG
            docker push $IMAGE_NAME:$BRANCH_TAG

            echo "✅ Pushed branch tag '$BRANCH_TAG' to https://hub.docker.com/r/$IMAGE_NAME/tags"
          fi
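
Both workflows couple the build and push steps through the tag name: docker/metadata-action's type=sha tags the locally loaded image as sha-<short SHA>, and the push script reconstructs that suffix by hand. A sketch of the assumption, with an illustrative SHA:

# metadata-action (type=sha) has tagged the loaded image as, e.g.:
#   nauedu/nau-analytics-spark-shell:sha-816b869
# The push script rebuilds the same suffix from the full commit SHA:
GITHUB_SHA=816b869e0000000000000000000000000000000  # illustrative value
SHA_TAG=$(echo "${GITHUB_SHA}" | head -c7)          # -> 816b869
docker tag "nauedu/nau-analytics-spark-shell:sha-${SHA_TAG}" \
  "nauedu/nau-analytics-spark-shell:latest"

If the action's short-SHA length ever diverged from the hard-coded head -c7, the docker tag source would not exist and the push step would fail; the two must stay in sync.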

spark-base-image/.dockerignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
# Ignore everything, then re-include entries at the top level
**
!./*

spark-base-image/Dockerfile

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
# syntax=docker/dockerfile:1
###########################################
# Stage 1: Build Python 3.11.6 from source
###########################################
FROM ubuntu:22.04 AS python-build
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHON_VERSION=3.11.6
ENV PREFIX=/usr/local
RUN apt-get update && apt-get install -y \
    build-essential \
    wget \
    zlib1g-dev \
    libncurses5-dev \
    libgdbm-dev \
    libnss3-dev \
    libssl-dev \
    libreadline-dev \
    libffi-dev \
    libsqlite3-dev \
    libbz2-dev \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /usr/src
RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
    && tar -xzf Python-${PYTHON_VERSION}.tgz
WORKDIR /usr/src/Python-${PYTHON_VERSION}
RUN ./configure --enable-optimizations --prefix=${PREFIX} \
    && make -j"$(nproc)" \
    && make altinstall
RUN ln -sf ${PREFIX}/bin/python3.11 /usr/local/bin/python \
    && ln -sf ${PREFIX}/bin/pip3.11 /usr/local/bin/pip

###########################################
# Stage 2: Get entrypoint from official Spark
###########################################
FROM apache/spark:3.5.7 AS spark-official

###########################################
# Stage 3: Spark + Delta + Cloud connectors
###########################################
FROM ubuntu:22.04 AS spark-base
ARG SPARK_VERSION=3.5.7
ARG HADOOP_VERSION=3
ARG DELTA_VERSION=3.2.1
ENV DEBIAN_FRONTEND=noninteractive
ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$PATH

# Install Java + basic utilities
RUN apt-get update && apt-get install -y \
    openjdk-11-jdk \
    curl \
    wget \
    bash \
    tini \
    ca-certificates \
    procps \
    && rm -rf /var/lib/apt/lists/*

# Copy compiled Python
COPY --from=python-build /usr/local /usr/local

# Copy entrypoint script from official Spark image
COPY --from=spark-official /opt/entrypoint.sh /opt/entrypoint.sh
COPY --from=spark-official /opt/decom.sh /opt/decom.sh
RUN chmod +x /opt/entrypoint.sh /opt/decom.sh

# Download Apache Spark prebuilt for Hadoop 3
WORKDIR /opt
RUN wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && tar -xzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \
    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz

# Add useful connectors (Delta, AWS, Azure, MySQL)
WORKDIR $SPARK_HOME/jars
RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
    wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.375/aws-java-sdk-bundle-1.12.375.jar && \
    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar && \
    wget https://repo1.maven.org/maven2/com/microsoft/azure/azure-storage/8.6.6/azure-storage-8.6.6.jar && \
    wget https://repo1.maven.org/maven2/com/azure/azure-storage-blob/12.24.0/azure-storage-blob-12.24.0.jar && \
    wget https://repo1.maven.org/maven2/com/azure/azure-identity/1.7.0/azure-identity-1.7.0.jar && \
    wget https://repo1.maven.org/maven2/com/azure/azure-core/1.42.0/azure-core-1.42.0.jar && \
    wget https://repo1.maven.org/maven2/io/delta/delta-spark_2.12/${DELTA_VERSION}/delta-spark_2.12-${DELTA_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/io/delta/delta-storage/${DELTA_VERSION}/delta-storage-${DELTA_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/io/delta/delta-kernel-api/${DELTA_VERSION}/delta-kernel-api-${DELTA_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.3.0/mysql-connector-j-8.3.0.jar

###########################################
# Stage 4: Final runtime image for K8s
###########################################
FROM spark-base AS final

# Set environment variables for PySpark
ENV PYSPARK_PYTHON=/usr/local/bin/python3.11
ENV PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11
ENV PYTHONPATH=""
ENV PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip:${PYTHONPATH}"

# Install matching PySpark version and dependencies
RUN pip install --no-cache-dir \
    pyspark==3.5.7 \
    pandas \
    numpy

# Create non-root user for running Spark (matches official image)
RUN groupadd -r -g 185 spark && \
    useradd -r -u 185 -g 185 spark

# Create directory for Spark logs & local storage
RUN mkdir -p /opt/spark/work-dir && \
    chown -R spark:spark /opt/spark

# Switch to non-root user
USER 185

WORKDIR /opt/spark/work-dir
RUN mkdir src

ENTRYPOINT ["/opt/entrypoint.sh"]
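
A local smoke test for this image, as a sketch: the :dev tag is illustrative, and the two --conf options are the standard Delta Lake session configuration rather than defaults baked into the image. The entrypoint inherited from apache/spark passes commands other than driver/executor through, so PySpark can be launched directly:

# Build from the repository root, matching the workflow's context/file settings:
docker build -f spark-base-image/Dockerfile -t nauedu/nau-analytics-base-spark:dev .

# Start PySpark with the bundled Delta jars enabled for the session:
docker run --rm -it nauedu/nau-analytics-base-spark:dev \
  /opt/spark/bin/pyspark \
  --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension \
  --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog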

spark-shell/.dockerignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
# Ignore everything, then re-include entries at the top level
**
!./*

spark-shell/Dockerfile

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
FROM ubuntu:22.04

# Set Spark and Hadoop versions
ENV SPARK_VERSION=3.5.7
ENV HADOOP_VERSION=3

# Install dependencies
RUN apt-get update && apt-get install -y \
    openjdk-11-jdk \
    curl \
    wget \
    bash \
    tini \
    ca-certificates \
    procps \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /opt

# Download and extract Spark
RUN wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && tar -xzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \
    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz

# Set Spark bin in PATH
ENV PATH="/opt/spark/bin:${PATH}"

# Set entrypoint
ENTRYPOINT ["/usr/bin/tini", "--", "/bin/bash"]
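
A minimal usage sketch for the shell image (the :dev tag is illustrative):

# Build from the repository root, as the companion workflow does:
docker build -f spark-shell/Dockerfile -t nauedu/nau-analytics-spark-shell:dev .

# tini supervises an interactive bash; Spark's CLIs are on PATH inside:
docker run --rm -it nauedu/nau-analytics-spark-shell:dev

# Then, inside the container:
spark-shell --version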
