@@ -1,108 +1,4 @@
-# syntax=docker/dockerfile:1
-
-# ##########################################
-# Stage 1: Build Python 3.11.6 from source
-# ##########################################
-FROM ubuntu:22.04 AS python-build
-
-ENV DEBIAN_FRONTEND=noninteractive
-ENV PYTHON_VERSION=3.11.6
-ENV PREFIX=/usr/local
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    wget \
-    zlib1g-dev \
-    libncurses5-dev \
-    libgdbm-dev \
-    libnss3-dev \
-    libssl-dev \
-    libreadline-dev \
-    libffi-dev \
-    libsqlite3-dev \
-    libbz2-dev \
-    && rm -rf /var/lib/apt/lists/*
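# Note: everything in this list is a build-time-only dependency of CPython, so
# adding --no-install-recommends to the apt-get install above would keep this
# throwaway stage slimmer (assuming no package relies on a recommended extra).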
-
-WORKDIR /usr/src
-
-RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
-    && tar -xzf Python-${PYTHON_VERSION}.tgz
-
-WORKDIR /usr/src/Python-${PYTHON_VERSION}
-
-RUN ./configure --enable-optimizations --prefix=${PREFIX} \
-    && make -j"$(nproc)" \
-    && make altinstall
-
-RUN ln -sf ${PREFIX}/bin/python3.11 /usr/local/bin/python \
-    && ln -sf ${PREFIX}/bin/pip3.11 /usr/local/bin/pip
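# A smoke test after the symlinks would catch a broken build early; a minimal
# sketch (note that lzma support would additionally need liblzma-dev in the
# package list above):
#   RUN python --version && pip --version \
#       && python -c "import ssl, sqlite3, bz2, readline, ctypes"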
-
-# ##########################################
-# Stage 2: Get entrypoint from official Spark
-# ##########################################
-FROM apache/spark:3.5.7 AS spark-official
-
-# ##########################################
-# Stage 3: Spark + Delta + Cloud connectors
-# ##########################################
-FROM ubuntu:22.04 AS spark-base
-
-ARG SPARK_VERSION=3.5.7
-ARG HADOOP_VERSION=3
-ARG DELTA_VERSION=3.2.1
-
-ENV DEBIAN_FRONTEND=noninteractive
-ENV SPARK_HOME=/opt/spark
-ENV PATH="${SPARK_HOME}/bin:${PATH}"
-
-# Java + utils
-RUN apt-get update && apt-get install -y \
-    openjdk-11-jdk \
-    curl \
-    wget \
-    bash \
-    tini \
-    ca-certificates \
-    procps \
-    && rm -rf /var/lib/apt/lists/*
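# JAVA_HOME is never set in this stage; Spark finds java on PATH, but tooling
# that requires the variable could get it with a line like this (sketch; the
# path assumes the amd64 Ubuntu package layout):
#   ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64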
-
-# Copy Python from build stage
-COPY --from=python-build /usr/local /usr/local
-
-# Copy entrypoint scripts from official Spark image
-COPY --from=spark-official /opt/entrypoint.sh /opt/entrypoint.sh
-COPY --from=spark-official /opt/decom.sh /opt/decom.sh
-RUN chmod +x /opt/entrypoint.sh /opt/decom.sh
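# For context: in the official image, entrypoint.sh is what Spark-on-Kubernetes
# invokes to start the container as a driver or executor (dispatching on its
# arguments), and decom.sh handles executor decommissioning.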
-
-# Download Apache Spark prebuilt for Hadoop 3
-WORKDIR /opt
-RUN wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
-    && tar -xzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
-    && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \
-    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
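# The tarball is fetched unverified; checking it against the published SHA-512
# would harden the build. A sketch, assuming the release is still on the main
# mirror and the .sha512 file is in a sha512sum-compatible format:
#   RUN wget -q https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz.sha512 \
#       && sha512sum -c spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz.sha512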
-
-# Add connectors (Delta, AWS, Azure, MySQL)
-WORKDIR ${SPARK_HOME}/jars
-RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
-    wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.375/aws-java-sdk-bundle-1.12.375.jar && \
-    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar && \
-    wget https://repo1.maven.org/maven2/com/microsoft/azure/azure-storage/8.6.6/azure-storage-8.6.6.jar && \
-    wget https://repo1.maven.org/maven2/com/azure/azure-storage-blob/12.24.0/azure-storage-blob-12.24.0.jar && \
-    wget https://repo1.maven.org/maven2/com/azure/azure-identity/1.7.0/azure-identity-1.7.0.jar && \
-    wget https://repo1.maven.org/maven2/com/azure/azure-core/1.42.0/azure-core-1.42.0.jar && \
-    wget https://repo1.maven.org/maven2/io/delta/delta-spark_2.12/${DELTA_VERSION}/delta-spark_2.12-${DELTA_VERSION}.jar && \
-    wget https://repo1.maven.org/maven2/io/delta/delta-storage/${DELTA_VERSION}/delta-storage-${DELTA_VERSION}.jar && \
-    wget https://repo1.maven.org/maven2/io/delta/delta-kernel-api/${DELTA_VERSION}/delta-kernel-api-${DELTA_VERSION}.jar && \
-    wget https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.3.0/mysql-connector-j-8.3.0.jar
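# Placing the Delta jars on the classpath is not enough by itself: the session
# must also enable the extension and catalog. These two keys are the documented
# Delta Lake settings and would typically go in ${SPARK_HOME}/conf/spark-defaults.conf:
#   spark.sql.extensions             io.delta.sql.DeltaSparkSessionExtension
#   spark.sql.catalog.spark_catalog  org.apache.spark.sql.delta.catalog.DeltaCatalog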
-
-# ##########################################
-# Stage 4: Final runtime image for K8s + Jupyter
-# ##########################################
-FROM spark-base AS final
-
-# Non-root user with home dir
-RUN groupadd -r -g 185 spark && \
-    useradd -m -r -u 185 -g 185 -d /home/spark spark
+FROM nauedu/nau-analytics-base-spark:latest
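# `latest` is a mutable tag; pinning a version tag or an image digest would
# make rebuilds reproducible, e.g. (sketch with an illustrative placeholder):
#   FROM nauedu/nau-analytics-base-spark@sha256:<digest>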
 
 # Env for Jupyter + PySpark
 ENV HOME=/home/spark \
@@ -114,9 +10,6 @@ ENV HOME=/home/spark \
 
 # PySpark + JupyterLab + libs
 RUN pip install --no-cache-dir \
-    pyspark==3.5.7 \
-    pandas \
-    numpy \
     jupyterlab==4.2.5
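# pyspark, pandas and numpy disappear from this install on the assumption that
# the new base image already ships them; a build-time guard would catch a base
# tag that does not (sketch):
#   RUN python -c "import pyspark, pandas, numpy"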
 
 # Dirs Jupyter + notebooks