# syntax=docker/dockerfile:1
# ##########################################
# Stage 1: Build Python 3.11.6 from source
# ##########################################
FROM ubuntu:22.04 AS python-build
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHON_VERSION=3.11.6
ENV PREFIX=/usr/local
RUN apt-get update && apt-get install -y \
    build-essential \
    wget \
    zlib1g-dev \
    libncurses5-dev \
    libgdbm-dev \
    libnss3-dev \
    libssl-dev \
    libreadline-dev \
    libffi-dev \
    libsqlite3-dev \
    libbz2-dev \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /usr/src
RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
    && tar -xzf Python-${PYTHON_VERSION}.tgz
WORKDIR /usr/src/Python-${PYTHON_VERSION}
RUN ./configure --enable-optimizations --prefix=${PREFIX} \
    && make -j"$(nproc)" \
    && make altinstall
RUN ln -sf ${PREFIX}/bin/python3.11 /usr/local/bin/python \
    && ln -sf ${PREFIX}/bin/pip3.11 /usr/local/bin/pip
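
# Note: `make altinstall` installs only the versioned binaries (python3.11, pip3.11) and
# leaves any unversioned `python3` untouched, which is why the symlinks above expose them
# explicitly. Optional sanity check (a sketch, not part of the original build):
# RUN python --version && pip --version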

# ##########################################
# Stage 2: Get entrypoint from official Spark
# ##########################################
FROM apache/spark:3.5.7 AS spark-official

# ##########################################
# Stage 3: Spark + Delta + Cloud connectors
# ##########################################
FROM ubuntu:22.04 AS spark-base
ARG SPARK_VERSION=3.5.7
ARG HADOOP_VERSION=3
ARG DELTA_VERSION=3.2.1
ENV DEBIAN_FRONTEND=noninteractive
ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$PATH

# Install Java + basic utilities
RUN apt-get update && apt-get install -y \
    openjdk-11-jdk \
    curl \
    wget \
    bash \
    tini \
    ca-certificates \
    procps \
    && rm -rf /var/lib/apt/lists/*

# Copy compiled Python
COPY --from=python-build /usr/local /usr/local

# Copy the entrypoint and decommission scripts from the official Spark image
COPY --from=spark-official /opt/entrypoint.sh /opt/entrypoint.sh
COPY --from=spark-official /opt/decom.sh /opt/decom.sh
RUN chmod +x /opt/entrypoint.sh /opt/decom.sh
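# (The official entrypoint.sh dispatches on the first argument -- "driver" or "executor" --
# passed by the Spark-on-Kubernetes scheduler and launches the JVM under /usr/bin/tini,
# which Ubuntu's tini package installed above provides.)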

# Download Apache Spark prebuilt for Hadoop 3
WORKDIR /opt
RUN wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && tar -xzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \
    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz

# Add useful connectors (Delta, AWS, Azure, MySQL)
WORKDIR $SPARK_HOME/jars
RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
    wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.375/aws-java-sdk-bundle-1.12.375.jar && \
    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar && \
    wget https://repo1.maven.org/maven2/com/microsoft/azure/azure-storage/8.6.6/azure-storage-8.6.6.jar && \
    wget https://repo1.maven.org/maven2/com/azure/azure-storage-blob/12.24.0/azure-storage-blob-12.24.0.jar && \
    wget https://repo1.maven.org/maven2/com/azure/azure-identity/1.7.0/azure-identity-1.7.0.jar && \
    wget https://repo1.maven.org/maven2/com/azure/azure-core/1.42.0/azure-core-1.42.0.jar && \
    wget https://repo1.maven.org/maven2/io/delta/delta-spark_2.12/${DELTA_VERSION}/delta-spark_2.12-${DELTA_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/io/delta/delta-storage/${DELTA_VERSION}/delta-storage-${DELTA_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/io/delta/delta-kernel-api/${DELTA_VERSION}/delta-kernel-api-${DELTA_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.3.0/mysql-connector-j-8.3.0.jar
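
# The jars above only place the connectors on the classpath; a Delta/S3A job still needs
# the usual session settings. A minimal sketch (not part of the original image -- these
# settings can equally be passed per job via spark-submit --conf):
# RUN printf '%s\n' \
#     'spark.sql.extensions io.delta.sql.DeltaSparkSessionExtension' \
#     'spark.sql.catalog.spark_catalog org.apache.spark.sql.delta.catalog.DeltaCatalog' \
#     'spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem' \
#     >> $SPARK_HOME/conf/spark-defaults.conf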

# ##########################################
# Stage 4: Final runtime image for K8s
# ##########################################
FROM spark-base AS final

# Set environment variables for PySpark
ENV PYSPARK_PYTHON=/usr/local/bin/python3.11
ENV PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11
ENV PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip"
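# (py4j-0.10.9.7 is the version bundled under ${SPARK_HOME}/python/lib in Spark 3.5.x;
# keep the zip name above in sync if SPARK_VERSION changes.)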

# Install matching PySpark version and dependencies
RUN pip install --no-cache-dir \
    pyspark==3.5.7 \
    pandas \
    numpy

# Create non-root user for running Spark (matches official image)
RUN groupadd -r -g 185 spark && \
    useradd -r -u 185 -g 185 spark

# Create directory for Spark logs & local storage
RUN mkdir -p /opt/spark/work-dir && \
    chown -R spark:spark /opt/spark

# Switch to non-root user
USER 185

WORKDIR /opt/spark/work-dir
RUN mkdir src

ENTRYPOINT ["/opt/entrypoint.sh"]
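
# Example usage (a sketch only; the registry/tag, API server address, service account,
# and job path are assumptions, not part of this Dockerfile):
#   docker build -t my-registry/spark-delta:3.5.7 .
#   spark-submit \
#     --master k8s://https://<api-server>:6443 \
#     --deploy-mode cluster \
#     --conf spark.kubernetes.container.image=my-registry/spark-delta:3.5.7 \
#     --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
#     local:///opt/spark/work-dir/src/job.py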