1+ # syntax=docker/dockerfile:1
2+
13ARG PYTHON_VERSION=3.13
24FROM python:$PYTHON_VERSION-slim-bookworm AS base
35
@@ -8,14 +10,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
810 krb5-user \
911 && rm -rf /var/lib/apt/lists/* /var/cache/*
1012
13+ RUN useradd syncmaster --create-home && \
14+ mkdir -p /home/syncmaster/.ivy2/cache && \
15+ mkdir -p /home/syncmaster/.ivy2/jars && \
16+ chown -R syncmaster:syncmaster /home/syncmaster
17+
1118WORKDIR /app
1219ENV PYTHONPATH=/app \
13- PATH="/app/.venv/bin:$PATH" \
14- PYTHONUNBUFFERED=1
15-
16- COPY ./docker/entrypoint_worker.sh /app/entrypoint.sh
17- RUN chmod +x /app/entrypoint.sh
18- ENTRYPOINT ["/app/entrypoint.sh"]
20+ PATH="/app/.venv/bin:$PATH"
1921
2022
2123FROM base AS builder
@@ -30,21 +32,27 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
3032
3133COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/bin/uv
3234
33- COPY ./pyproject.toml ./uv.lock ./
34- RUN --mount=type=cache,target=/root/.cache/uv \
35+ RUN --mount=type=bind,source=./pyproject.toml,target=/app/pyproject.toml \
36+ --mount=type=bind,source=./uv.lock,target=/app/uv.lock \
37+ --mount=type=cache,target=/root/.cache/uv \
3538 uv sync \
3639 --frozen \
3740 --no-install-project \
41+ --link-mode copy \
3842 --extra "worker" \
3943 --extra "kerberos" \
4044 --compile-bytecode
4145
4246
4347FROM builder AS ivy2_packages
4448
49+ RUN apt-get update && apt-get install -y --no-install-recommends \
50+ rsync \
51+ && rm -rf /var/lib/apt/lists/* /var/cache/*
52+
4553RUN --mount=type=bind,source=./syncmaster/worker/ivy2.py,target=/app/syncmaster/worker/ivy2.py \
4654 --mount=type=bind,source=./docker/download_ivy2_packages.py,target=/app/docker/download_ivy2_packages.py \
47- --mount=type=cache,target=/root/.ivy2/ \
55+ --mount=type=cache,target=/root/.ivy2 \
4856 # Try to download all dependencies at once.
4957 # If multiple packages depend on the same transitive dependency, Spark uses the maximum version of this dependency.
5058 python /app/docker/download_ivy2_packages.py all && \
@@ -58,53 +66,70 @@ RUN --mount=type=bind,source=./syncmaster/worker/ivy2.py,target=/app/syncmaster/
5866 python /app/docker/download_ivy2_packages.py oracle && \
5967 python /app/docker/download_ivy2_packages.py mssql && \
6068 python /app/docker/download_ivy2_packages.py mysql && \
61- mkdir -p /home/syncmaster/.ivy2/ && \
62- cp --recursive /root/.ivy2/* /home/syncmaster/.ivy2/
63- # if someone uses custom worker image, they should download jars on their own
69+ mkdir -p /home/syncmaster/.ivy2/cache/ && \
70+ rsync \
71+ --archive \
72+ --times \
73+ --omit-dir-times \
74+ # ivydata-$version.properties contains download time, avoid copying it to prevent layer cache invalidation
75+ --exclude 'ivydata*.properties' \
76+ # ignored by Spark
77+ --exclude 'ivyreport*' \
78+ # do not copy ~/.ivy2/jars/$group.$artifact.jar, as these are the same files as in ~/.ivy2/cache/$group/$artifact/jars/
79+ /root/.ivy2/cache/ /home/syncmaster/.ivy2/cache/ && \
80+ # reset directory timestamps
81+ find /home/syncmaster/.ivy2/cache/ -type d -exec touch @0 {} \; && \
82+ # a custom Spark session function may download additional jars, so the user has to own the directories, but not the jar files themselves
83+ find /home/syncmaster/.ivy2/ -type d -exec chmod 777 {} \;
84+
85+ RUN mkdir -p /root && ln -s /home/syncmaster/.ivy2 /root/.ivy2
6486
6587
6688FROM base AS prod
6789
68- # Do not run production as root, to improve security.
69- # Also user does not own anything inside the image, including venv and source code.
70- RUN useradd syncmaster && \
71- mkdir -p /home/syncmaster /home/syncmaster/.ivy2 && \
72- chown -R syncmaster:syncmaster /home/syncmaster
73-
90+ # place python dependencies after .ivy2 because the latter is twice as heavy
7491COPY --from=builder /app/.venv/ /app/.venv/
75- # custom Spark session function may download different jars, so syncmaster have to own them
76- COPY --from=ivy2_packages --chown=syncmaster:syncmaster /home/syncmaster/.ivy2/ /home/syncmaster/.ivy2/
92+
93+ # using --link to make ~/.ivy2 a separate layer in the Docker image, not based on previous layers
94+ COPY --link --from=ivy2_packages /home/syncmaster/.ivy2/cache/ /home/syncmaster/.ivy2/cache/
7795# If someone needs to use the worker image with the root user, reuse the same jars
78- RUN mkdir -p /root && \
79- ln -s /home/syncmaster/.ivy2 /root/.ivy2
96+ RUN mkdir -p /root && ln -s /home/syncmaster/.ivy2 /root/.ivy2
8097
98+ COPY ./pyproject.toml ./uv.lock /app/syncmaster/
99+ COPY --chmod=755 ./docker/entrypoint_worker.sh /app/entrypoint.sh
81100COPY ./syncmaster/ /app/syncmaster/
82- RUN python -m compileall syncmaster
101+ RUN python -m compileall /app/syncmaster
102+ ENTRYPOINT ["/app/entrypoint.sh"]
103+ # Do not run production as root, to improve security.
104+ # Also the user does not own anything inside the image, including the venv and source code.
83105USER syncmaster
84106
85107
86108FROM ivy2_packages AS test
87109
88- RUN mkdir -p /root && \
89- ln -s /home/syncmaster/.ivy2 /root/.ivy2
90-
91- RUN --mount=type=cache,target=/root/.cache/uv \
110+ RUN --mount=type=bind,source=./pyproject.toml,target=/app/pyproject.toml \
111+ --mount=type=bind,source=./uv.lock,target=/app/uv.lock \
112+ --mount=type=cache,target=/root/.cache/uv \
92113 uv sync \
93114 --frozen \
94115 --no-install-project \
116+ --link-mode copy \
95117 # CI runs tests in the worker container,
96118 # so we need server & scheduler dependencies too
97119 --all-extras \
98120 --group "test" \
99121 --compile-bytecode
100122
101- ENV SYNCMASTER__WORKER__CREATE_SPARK_SESSION_FUNCTION=tests.spark.get_worker_spark_session
102-
103- # Collect coverage from worker
123+ COPY ./pyproject.toml ./uv.lock /app/syncmaster/
124+ COPY --chmod=755 ./docker/entrypoint_worker.sh /app/entrypoint.sh
104125RUN sed -i 's/python -m/coverage run -m/g' /app/entrypoint.sh
126+ ENTRYPOINT ["/app/entrypoint.sh"]
105127
106128# Replace kinit binary with dummy, to skip Kerberos interaction in tests
107129RUN mkdir -p /app/.local/bin && \
108130 echo "#!/bin/bash" > /app/.local/bin/kinit \
109131 && chmod +x /app/.local/bin/kinit
110132ENV PATH="/app/.local/bin:$PATH"
133+
134+ # use a custom Spark session factory
135+ ENV SYNCMASTER__WORKER__CREATE_SPARK_SESSION_FUNCTION=tests.spark.get_worker_spark_session
0 commit comments