# Dockerfile
# PAWN — multi-GPU Docker image for RunPod and bare-metal workloads
#
# Built automatically by CI on merge to main and pushed to Docker Hub.
# All targets use python:3.12-slim as the base. PyTorch cu128 wheels
# bundle their own CUDA runtime; PyTorch ROCm wheels bundle their own
# ROCm/HIP libraries. No nvidia/cuda or rocm base image needed — the
# only host requirements are the GPU kernel drivers.
#
# Targets:
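# runtime — CUDA production image (the intended default; always pass
# --target, because a bare `docker build` builds the last stage, dev-rocm)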
# runtime-rocm — ROCm production image
# dev — CUDA dev image (non-root, Claude Code, tmux)
# dev-rocm — ROCm dev image
#
# Usage:
# docker build --target runtime -t pawn:latest .
# docker build --target runtime-rocm -t pawn:rocm .
# docker build --target dev -t pawn:dev .
# docker build --target dev-rocm -t pawn:dev-rocm .
#
# IMPORTANT: Always attach a network volume. Set HF_TOKEN as a pod env var.
# ── Caddy: single static binary for reverse-proxying the dashboard ──
FROM python:3.12-slim AS caddy
# Version and digest travel together: bumping CADDY_VERSION without also
# updating CADDY_SHA256 makes the checksum step below fail the build.
ARG CADDY_VERSION=2.11.2
ARG CADDY_SHA256=6d07b9bda92ac46e3b874e90dabc33192eca7e64c4b36ea661f4fd7dd27a5129
# Download the linux/amd64 Caddy binary, verify it against the pinned
# SHA-256, and mark it executable. `set -e` aborts on the first failing
# command. The apt package lists are removed in the same layer.
RUN <<'FETCH_CADDY'
set -e
apt-get update
apt-get install -y --no-install-recommends curl ca-certificates
curl -fsSL "https://caddyserver.com/api/download?os=linux&arch=amd64&version=v${CADDY_VERSION}" -o /usr/local/bin/caddy
echo "${CADDY_SHA256} /usr/local/bin/caddy" | sha256sum -c
chmod +x /usr/local/bin/caddy
rm -rf /var/lib/apt/lists/*
FETCH_CADDY
# ── Builder: compile Rust engine wheel ───────────────────────────────
FROM python:3.12-slim AS builder
# Build prerequisites: a C toolchain and pkg-config for native crates, plus
# curl (with CA certificates) to fetch the rustup installer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    curl \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*
# Install Rust. The installer is saved to disk and run as a separate command
# rather than via `curl | sh`: the default /bin/sh has no pipefail, so a
# failed download would hand `sh` an empty script and this step would
# "succeed" without installing anything. The minimal profile (rustc, cargo,
# rust-std) is sufficient because, within this file, the stage is only used
# to produce the engine wheel.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs -o /tmp/rustup-init.sh \
    && sh /tmp/rustup-init.sh -y --profile minimal \
    && rm -f /tmp/rustup-init.sh
ENV PATH="/root/.cargo/bin:${PATH}"
RUN pip install --no-cache-dir maturin
# Dependency-download layer: only the Cargo manifest and lockfile are copied
# first, so this layer is reused until one of those two files changes.
WORKDIR /build/engine
COPY engine/Cargo.toml engine/Cargo.lock ./
# Create empty placeholders for the crate's expected source layout so Cargo
# can resolve the package, then download (not compile) the dependencies.
RUN mkdir -p src python/chess_engine && \
    touch src/lib.rs python/chess_engine/__init__.py && \
    cargo fetch
# Bring in the real sources (overwriting the placeholders) and build the
# release wheel. Later stages pick it up from target/wheels/.
COPY engine/pyproject.toml ./
COPY engine/src/ src/
COPY engine/python/ python/
RUN maturin build --release
# ═══════════════════════════════════════════════════════════════════════
# Shared deps base — everything before the GPU-specific uv sync
# ═══════════════════════════════════════════════════════════════════════
FROM python:3.12-slim AS deps-common
# OS packages shared by the CUDA and ROCm dependency stages. /run/sshd is
# created in the same layer as the openssh-server install.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        openssh-server \
        tini \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /run/sshd
# uv copies files into the venv (UV_LINK_MODE=copy) and keeps its cache
# under /tmp/uv-cache. Python output is unbuffered.
ENV UV_CACHE_DIR=/tmp/uv-cache \
    UV_LINK_MODE=copy \
    PYTHONUNBUFFERED=1
WORKDIR /opt/pawn
# Only the dependency manifest and lockfile are copied here; application
# source is added by the later runtime stages.
COPY pyproject.toml uv.lock ./
# Engine wheel produced by the builder stage, staged in /tmp for the
# GPU-specific stages to install.
COPY --from=builder /build/engine/target/wheels/*.whl /tmp/
# Third-party binaries go last: a new caddy or uv release then only
# invalidates these two layers, not the ones above.
COPY --from=caddy /usr/local/bin/caddy /usr/local/bin/caddy
COPY --from=ghcr.io/astral-sh/uv:0.10 /uv /uvx /bin/
# ═══════════════════════════════════════════════════════════════════════
# CUDA stages (--extra cu128)
# ═══════════════════════════════════════════════════════════════════════
# ── Deps (CUDA) ──────────────────────────────────────────────────────
FROM deps-common AS deps
# Create the venv, install the locked dependency set with the cu128 extra
# (no dev group, workspace packages skipped), then install the prebuilt
# engine wheel. The wheel and uv cache are deleted in the same layer.
RUN <<'SYNC_CUDA'
set -e
uv venv
uv sync --extra cu128 --no-dev --frozen --no-install-workspace
uv pip install /tmp/*.whl
rm -rf /tmp/*.whl ${UV_CACHE_DIR}
SYNC_CUDA
# ── Runtime (CUDA) ───────────────────────────────────────────────────
FROM deps AS runtime
# Application source on top of the prebuilt dependency layers.
COPY . .
RUN chmod +x deploy/entrypoint.sh
# Build metadata supplied via --build-arg and exposed to the running
# container as PAWN_GIT_TAG / PAWN_GIT_HASH. Both default to empty.
ARG GIT_TAG=""
ARG GIT_HASH=""
# The venv's bin directory is prepended to PATH, and the project root is
# importable via PYTHONPATH.
ENV PATH="/opt/pawn/.venv/bin:${PATH}" \
    PYTHONPATH=/opt/pawn \
    PAWN_GIT_TAG=${GIT_TAG} \
    PAWN_GIT_HASH=${GIT_HASH}
EXPOSE 8888
# tini runs as PID 1 and launches the entrypoint script given by CMD.
ENTRYPOINT ["tini", "--"]
CMD ["/opt/pawn/deploy/entrypoint.sh"]
# ═══════════════════════════════════════════════════════════════════════
# ROCm stages (--extra rocm)
# Same python:3.12-slim base — the ROCm torch wheel (~2.8 GB) bundles
# HIP, rocBLAS, MIOpen, etc. inside the wheel itself.
# ═══════════════════════════════════════════════════════════════════════
# ── Deps (ROCm) ──────────────────────────────────────────────────────
FROM deps-common AS deps-rocm
# ROCm counterpart of the CUDA deps stage: same steps, but the locked
# dependency set is installed with the rocm extra. The wheel and uv cache
# are deleted in the same layer.
RUN <<'SYNC_ROCM'
set -e
uv venv
uv sync --extra rocm --no-dev --frozen --no-install-workspace
uv pip install /tmp/*.whl
rm -rf /tmp/*.whl ${UV_CACHE_DIR}
SYNC_ROCM
# ── Runtime (ROCm) ───────────────────────────────────────────────────
FROM deps-rocm AS runtime-rocm
# Application source on top of the prebuilt ROCm dependency layers.
COPY . .
RUN chmod +x deploy/entrypoint.sh
# Build metadata supplied via --build-arg and exposed to the running
# container as PAWN_GIT_TAG / PAWN_GIT_HASH. Both default to empty.
ARG GIT_TAG=""
ARG GIT_HASH=""
# The venv's bin directory is prepended to PATH, and the project root is
# importable via PYTHONPATH.
ENV PATH="/opt/pawn/.venv/bin:${PATH}" \
    PYTHONPATH=/opt/pawn \
    PAWN_GIT_TAG=${GIT_TAG} \
    PAWN_GIT_HASH=${GIT_HASH}
EXPOSE 8888
# tini runs as PID 1 and launches the entrypoint script given by CMD.
ENTRYPOINT ["tini", "--"]
CMD ["/opt/pawn/deploy/entrypoint.sh"]
# ═══════════════════════════════════════════════════════════════════════
# Shared dev base — dev tools, non-root user, Claude Code, tmux
# ═══════════════════════════════════════════════════════════════════════
FROM python:3.12-slim AS dev-common
# Development tooling on top of the slim base: compiler toolchain, SSH
# server, tini, and everyday CLI utilities. /run/sshd is created in the
# same layer as the openssh-server install.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        jq \
        openssh-server \
        ripgrep \
        tini \
        tmux \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /run/sshd
# Same uv and Python settings as the deps-common stage.
ENV UV_CACHE_DIR=/tmp/uv-cache \
    UV_LINK_MODE=copy \
    PYTHONUNBUFFERED=1
# Developer-friendly tmux defaults, written system-wide to /etc/tmux.conf:
# mouse support, a 50000-line scrollback, the tmux-256color terminal type,
# 1-based window and pane numbering with automatic window renumbering, and
# clipboard integration. The quoted 'TMUX' delimiter stops the shell from
# expanding anything inside the heredoc, so the lines land verbatim.
RUN cat <<'TMUX' > /etc/tmux.conf
set -g mouse on
set -g history-limit 50000
set -g default-terminal "tmux-256color"
set -g base-index 1
setw -g pane-base-index 1
set -g renumber-windows on
set -g set-clipboard on
TMUX
# Unprivileged "pawn" account (home directory created, bash as login shell)
# that also owns the /opt/pawn project root.
RUN useradd --create-home --shell /bin/bash pawn \
    && mkdir -p /opt/pawn \
    && chown pawn:pawn /opt/pawn
# Normalize $HOME against /etc/passwd at login-shell start.
#
# Why: some container runtimes (vast.ai) bake HOME=/root into
# /etc/environment based on the container process env. That value then leaks
# into every session via PAM's common-session + pam_env.so, clobbering the
# HOME that `su -` just reset for the target user. The result is broken
# ~-relative paths for the pawn user and an invisible pre-seeded HF token /
# claude config.
#
# How: the script below is installed (mode 755) into /etc/profile.d, which
# runs as part of /etc/profile, before bash looks for ~/.bash_profile, so
# the dotfile search is repointed in time. It looks up the current user name
# with `id -un`, reads that user's home directory (field 6 of the
# `getent passwd` entry) and, only if it differs from $HOME, resets and
# exports HOME and tries to cd there. The temporaries _u/_h are unset at the
# end. It is a no-op on hosts where $HOME is already correct.
COPY --chmod=755 <<'FIX_HOME' /etc/profile.d/10-fix-home.sh
#!/bin/sh
_u=$(id -un 2>/dev/null) || _u=""
if [ -n "$_u" ]; then
_h=$(getent passwd "$_u" 2>/dev/null | cut -d: -f6)
if [ -n "$_h" ] && [ "$HOME" != "$_h" ]; then
HOME="$_h"
export HOME
cd "$HOME" 2>/dev/null || true
fi
fi
unset _u _h
FIX_HOME
# Install Claude Code and the Rust toolchain (for building the chess engine)
# as the pawn user, so both land under /home/pawn.
#
# BuildKit auto-updates $HOME based on the current USER's passwd entry at
# each RUN, so USER pawn gives HOME=/home/pawn without an explicit ENV.
# Do NOT set `ENV HOME=...` here: an explicit value becomes sticky and
# would propagate into child stages (dev, dev-rocm), breaking their
# USER pawn / uv sync step with "cannot create /root/.rustup: permission
# denied".
USER pawn
WORKDIR /home/pawn
# Claude Code: the installer is piped into bash, so the binary is checked at
# both known install locations afterwards. If neither exists the build
# fails, after listing any file named "claude" under /home/pawn.
RUN curl -fsSL https://claude.ai/install.sh | bash && \
    { test -x /home/pawn/.local/bin/claude \
    || test -x /home/pawn/.claude/local/claude \
    || { echo "claude install failed — binary not found" >&2; \
    find /home/pawn -name claude 2>/dev/null; exit 1; }; }
# Rust: the installer is saved to disk and run as a separate command instead
# of `curl | sh`. The default /bin/sh has no pipefail, so a failed download
# would hand `sh` an empty script and this step would pass without a
# toolchain. The final check fails the build here rather than at the later
# `uv sync` in the dev stages, which expect cargo in /home/pawn/.cargo/bin.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs -o /tmp/rustup-init.sh && \
    sh /tmp/rustup-init.sh -y && \
    rm -f /tmp/rustup-init.sh && \
    { test -x /home/pawn/.cargo/bin/cargo \
    || { echo "rustup install failed — cargo not found" >&2; exit 1; }; }
# Put claude on the system PATH for every user (root and pawn alike)
# instead of relying on .bashrc aliases, which non-interactive shells never
# load. The first executable candidate is symlinked into /usr/local/bin, and
# the build fails if no usable link results.
USER root
RUN <<'LINK_CLAUDE'
set -e
for candidate in /home/pawn/.local/bin/claude /home/pawn/.claude/local/claude; do
    if [ -x "$candidate" ]; then
        ln -sf "$candidate" /usr/local/bin/claude
        break
    fi
done
test -x /usr/local/bin/claude
LINK_CLAUDE
# Convenience script, installed as /usr/local/bin/claude-dev (mode 755):
# switches to the pawn user with a login shell (`su - pawn`) and attaches to
# a tmux session named "claude". If the session does not exist yet it is
# created detached with /opt/pawn as its start directory, and
# `claude --dangerously-skip-permissions` is typed into it before attaching.
# $SESSION sits inside the double-quoted -c string, so the outer bash
# expands it before su runs. The quoted 'CLAUDE_DEV' delimiter keeps the
# script text verbatim at build time.
COPY --chmod=755 <<'CLAUDE_DEV' /usr/local/bin/claude-dev
#!/usr/bin/env bash
set -euo pipefail
SESSION="claude"
exec su - pawn -c "
if tmux has-session -t $SESSION 2>/dev/null; then
exec tmux attach -t $SESSION
fi
tmux new-session -d -s $SESSION -c /opt/pawn
tmux send-keys -t $SESSION 'cd /opt/pawn && claude --dangerously-skip-permissions' Enter
exec tmux attach -t $SESSION
"
CLAUDE_DEV
# Third-party binaries are copied at the very end of the stage: they do not
# depend on any layer above, so a new uv or caddy release only invalidates
# these two layers.
COPY --from=ghcr.io/astral-sh/uv:0.10 /uv /uvx /bin/
COPY --from=caddy /usr/local/bin/caddy /usr/local/bin/caddy
# ═══════════════════════════════════════════════════════════════════════
# Dev images — GPU deps + source code on top of dev-common
# Built independently from runtime/deps so every file in /opt/pawn
# enters via COPY --chown=pawn:pawn, avoiding a slow chown -R layer
# that would duplicate the multi-GB venv.
# ═══════════════════════════════════════════════════════════════════════
# ── Dev (CUDA) ───────────────────────────────────────────────────────
FROM dev-common AS dev
# Import the prebuilt CUDA venv and manifests from the deps stage, already
# owned by pawn, so no separate recursive chown layer is needed.
COPY --from=deps --chown=pawn:pawn /opt/pawn /opt/pawn
USER pawn
WORKDIR /opt/pawn
COPY --chown=pawn:pawn . .
# Sync the full environment (dev group and workspace included) as pawn, with
# the user's cargo on PATH for this step only. Per the original note, this
# builds the engine now so the first `uv run` does not trigger a rebuild.
RUN export PATH="/home/pawn/.cargo/bin:${PATH}" \
    && uv sync --extra cu128 --frozen
# Build metadata supplied via --build-arg and exposed to the running
# container as PAWN_GIT_TAG / PAWN_GIT_HASH. Both default to empty.
ARG GIT_TAG=""
ARG GIT_HASH=""
# Runtime PATH gains pawn's cargo bin, pawn's ~/.local/bin, and the
# project venv, in that order.
ENV PATH="/home/pawn/.cargo/bin:/home/pawn/.local/bin:/opt/pawn/.venv/bin:${PATH}" \
    PYTHONPATH=/opt/pawn \
    PAWN_GIT_TAG=${GIT_TAG} \
    PAWN_GIT_HASH=${GIT_HASH}
# The stage ends as root, so the entrypoint starts as root.
# NOTE(review): presumably entrypoint-dev.sh needs root (e.g. for sshd) —
# confirm.
USER root
RUN chmod +x /opt/pawn/deploy/entrypoint.sh /opt/pawn/deploy/entrypoint-dev.sh
EXPOSE 8888
# tini runs as PID 1 and launches the dev entrypoint given by CMD.
ENTRYPOINT ["tini", "--"]
CMD ["/opt/pawn/deploy/entrypoint-dev.sh"]
# ── Dev (ROCm) ───────────────────────────────────────────────────────
FROM dev-common AS dev-rocm
# Import the prebuilt ROCm venv and manifests from the deps-rocm stage,
# already owned by pawn, so no separate recursive chown layer is needed.
COPY --from=deps-rocm --chown=pawn:pawn /opt/pawn /opt/pawn
USER pawn
WORKDIR /opt/pawn
COPY --chown=pawn:pawn . .
# Sync the full environment (dev group and workspace included) as pawn, with
# the user's cargo on PATH for this step only. Per the original note, this
# builds the engine now so the first `uv run` does not trigger a rebuild.
RUN export PATH="/home/pawn/.cargo/bin:${PATH}" \
    && uv sync --extra rocm --frozen
# Build metadata supplied via --build-arg and exposed to the running
# container as PAWN_GIT_TAG / PAWN_GIT_HASH. Both default to empty.
ARG GIT_TAG=""
ARG GIT_HASH=""
# Runtime PATH gains pawn's cargo bin, pawn's ~/.local/bin, and the
# project venv, in that order.
ENV PATH="/home/pawn/.cargo/bin:/home/pawn/.local/bin:/opt/pawn/.venv/bin:${PATH}" \
    PYTHONPATH=/opt/pawn \
    PAWN_GIT_TAG=${GIT_TAG} \
    PAWN_GIT_HASH=${GIT_HASH}
# The stage ends as root, so the entrypoint starts as root.
# NOTE(review): presumably entrypoint-dev.sh needs root (e.g. for sshd) —
# confirm.
USER root
RUN chmod +x /opt/pawn/deploy/entrypoint.sh /opt/pawn/deploy/entrypoint-dev.sh
EXPOSE 8888
# tini runs as PID 1 and launches the dev entrypoint given by CMD.
ENTRYPOINT ["tini", "--"]
CMD ["/opt/pawn/deploy/entrypoint-dev.sh"]