This repository was archived by the owner on Dec 9, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 21
Expand file tree
/
Copy pathDockerfile.perftest
More file actions
122 lines (105 loc) · 3.89 KB
/
Dockerfile.perftest
File metadata and controls
122 lines (105 loc) · 3.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Stage 1: Builder
# CUDA "devel" image: carries the toolkit headers and libraries needed to
# compile the tools built in this stage.
FROM nvidia/cuda:12.9.0-devel-ubuntu24.04 AS builder
# Keep apt-get from prompting during package installation.
ENV DEBIAN_FRONTEND=noninteractive
# Build toolchain plus the RDMA / netlink / PCI / NUMA / SSL development
# packages (listed alphabetically). The apt lists are removed in the same
# layer so they do not add to its size.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        build-essential \
        git \
        libibumad-dev \
        libibverbs-dev \
        libnl-3-dev \
        libnl-route-3-dev \
        libnuma-dev \
        libpci-dev \
        librdmacm-dev \
        libssl-dev \
        libtool \
        pkg-config \
        wget \
    && rm -rf /var/lib/apt/lists/*
# Prepend the CUDA toolkit library directory to both library search variables
# for the remainder of the builder stage. Each variable only references its
# own previous value, so setting both in one instruction is equivalent to
# setting them one after the other.
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}" \
    LIBRARY_PATH="/usr/local/cuda/lib64:${LIBRARY_PATH:-}"
# --- Build linux-rdma/perftest ---
# Shallow-clone the upstream repository, then configure it with CUDA runtime
# support and install it under /usr/local.
WORKDIR /usr/src
RUN git clone --depth 1 https://github.com/linux-rdma/perftest.git
WORKDIR /usr/src/perftest
RUN ./autogen.sh \
    && ./configure \
        --prefix=/usr/local \
        --enable-cudart \
        CUDA_H_PATH=/usr/local/cuda/include/cuda.h \
    && make -j"$(nproc)" \
    && make install
# --- Build Open MPI 4.1.8 with CUDA support ---
WORKDIR /usr/src
# Download the release tarball to a file and extract it as separate, &&-chained
# commands instead of piping wget into tar. In a pipeline the exit status is
# that of the last command (tar), so wget's own failure status would be
# discarded; chaining makes a failed download stop the build at the download.
# The tarball is removed in the same layer once it has been unpacked.
RUN wget -O openmpi-4.1.8.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.8.tar.gz && \
    tar xzf openmpi-4.1.8.tar.gz && \
    rm openmpi-4.1.8.tar.gz
WORKDIR /usr/src/openmpi-4.1.8
# Install into /opt/openmpi so the whole tree can be copied into the runtime
# stage as a single directory.
RUN ./configure --prefix=/opt/openmpi --with-cuda=/usr/local/cuda && \
    make -j"$(nproc)" && \
    make install
# --- Build NVIDIA/nccl-tests ---
# Shallow-clone the upstream repository and build it with MPI enabled against
# the Open MPI installation in /opt/openmpi.
WORKDIR /usr/src
RUN git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git
WORKDIR /usr/src/nccl-tests
RUN make MPI=1 MPI_HOME=/opt/openmpi -j"$(nproc)"
# Stage 2: Runtime
# CUDA "runtime" image: the final image that receives the artifacts built in
# the builder stage.
FROM nvidia/cuda:12.9.0-runtime-ubuntu24.04 AS runtime
# Build-time only: keeps apt-get non-interactive for the RUN steps of this
# stage. Declared as ARG rather than ENV so the setting is not baked into the
# environment of containers started from the final image.
ARG DEBIAN_FRONTEND=noninteractive
# Runtime packages (listed alphabetically): RDMA userspace libraries and
# utilities, network and PCI diagnostic tools, the OpenSSH client and server,
# and libcap2-bin, which provides the setcap command used later in this stage.
# The apt lists are removed in the same layer so they do not add to its size.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ibverbs-utils \
        inetutils-ping \
        iproute2 \
        libcap2-bin \
        libelf1 \
        libibverbs1 \
        libmlx5-1 \
        libmnl0 \
        libnl-3-200 \
        libnl-route-3-200 \
        libnuma1 \
        libpci3 \
        openssh-client \
        openssh-server \
        pciutils \
        rdma-core \
        rdmacm-utils && \
    rm -rf /var/lib/apt/lists/*
# Port number that later steps in this stage write into the OpenSSH client and
# server configuration files.
# NOTE(review): the comment originally here was cut off mid-sentence ("For that
# to work, we need"); the linked issue presumably gives the background for this
# setting - TODO confirm.
# https://github.com/kubeflow/mpi-operator/issues/580
ARG port=2222
# Create the privilege separation directory needed to run sshd as root, and
# grant the sshd binary CAP_NET_BIND_SERVICE so it can also be run as non-root.
RUN mkdir -p /var/run/sshd && \
    setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd
# System-wide OpenSSH adjustments:
#  - ssh_config: StrictHostKeyChecking is set to "no" so OpenSSH can talk to
#    other containers without asking for confirmation.
#  - ssh_config: UserKnownHostsFile is pointed at /dev/null. mpi-operator mounts
#    the .ssh folder from a Secret, so ssh must not need write permissions
#    there.
#  - ssh_config and sshd_config: Port is set to the `port` build argument.
#  - sshd_config: StrictModes is set to "no", which avoids the directory and
#    file permission checks.
RUN sed -i "s/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g" /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i "s/[ #]\(.*Port \).*/ \1$port/g" /etc/ssh/ssh_config && \
    sed -i "s/#\(StrictModes \).*/\1no/g" /etc/ssh/sshd_config && \
    sed -i "s/#\(Port \).*/\1$port/g" /etc/ssh/sshd_config
# Unprivileged user for MPI jobs; its home directory becomes the working
# directory of the image.
RUN useradd -m mpiuser
WORKDIR /home/mpiuser
# Per-user sshd configuration for running sshd as non-root.
# Previously PidFile/HostKey/StrictModes were written to sshd_config_custom
# while "Port" was appended to a different file, .sshd_config, so neither file
# held the complete configuration. The full configuration is now written to
# .sshd_config and copied to sshd_config_custom, so both paths keep existing
# and both carry every setting.
# NOTE(review): which of the two paths is passed to sshd is not visible in this
# file - confirm against the job manifest and drop the unused copy.
# `$port` is the `port` build argument declared earlier in this stage.
RUN mkdir -p /home/mpiuser/.ssh && \
    printf '%s\n' \
        "PidFile /home/mpiuser/sshd.pid" \
        "HostKey /home/mpiuser/.ssh/id_rsa" \
        "StrictModes no" \
        "Port $port" \
        > /home/mpiuser/.sshd_config && \
    cp /home/mpiuser/.sshd_config /home/mpiuser/sshd_config_custom
# Bring the build results over from the builder stage:
# everything installed under /usr/local/bin (perftest was configured with
# --prefix=/usr/local) ...
COPY --from=builder /usr/local/bin/ /usr/local/bin/
# ... the Open MPI installation tree ...
COPY --from=builder /opt/openmpi/ /opt/openmpi/
# ... and the nccl-tests *_perf binaries.
COPY --from=builder /usr/src/nccl-tests/build/*_perf /usr/local/bin/
# Runtime search paths: the Open MPI, NVIDIA and CUDA library directories are
# prepended to the library variables, and the tool directories to PATH. Each
# variable only references its own previous value, so one ENV instruction is
# equivalent to setting them separately.
ENV LD_LIBRARY_PATH="/opt/openmpi/lib/:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}" \
    LIBRARY_PATH="/usr/local/cuda/lib64:${LIBRARY_PATH:-}" \
    PATH="/usr/local/bin:/usr/local/nvidia/bin:/opt/openmpi/bin:${PATH}"
# Default to a bash shell when the container is started without a command.
CMD ["/bin/bash"]