Skip to content

Commit 0d2808e

Browse files
authored
Merge pull request #39 from NVIDIA/update-base-image
Replace base image with CUDA image
2 parents 3630482 + e9d7d1a commit 0d2808e

File tree

3 files changed

+60
-28
lines changed

3 files changed

+60
-28
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ After building, verify the image is accessible in Docker:
120120

121121
```bash
122122
$ docker images | grep nvidia/bobber
123-
nvidia/bobber 6.3.0 c697a75ee482 36 minutes ago 12.4GB
123+
nvidia/bobber 6.3.0 8e545fee7a4d 10 minutes ago 5.23GB
124124
```
125125

126126
## Save container
@@ -153,7 +153,7 @@ scp -r nvidia_bobber_{version}.tar user@test-machine-3:~/bobber
153153

154154
Do this for each host you intend to include in the test. A bash `for` loop
155155
can be used to iterate over all systems - you could also target the high
156-
performance network to speed up the copy further (this is a 10+ GB copy). Like
156+
performance network to speed up the copy further (this is a 5+ GB copy). Like
157157
so:
158158

159159
```bash
@@ -170,7 +170,7 @@ On all other nodes, load the copied Docker image.
170170
```bash
171171
$ docker load < nvidia_bobber_{version}.tar
172172
$ docker images | grep bobber
173-
nvidia/bobber 6.3.0 c697a75ee482 36 minutes ago 12.4GB
173+
nvidia/bobber 6.3.0 8e545fee7a4d 10 minutes ago 5.23GB
174174
```
175175

176176
## Ensure shared filesystem is mounted, if necessary
@@ -219,7 +219,7 @@ To verify the container is running, use `docker ps`:
219219
```bash
220220
$ docker ps
221221
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
222-
317b6cf928f8 c697a75ee482 "/usr/local/bin/nvid…" 30 hours ago Up 30 hours bobber
222+
317b6cf928f8 8e545fee7a4d "/usr/local/bin/nvid…" 30 hours ago Up 30 hours bobber
223223
```
224224

225225
## Create log dir on primary test system

bobber/lib/analysis/fio.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def fio_iops_results(log_contents: str, systems: int, string_to_match: str,
155155
return []
156156
for result in match:
157157
iops = re.findall(r'[-+]?\d*\.\d+[kMG]|\d+[kMG]|\d+', result)
158-
if len(iops) != 5:
158+
if len(iops) not in [5, 6]:
159159
raise ValueError('IOPS cannot be parsed from FIO log!')
160160
iops = clean_iops(iops[0])
161161
final_iops.append(iops)

bobber/lib/docker/Dockerfile

Lines changed: 55 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
# SPDX-License-Identifier: MIT
2-
FROM nvcr.io/nvidia/tensorflow:20.11-tf2-py3
2+
# Larger base stage with required items for building various tools
3+
FROM nvcr.io/nvidia/cuda:11.2.0-devel-ubuntu20.04 as build
34

45
ENV DEBIAN_FRONTEND=noninteractive
56

7+
# Install all required build dependencies
68
RUN apt-get update && apt-get -y install apt-utils && rm -rf /var/lib/apt/lists/*
79
RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
8-
openssh-client \
9-
openssh-server \
1010
swig \
1111
bison \
12-
libgfortran3 \
12+
gcc \
13+
libgfortran4 \
1314
pkg-config \
1415
autotools-dev \
1516
debhelper \
@@ -42,8 +43,46 @@ RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-
4243
kmod \
4344
libnuma1 \
4445
lsof \
46+
libopenmpi-dev && \
47+
rm -rf /var/lib/apt/lists/*
48+
49+
# Compile NVIDIA's NCCL tests
50+
RUN git clone https://github.com/NVIDIA/nccl-tests && \
51+
cd nccl-tests/ && \
52+
git reset --hard ec1b5e22e618d342698fda659efdd5918da6bd9f && \
53+
make MPI=1 MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi
54+
55+
# Compile OSU microbenchmarks
56+
RUN wget --no-check-certificate https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.6.2.tar.gz && \
57+
tar zxf osu-micro-benchmarks-5.6.2.tar.gz && \
58+
cd osu-micro-benchmarks-5.6.2 && \
59+
./configure CC=/usr/bin/mpicc CXX=/usr/bin/mpicxx --enable-cuda --with-cuda-include=/usr/local/cuda/include --with-cuda-libpath=/usr/local/cuda/lib64 && \
60+
make && \
61+
make install && \
62+
rm -rf ../*.tar.gz
63+
64+
# Build IO500, IOR, and mdtest
65+
RUN git clone https://github.com/jyvet/io-500-dev && \
66+
cd io-500-dev && \
67+
git reset --hard 0232acfa8e64f7c543db8930dd279009ec9c32bc && \
68+
utilities/prepare.sh
69+
70+
# Lighter runtime stage copying only necessary build artifacts from earlier
71+
FROM nvcr.io/nvidia/cuda:11.2.0-runtime-ubuntu20.04
72+
73+
ENV DEBIAN_FRONTEND=noninteractive
74+
75+
RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
76+
openssh-client \
77+
openssh-server \
78+
git \
4579
fio \
4680
psmisc \
81+
libopenmpi-dev \
82+
openmpi-bin \
83+
python \
84+
python3-dev \
85+
python3-pip \
4786
python3-distutils && \
4887
rm -rf /var/lib/apt/lists/*
4988

@@ -65,32 +104,25 @@ RUN mkdir -p /var/run/sshd && \
65104

66105
WORKDIR /
67106

68-
RUN git clone https://github.com/NVIDIA/nccl-tests && \
69-
cd nccl-tests/ && \
70-
git reset --hard ec1b5e22e618d342698fda659efdd5918da6bd9f && \
71-
make MPI=1 MPI_HOME=/usr/local/mpi
107+
# Copy the compiled nccl-tests binaries to the runtime image
108+
COPY --from=build /nccl-tests/build /nccl-tests/build
72109

73-
RUN wget --no-check-certificate https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.6.2.tar.gz && \
74-
tar zxf osu-micro-benchmarks-5.6.2.tar.gz && \
75-
cd osu-micro-benchmarks-5.6.2 && \
76-
./configure CC=/usr/local/mpi/bin/mpicc CXX=/usr/local/mpi/bin/mpicxx --enable-cuda --with-cuda-include=/usr/local/cuda/include --with-cuda-libpath=/usr/local/cuda/lib64 && \
77-
make && \
78-
make install && \
79-
rm -rf ../*.tar.gz
80-
81-
RUN python3 -m pip install nvidia-pyindex && \
82-
python3 -m pip install \
83-
nvidia-imageinary['mxnet']>=1.1.2
110+
# Copy the compiled OSU microbenchmarks to the runtime image
111+
COPY --from=build /usr/local/libexec/osu-micro-benchmarks/mpi/collective/ /usr/local/libexec/osu-micro-benchmarks/mpi/collective/
84112

85-
RUN git clone https://github.com/jyvet/io-500-dev && \
86-
cd io-500-dev && \
87-
git reset --hard 0232acfa8e64f7c543db8930dd279009ec9c32bc && \
88-
utilities/prepare.sh
113+
# Copy the compiled IO500 binaries to the runtime image
114+
COPY --from=build /io-500-dev/bin /io-500-dev/bin
89115

90116
RUN git clone https://github.com/NVIDIA/DALI dali && \
91117
cd dali/ && \
92118
git reset --hard fd30786d773d08185d78988b2903dce2ace0a00b
93119

120+
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools && \
121+
python3 -m pip install --no-cache-dir nvidia-pyindex && \
122+
python3 -m pip install --no-cache-dir \
123+
nvidia-imageinary['tfrecord']>=1.1.2 \
124+
nvidia-dali-cuda110
125+
94126
COPY test_scripts /tests/
95127

96128
EXPOSE 2222

0 commit comments

Comments
 (0)