Skip to content

Commit 066649f

Browse files
committed
Multi GPU benchmark based on NGC-Tensorflow 19.05 py27
1 parent 69f1624 commit 066649f

File tree

381 files changed

+48519
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

381 files changed

+48519
-0
lines changed

vendor/benchmark/Dockerfile

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Base image; override at build time with --build-arg FROM_IMAGE_NAME=<image>.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.05-py3
16+
FROM ${FROM_IMAGE_NAME}
17+
18+
# Packages required by the system configuration logger (apt lists are purged in the same layer)
19+
RUN apt-get update && apt-get install -y --no-install-recommends \
20+
infiniband-diags \
21+
pciutils \
22+
python2.7 \
23+
&& rm -rf /var/lib/apt/lists/*
24+
25+
# Install gsutil (Google Cloud SDK). The installer is saved to a file and run from there so that a failed download aborts the build instead of being masked by a `curl | bash` pipe.
26+
ENV CLOUDSDK_PYTHON=/usr/bin/python2.7
27+
RUN curl -fsSL https://sdk.cloud.google.com -o /tmp/install_gcloud.sh && bash /tmp/install_gcloud.sh --disable-prompts && rm -f /tmp/install_gcloud.sh
28+
ENV PATH=$PATH:/root/google-cloud-sdk/bin
29+
30+
# Install Python dependencies.
# requirements.txt is copied on its own, ahead of the full source tree, so this
# layer can stay cached when only other source files change.
31+
WORKDIR /opt/reinforcement
32+
33+
COPY minigo/requirements.txt requirements.txt
34+
# First the `compliance` subdirectory of mlperf/training pinned to a commit archive, then the MiniGo requirements.
RUN pip install --no-cache-dir https://github.com/mlperf/training/archive/6289993e1e9f0f5c4534336df83ff199bd0cdb75.zip#subdirectory=compliance \
35+
&& pip install --no-cache-dir -r requirements.txt
36+
37+
# Copy MiniGo code and build
38+
COPY . .
39+
40+
# BOARD_SIZE is forwarded to bazel below as --define=board_size.
ENV BOARD_SIZE 9
41+
42+
# Build steps, in order: copy minigo/cc into /opt/tensorflow; copy the installed
# TensorFlow library directory, the TensorFlow Python-package headers and
# /usr/local/mpi under /opt/tensorflow/cc; run nvbuild.sh with --configonly;
# build the selfplay and eval targets (plain and MPI) with bazel; copy bazel-bin
# back into /opt/reinforcement/minigo.
# NOTE(review): the libcuda.so.1 symlink and the LD_LIBRARY_PATH export both
# point at the CUDA stubs directory, presumably so the build can link without a
# GPU driver present - confirm.
RUN cp -r /opt/reinforcement/minigo/cc /opt/tensorflow && \
43+
cp -r /usr/local/lib/tensorflow /opt/tensorflow/cc && \
44+
cp -r /usr/local/lib/python3.5/dist-packages/tensorflow/include/* /opt/tensorflow/cc/tensorflow && \
45+
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs && \
46+
ln -fs /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
47+
cd /opt/tensorflow && cp -r /usr/local/mpi cc/ && ./nvbuild.sh --python3.5 --configonly && \
48+
bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" --define=gpu=1 --define=trt=1 --define=tf=1 --define=board_size=${BOARD_SIZE} cc:selfplay cc:eval cc:selfplay_mpi cc:eval_mpi && \
49+
cp -r bazel-bin /opt/reinforcement/minigo
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# NVIDIA Tensorflow runs on Python 3.5
2+
# An ARG is only usable in FROM when declared before the first FROM, so the base image is selected here once (override with --build-arg FROM_IMAGE_NAME=<image>).
3+
4+
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.05-py3
5+
FROM ${FROM_IMAGE_NAME}
6+
7+
# Runtime environment: unbuffered Python output, CUDA/NVIDIA library and binary
# directories placed ahead of the inherited LD_LIBRARY_PATH and PATH, UTF-8 locale.
ENV PYTHONUNBUFFERED=1 \
8+
LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/usr/local/cuda/lib64:/usr/local/nvidia/lib64:/usr/local/lib:$LD_LIBRARY_PATH \
9+
PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH \
10+
LANG=C.UTF-8
11+
12+
# Packages required by the system configuration logger (apt lists are purged in the same layer)
13+
RUN apt-get update && apt-get install -y --no-install-recommends \
14+
infiniband-diags \
15+
pciutils \
16+
python2.7 \
17+
&& rm -rf /var/lib/apt/lists/*
18+
19+
# Install gsutil (Google Cloud SDK). The installer is saved to a file and run from there so that a failed download aborts the build instead of being masked by a `curl | bash` pipe.
20+
ENV CLOUDSDK_PYTHON=/usr/bin/python2.7
21+
RUN curl -fsSL https://sdk.cloud.google.com -o /tmp/install_gcloud.sh && bash /tmp/install_gcloud.sh --disable-prompts && rm -f /tmp/install_gcloud.sh
22+
ENV PATH=$PATH:/root/google-cloud-sdk/bin
23+
24+
# Install Python dependencies.
# requirements.txt is copied on its own, ahead of the full source tree, so this
# layer can stay cached when only other source files change.
25+
WORKDIR /opt/reinforcement
26+
27+
COPY minigo/requirements.txt requirements.txt
28+
# First the `compliance` subdirectory of mlperf/training pinned to a commit archive, then the MiniGo requirements.
RUN pip install --no-cache-dir https://github.com/mlperf/training/archive/6289993e1e9f0f5c4534336df83ff199bd0cdb75.zip#subdirectory=compliance \
29+
&& pip install --no-cache-dir -r requirements.txt
30+
31+
# Copy MiniGo code and build
32+
COPY . .
33+
34+
# BOARD_SIZE is forwarded to bazel below as --define=board_size.
ENV BOARD_SIZE 9
35+
36+
# Build steps, in order: copy minigo/cc into /opt/tensorflow; copy the installed
# TensorFlow library directory, the TensorFlow Python-package headers and
# /usr/local/mpi under /opt/tensorflow/cc; run nvbuild.sh with --configonly;
# build the selfplay and eval targets (plain and MPI) with bazel; copy bazel-bin
# back into /opt/reinforcement/minigo.
# NOTE(review): the libcuda.so.1 symlink and the LD_LIBRARY_PATH export both
# point at the CUDA stubs directory, presumably so the build can link without a
# GPU driver present - confirm.
RUN cp -r /opt/reinforcement/minigo/cc /opt/tensorflow && \
37+
cp -r /usr/local/lib/tensorflow /opt/tensorflow/cc && \
38+
cp -r /usr/local/lib/python3.5/dist-packages/tensorflow/include/* /opt/tensorflow/cc/tensorflow && \
39+
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs && \
40+
ln -fs /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
41+
cd /opt/tensorflow && cp -r /usr/local/mpi cc/ && ./nvbuild.sh --python3.5 --configonly && \
42+
bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" --define=gpu=1 --define=trt=1 --define=tf=1 --define=board_size=${BOARD_SIZE} cc:selfplay cc:eval cc:selfplay_mpi cc:eval_mpi && \
43+
cp -r bazel-bin /opt/reinforcement/minigo
44+
45+
# Prepend the standard system binary directories to the inherited PATH (entries are ':'-separated).
ENV PYTHONUNBUFFERED=1 \
46+
PATH=/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH \
47+
LANG=C.UTF-8
48+
49+
# Upgrade pip/setuptools, then install the interactive tooling; every call uses --no-cache-dir so pip's download cache is not stored in the layer.
RUN /usr/bin/python3 -m pip install --no-cache-dir -U pip setuptools && \
50+
/usr/bin/python3 -m pip install --no-cache-dir ipython && \
51+
/usr/bin/python3 -m pip install --no-cache-dir pillow && \
52+
/usr/bin/python3 -m pip install --no-cache-dir h5py && \
53+
/usr/bin/python3 -m pip install --no-cache-dir jupyter
54+
55+
# Install ipython kernelspec under a custom display name. The trailing `cat`
# prints the generated kernel.json into the build log and fails the build if
# the file is not at the expected path.
56+
RUN /usr/bin/python3 -m ipykernel install --display-name "benchmark MiniGo worked by NGC-Tensorflow 19.05 on Backend.AI" && \
57+
cat /usr/local/share/jupyter/kernels/python3/kernel.json
58+
59+
# Backend.AI integration: jail policy file plus image labels (all label values quoted uniformly)
60+
COPY policy.yml /etc/backend.ai/jail/policy.yml
61+
LABEL ai.backend.kernelspec="1" \
62+
ai.backend.envs.corecount="OPENBLAS_NUM_THREADS,OMP_NUM_THREADS,NPROC" \
63+
ai.backend.features="batch uid-match" \
64+
ai.backend.accelerators="cuda" \
65+
ai.backend.resource.min.cpu="1" \
66+
ai.backend.resource.min.mem="1g" \
67+
ai.backend.resource.min.cuda.device="1" \
68+
ai.backend.resource.min.cuda.shares="0.1" \
69+
ai.backend.base-distro="ubuntu16.04" \
70+
ai.backend.runtime-type="python" \
71+
ai.backend.runtime-path="/usr/bin/python3" \
72+
ai.backend.service-ports="ipython:pty:3000,tensorboard:http:6006,jupyter:http:8080"
73+
74+
# vim: ft=dockerfile
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# NVIDIA Tensorflow runs on Python 3.5
2+
# An ARG is only usable in FROM when declared before the first FROM, so the base image is selected here once (override with --build-arg FROM_IMAGE_NAME=<image>).
3+
4+
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.05-py3
5+
FROM ${FROM_IMAGE_NAME}
6+
7+
# Packages required by the system configuration logger (apt lists are purged in the same layer)
8+
RUN apt-get update && apt-get install -y --no-install-recommends \
9+
infiniband-diags \
10+
pciutils \
11+
python2.7 \
12+
&& rm -rf /var/lib/apt/lists/*
13+
14+
# Install gsutil (Google Cloud SDK). The installer is saved to a file and run from there so that a failed download aborts the build instead of being masked by a `curl | bash` pipe.
15+
ENV CLOUDSDK_PYTHON=/usr/bin/python2.7
16+
RUN curl -fsSL https://sdk.cloud.google.com -o /tmp/install_gcloud.sh && bash /tmp/install_gcloud.sh --disable-prompts && rm -f /tmp/install_gcloud.sh
17+
ENV PATH=$PATH:/root/google-cloud-sdk/bin
18+
19+
# Install Python dependencies.
# requirements.txt is copied on its own, ahead of the full source tree, so this
# layer can stay cached when only other source files change.
20+
WORKDIR /opt/reinforcement
21+
22+
COPY minigo/requirements.txt requirements.txt
23+
# First the `compliance` subdirectory of mlperf/training pinned to a commit archive, then the MiniGo requirements.
RUN pip install --no-cache-dir https://github.com/mlperf/training/archive/6289993e1e9f0f5c4534336df83ff199bd0cdb75.zip#subdirectory=compliance \
24+
&& pip install --no-cache-dir -r requirements.txt
25+
26+
# Copy MiniGo code and build
27+
COPY . .
28+
29+
# BOARD_SIZE is forwarded to bazel below as --define=board_size.
ENV BOARD_SIZE 9
30+
31+
# Build steps, in order: copy minigo/cc into /opt/tensorflow; copy the installed
# TensorFlow library directory, the TensorFlow Python-package headers and
# /usr/local/mpi under /opt/tensorflow/cc; run nvbuild.sh with --configonly;
# build the selfplay and eval targets (plain and MPI) with bazel; copy bazel-bin
# back into /opt/reinforcement/minigo.
# NOTE(review): the libcuda.so.1 symlink and the LD_LIBRARY_PATH export both
# point at the CUDA stubs directory, presumably so the build can link without a
# GPU driver present - confirm.
RUN cp -r /opt/reinforcement/minigo/cc /opt/tensorflow && \
32+
cp -r /usr/local/lib/tensorflow /opt/tensorflow/cc && \
33+
cp -r /usr/local/lib/python3.5/dist-packages/tensorflow/include/* /opt/tensorflow/cc/tensorflow && \
34+
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs && \
35+
ln -fs /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
36+
cd /opt/tensorflow && cp -r /usr/local/mpi cc/ && ./nvbuild.sh --python3.5 --configonly && \
37+
bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" --define=gpu=1 --define=trt=1 --define=tf=1 --define=board_size=${BOARD_SIZE} cc:selfplay cc:eval cc:selfplay_mpi cc:eval_mpi && \
38+
cp -r bazel-bin /opt/reinforcement/minigo
39+
40+
# Prepend the standard system binary directories to the inherited PATH (entries are ':'-separated).
ENV PYTHONUNBUFFERED=1 \
41+
PATH=/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH \
42+
LANG=C.UTF-8
43+
44+
# Upgrade pip/setuptools, then install the interactive tooling; every call uses --no-cache-dir so pip's download cache is not stored in the layer.
RUN /usr/bin/python3 -m pip install --no-cache-dir -U pip setuptools && \
45+
/usr/bin/python3 -m pip install --no-cache-dir ipython && \
46+
/usr/bin/python3 -m pip install --no-cache-dir pillow && \
47+
/usr/bin/python3 -m pip install --no-cache-dir h5py && \
48+
/usr/bin/python3 -m pip install --no-cache-dir jupyter
49+
50+
# Install ipython kernelspec under a custom display name. The trailing `cat`
# prints the generated kernel.json into the build log and fails the build if
# the file is not at the expected path.
51+
RUN /usr/bin/python3 -m ipykernel install --display-name "benchmark MiniGo worked by NGC-Tensorflow 19.05 on Backend.AI" && \
52+
cat /usr/local/share/jupyter/kernels/python3/kernel.json
53+
54+
# Backend.AI integration: jail policy file plus image labels (all label values quoted uniformly)
55+
COPY policy.yml /etc/backend.ai/jail/policy.yml
56+
LABEL ai.backend.kernelspec="1" \
57+
ai.backend.envs.corecount="OPENBLAS_NUM_THREADS,OMP_NUM_THREADS,NPROC" \
58+
ai.backend.features="batch uid-match" \
59+
ai.backend.accelerators="cuda" \
60+
ai.backend.resource.min.cpu="1" \
61+
ai.backend.resource.min.mem="1g" \
62+
ai.backend.resource.min.cuda.device="1" \
63+
ai.backend.resource.min.cuda.shares="0.1" \
64+
ai.backend.base-distro="ubuntu16.04" \
65+
ai.backend.runtime-type="python" \
66+
ai.backend.runtime-path="/usr/bin/python3" \
67+
ai.backend.service-ports="ipython:pty:3000,tensorboard:http:6006,jupyter:http:8080"
68+
69+
# vim: ft=dockerfile

vendor/benchmark/README.md

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# 1. Problem
2+
3+
This task benchmarks reinforcement learning for the 9x9 version of the boardgame go.
4+
The model plays games against itself and uses these games to improve play.
5+
6+
## Requirements
7+
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
8+
* [TensorFlow 19.05-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
9+
10+
# 2. Directions
11+
## Steps to download and verify data
12+
13+
All training data is generated during the selfplay phase of the RL loop.
14+
15+
The only data to be downloaded are the starting checkpoint and the target model. These are downloaded automatically
16+
before the training starts.
17+
18+
## Steps to launch training
19+
20+
### NVIDIA DGX-1 (single node)
21+
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-1
22+
single node submission are in the `config_DGX1.sh` script.
23+
24+
Steps required to launch single node training on NVIDIA DGX-1:
25+
26+
```
27+
docker build --pull -t mlperf-nvidia:minigo .
28+
LOGDIR=<path/to/output/dir> CONT=mlperf-nvidia:minigo DGXSYSTEM=DGX1 ./run.sub
29+
```
30+
31+
### NVIDIA DGX-1 (multi node)
32+
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-1
33+
multi node submission are in the `config_DGX1_multi.sh` script.
34+
35+
Steps required to launch multi node training on NVIDIA DGX-1:
36+
37+
1. Build the docker container and push to a docker registry
38+
```
39+
docker build --pull -t <docker/registry>/mlperf-nvidia:minigo .
40+
docker push <docker/registry>/mlperf-nvidia:minigo
41+
```
42+
43+
2. Launch the training
44+
```
45+
source config_DGX1_multi.sh && CONT="<docker/registry>/mlperf-nvidia:minigo" LOGDIR=<path/to/output/dir> DGXSYSTEM=DGX1_multi sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
46+
```
47+
48+
# 3. Model
49+
### Publication/Attribution
50+
51+
This benchmark is based on a fork of the minigo project (https://github.com/tensorflow/minigo), which is inspired by the work done by Deepmind with ["Mastering the Game of Go with Deep Neural Networks and
52+
Tree Search"](https://www.nature.com/articles/nature16961), ["Mastering the Game of Go without Human
53+
Knowledge"](https://www.nature.com/articles/nature24270), and ["Mastering Chess and Shogi by
54+
Self-Play with a General Reinforcement Learning
55+
Algorithm"](https://arxiv.org/abs/1712.01815). Note that minigo is an
56+
independent effort from AlphaGo, and that this fork of minigo is independent from minigo itself.
57+
58+
59+
### Reinforcement Setup
60+
61+
This benchmark includes both the environment and training for 9x9 go. There are three primary phases performed in each iteration:
62+
63+
- Selfplay: the *current best* model plays games against itself to produce board positions for training.
64+
- Training: train the neural networks on selfplay data from several recent models.
65+
- Model Evaluation: the *current best* and the most recently trained model play a series of games to establish if the current model should replace the current best
66+
67+
Target evaluation is performed after completing the training (please see the Quality section below for more details).
68+
69+
### Structure
70+
71+
This task has a non-trivial network structure, including a search tree.
72+
A good overview of the structure can be found here: https://medium.com/applied-data-science/alphago-zero-explained-in-one-diagram-365f5abf67e0.
73+
74+
### Weight and bias initialization and Loss Function
75+
Network weights are initialized with a fixed checkpoint downloaded before the training starts. The loss function is described here:
76+
["Mastering the Game of Go with Deep Neural Networks and Tree Search"](https://www.nature.com/articles/nature16961)
77+
78+
### Optimizer
79+
We use a MomentumOptimizer to train the primary network.
80+
81+
# 4. Quality
82+
83+
### Quality metric
84+
Quality is measured by the number of games won out of 100 against a fixed target model.
85+
The target model is downloaded automatically before the training starts.
86+
87+
### Quality target
88+
The target is to win at least 50 out of 100 games against the target model.
89+
90+
### Evaluation frequency
91+
Evaluations are performed after completing the training and are not timed.
92+
Checkpoints from every RL loop iteration are evaluated.

vendor/benchmark/config_DGX1.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
2+
3+
## DL params
4+
EXTRA_PARAMS=( )
5+
6+
## System run params
7+
DGXNNODES=1
8+
# System name is derived from this file's own name: config_<SYSTEM>.sh -> <SYSTEM>. Expansions are quoted so a path containing whitespace still resolves.
DGXSYSTEM=$(basename "$(readlink -f "${BASH_SOURCE[0]}")" | sed 's/^config_//' | sed 's/\.sh$//' )
9+
WALLTIME=6:00:00
10+
11+
## System config params
12+
DGXNGPU=8
13+
DGXSOCKETCORES=20
14+
DGXNSOCKET=2
15+
DGXHT=2 # HT on is 2, HT off is 1
16+
DGXIBDEVICES=''

vendor/benchmark/config_DGX1_multi.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/bin/bash
2+
3+
## Environment variables for multi node runs
4+
export HOROVOD_CYCLE_TIME=0.1
5+
export HOROVOD_FUSION_THRESHOLD=67108864
6+
7+
## DL params
8+
EXTRA_PARAMS=( )
9+
10+
## System run params
11+
DGXNNODES=3
12+
# System name is derived from this file's own name: config_<SYSTEM>.sh -> <SYSTEM>. Expansions are quoted so a path containing whitespace still resolves.
DGXSYSTEM=$(basename "$(readlink -f "${BASH_SOURCE[0]}")" | sed 's/^config_//' | sed 's/\.sh$//' )
13+
WALLTIME=02:00:00
14+
15+
## System config params
16+
DGXNGPU=8
17+
DGXSOCKETCORES=20
18+
DGXNSOCKET=2
19+
DGXHT=2 # HT on is 2, HT off is 1
20+
DGXIBDEVICES='--device=/dev/infiniband --device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/ucm0 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/uverbs0 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1 --device=/dev/infiniband/issm0 --device=/dev/infiniband/umad0'

vendor/benchmark/minigo/.gitignore

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
lib
2+
lib64
3+
bin
4+
data
5+
*__pycache__
6+
pip-selfcheck.json
7+
*.pyc
8+
sgf
9+
pyvenv.cfg
10+
.DS_store
11+
logs/
12+
saved_models/
13+
14+
# Vim temp files
15+
*.swp
16+
*.swo
17+
*~
18+
19+
.mypy_cache
20+
21+
# Ignore any staging directory. We use this directory for docker-file creation.
22+
staging/
23+
24+
bazel-*
25+
cc/tensorflow/
26+
27+
minigui/static/*.js.map
28+
29+
cluster/cgos/cgosGtp*
30+
31+
.bazelrc

0 commit comments

Comments
 (0)