lablup
diff --git a/‎vendor/benchmark/Dockerfile
Lines changed: 49 additions & 0 deletions b/‎vendor/benchmark/Dockerfile
Lines changed: 49 additions & 0 deletions
diff --git a/‎vendor/benchmark/Dockerfile.19.05-py3-minigo
Lines changed: 74 additions & 0 deletions b/‎vendor/benchmark/Dockerfile.19.05-py3-minigo
Lines changed: 74 additions & 0 deletions
diff --git a/‎vendor/benchmark/Dockerfile.19.05-py3-minigo~
Lines changed: 69 additions & 0 deletions b/‎vendor/benchmark/Dockerfile.19.05-py3-minigo~
Lines changed: 69 additions & 0 deletions
diff --git a/‎vendor/benchmark/README.md
Lines changed: 92 additions & 0 deletions b/‎vendor/benchmark/README.md
Lines changed: 92 additions & 0 deletions
diff --git a/‎vendor/benchmark/config_DGX1.sh
Lines changed: 16 additions & 0 deletions b/‎vendor/benchmark/config_DGX1.sh
Lines changed: 16 additions & 0 deletions
diff --git a/‎vendor/benchmark/config_DGX1_multi.sh
Lines changed: 20 additions & 0 deletions b/‎vendor/benchmark/config_DGX1_multi.sh
Lines changed: 20 additions & 0 deletions
diff --git a/‎vendor/benchmark/minigo/.gitignore
Lines changed: 31 additions & 0 deletions b/‎vendor/benchmark/minigo/.gitignore
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,49 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Base image; can be overridden at build time with --build-arg FROM_IMAGE_NAME=<image>.
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.05-py3
+FROM ${FROM_IMAGE_NAME}
+
+# System packages needed by the system configuration logger.
+# The apt package lists are removed in the same layer that created them.
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+        infiniband-diags \
+        pciutils \
+        python2.7 \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install gsutil (shipped with the Google Cloud SDK).
+# CLOUDSDK_PYTHON selects the python2.7 interpreter for the SDK tools.
+ENV CLOUDSDK_PYTHON=/usr/bin/python2.7
+# The installer is downloaded to a file rather than piped into bash: with a
+# pipe the RUN status is bash's alone, so a failed download would go unnoticed.
+# curl -f additionally turns HTTP errors into a non-zero exit status.
+RUN curl -fsSL https://sdk.cloud.google.com -o /tmp/install-google-cloud-sdk.sh && \
+    bash /tmp/install-google-cloud-sdk.sh --disable-prompts && \
+    rm -f /tmp/install-google-cloud-sdk.sh
+# Make the SDK command-line tools available on PATH.
+ENV PATH=$PATH:/root/google-cloud-sdk/bin
+
+# Python dependencies, installed from /opt/reinforcement:
+#   1. the compliance package from a pinned commit archive of mlperf/training
+#   2. the packages listed in MiniGo's requirements.txt
+WORKDIR /opt/reinforcement
+
+COPY minigo/requirements.txt ./requirements.txt
+RUN pip install --no-cache-dir \
+        https://github.com/mlperf/training/archive/6289993e1e9f0f5c4534336df83ff199bd0cdb75.zip#subdirectory=compliance && \
+    pip install --no-cache-dir \
+        -r requirements.txt
+
+# Copy MiniGo code and build
+# The whole build context is copied into the current WORKDIR.
+COPY . .
+
+# Board size; handed to bazel below as --define=board_size.
+ENV BOARD_SIZE 9
+
+# Build the MiniGo C++ binaries inside the /opt/tensorflow tree. In order:
+#   1. copy the MiniGo cc/ sources into /opt/tensorflow, then copy the
+#      TensorFlow library directory, the TensorFlow headers and /usr/local/mpi
+#      underneath cc/;
+#   2. append the CUDA stubs directory to LD_LIBRARY_PATH (for this RUN only)
+#      and symlink the stub libcuda.so as libcuda.so.1 -- presumably so the
+#      build can link without a GPU driver present; TODO confirm;
+#   3. run nvbuild.sh in --configonly mode, then bazel-build the selfplay and
+#      eval targets together with their MPI variants;
+#   4. copy bazel-bin back into /opt/reinforcement/minigo.
+# NOTE(review): the python3.5 paths and flag are hard-coded; they need to
+# match the Python version shipped in the base image.
+RUN cp -r /opt/reinforcement/minigo/cc /opt/tensorflow && \
+    cp -r /usr/local/lib/tensorflow /opt/tensorflow/cc && \
+    cp -r /usr/local/lib/python3.5/dist-packages/tensorflow/include/* /opt/tensorflow/cc/tensorflow && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs && \
+    ln -fs /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
+    cd /opt/tensorflow && cp -r /usr/local/mpi cc/ && ./nvbuild.sh --python3.5 --configonly && \
+    bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" --define=gpu=1 --define=trt=1 --define=tf=1 --define=board_size=${BOARD_SIZE} cc:selfplay cc:eval cc:selfplay_mpi cc:eval_mpi && \
+    cp -r bazel-bin /opt/reinforcement/minigo
@@ -0,0 +1,74 @@
+# NVIDIA Tensorflow runs on Python 3.5
+#
+# The base image is selected with --build-arg FROM_IMAGE_NAME=<image>.
+# The ARG has to precede the first FROM: an ARG declared inside a build stage
+# is not in scope for a later FROM line, which would leave the image name blank.
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.05-py3
+FROM ${FROM_IMAGE_NAME}
+
+# Environment for both the build steps below and the final image.
+# NOTE(review): the CUDA stubs directory is first on LD_LIBRARY_PATH in the
+# final image too; confirm the stub libcuda is not picked up at runtime.
+ENV PYTHONUNBUFFERED=1 \
+    LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/usr/local/cuda/lib64:/usr/local/nvidia/lib64:/usr/local/lib:$LD_LIBRARY_PATH \
+    PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH \
+    LANG=C.UTF-8
+
+# System packages needed by the system configuration logger.
+# The apt package lists are removed in the same layer that created them.
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+        infiniband-diags \
+        pciutils \
+        python2.7 \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install gsutil (shipped with the Google Cloud SDK).
+# CLOUDSDK_PYTHON selects the python2.7 interpreter for the SDK tools.
+ENV CLOUDSDK_PYTHON=/usr/bin/python2.7
+# The installer is downloaded to a file rather than piped into bash: with a
+# pipe the RUN status is bash's alone, so a failed download would go unnoticed.
+# curl -f additionally turns HTTP errors into a non-zero exit status.
+RUN curl -fsSL https://sdk.cloud.google.com -o /tmp/install-google-cloud-sdk.sh && \
+    bash /tmp/install-google-cloud-sdk.sh --disable-prompts && \
+    rm -f /tmp/install-google-cloud-sdk.sh
+# Make the SDK command-line tools available on PATH.
+ENV PATH=$PATH:/root/google-cloud-sdk/bin
+
+# Python dependencies, installed from /opt/reinforcement:
+#   1. the compliance package from a pinned commit archive of mlperf/training
+#   2. the packages listed in MiniGo's requirements.txt
+WORKDIR /opt/reinforcement
+
+COPY minigo/requirements.txt ./requirements.txt
+RUN pip install --no-cache-dir \
+        https://github.com/mlperf/training/archive/6289993e1e9f0f5c4534336df83ff199bd0cdb75.zip#subdirectory=compliance && \
+    pip install --no-cache-dir \
+        -r requirements.txt
+
+# Copy MiniGo code and build
+# The whole build context is copied into the current WORKDIR.
+COPY . .
+
+# Board size; handed to bazel below as --define=board_size.
+ENV BOARD_SIZE 9
+
+# Build the MiniGo C++ binaries inside the /opt/tensorflow tree. In order:
+#   1. copy the MiniGo cc/ sources into /opt/tensorflow, then copy the
+#      TensorFlow library directory, the TensorFlow headers and /usr/local/mpi
+#      underneath cc/;
+#   2. append the CUDA stubs directory to LD_LIBRARY_PATH (for this RUN only)
+#      and symlink the stub libcuda.so as libcuda.so.1 -- presumably so the
+#      build can link without a GPU driver present; TODO confirm;
+#   3. run nvbuild.sh in --configonly mode, then bazel-build the selfplay and
+#      eval targets together with their MPI variants;
+#   4. copy bazel-bin back into /opt/reinforcement/minigo.
+# NOTE(review): the python3.5 paths and flag are hard-coded; they need to
+# match the Python version shipped in the base image.
+RUN cp -r /opt/reinforcement/minigo/cc /opt/tensorflow && \
+    cp -r /usr/local/lib/tensorflow /opt/tensorflow/cc && \
+    cp -r /usr/local/lib/python3.5/dist-packages/tensorflow/include/* /opt/tensorflow/cc/tensorflow && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs && \
+    ln -fs /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
+    cd /opt/tensorflow && cp -r /usr/local/mpi cc/ && ./nvbuild.sh --python3.5 --configonly && \
+    bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" --define=gpu=1 --define=trt=1 --define=tf=1 --define=board_size=${BOARD_SIZE} cc:selfplay cc:eval cc:selfplay_mpi cc:eval_mpi && \
+    cp -r bazel-bin /opt/reinforcement/minigo
+
+# Runtime environment: unbuffered Python output, the standard system
+# directories placed ahead of the existing PATH, and a UTF-8 locale.
+# A ':' separates /bin from the inherited $PATH (a stray, unterminated double
+# quote used to sit there and broke the instruction).
+ENV PYTHONUNBUFFERED=1 \
+    PATH=/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH \
+    LANG=C.UTF-8
+
+# Interactive tooling for /usr/bin/python3: upgrade pip and setuptools, then
+# install IPython, Pillow, h5py and Jupyter. --no-cache-dir is passed to every
+# pip call so that no download cache is left behind in the layer.
+RUN /usr/bin/python3 -m pip install --no-cache-dir -U pip setuptools && \
+    /usr/bin/python3 -m pip install --no-cache-dir ipython && \
+    /usr/bin/python3 -m pip install --no-cache-dir pillow && \
+    /usr/bin/python3 -m pip install --no-cache-dir h5py && \
+    /usr/bin/python3 -m pip install --no-cache-dir jupyter
+
+# Install ipython kernelspec
+# The kernel is registered under the display name below; the generated
+# kernel.json is then printed into the build log.
+RUN /usr/bin/python3 -m ipykernel install --display-name "benchmark MiniGo worked by NGC-Tensorflow 19.05 on Backend.AI" && \
+    cat /usr/local/share/jupyter/kernels/python3/kernel.json
+
+# Backend.AI specifics
+# policy.yml is installed at the Backend.AI jail policy path.
+COPY policy.yml /etc/backend.ai/jail/policy.yml
+
+# Kernel spec version, features, accelerator and core-count variables.
+LABEL ai.backend.kernelspec="1" \
+      ai.backend.features="batch uid-match" \
+      ai.backend.accelerators="cuda" \
+      ai.backend.envs.corecount="OPENBLAS_NUM_THREADS,OMP_NUM_THREADS,NPROC"
+
+# Minimum resources.
+LABEL ai.backend.resource.min.cpu="1" \
+      ai.backend.resource.min.mem="1g" \
+      ai.backend.resource.min.cuda.device="1" \
+      ai.backend.resource.min.cuda.shares="0.1"
+
+# Base distribution, runtime and service ports.
+LABEL ai.backend.base-distro="ubuntu16.04" \
+      ai.backend.runtime-type="python" \
+      ai.backend.runtime-path="/usr/bin/python3" \
+      ai.backend.service-ports="ipython:pty:3000,tensorboard:http:6006,jupyter:http:8080"
+
+# vim: ft=dockerfile
@@ -0,0 +1,69 @@
+# NVIDIA Tensorflow runs on Python 3.5
+#
+# The base image is selected with --build-arg FROM_IMAGE_NAME=<image>.
+# The ARG has to precede the first FROM: an ARG declared inside a build stage
+# is not in scope for a later FROM line, which would leave the image name blank.
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.05-py3
+FROM ${FROM_IMAGE_NAME}
+
+# System packages needed by the system configuration logger.
+# The apt package lists are removed in the same layer that created them.
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+        infiniband-diags \
+        pciutils \
+        python2.7 \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install gsutil (shipped with the Google Cloud SDK).
+# CLOUDSDK_PYTHON selects the python2.7 interpreter for the SDK tools.
+ENV CLOUDSDK_PYTHON=/usr/bin/python2.7
+# The installer is downloaded to a file rather than piped into bash: with a
+# pipe the RUN status is bash's alone, so a failed download would go unnoticed.
+# curl -f additionally turns HTTP errors into a non-zero exit status.
+RUN curl -fsSL https://sdk.cloud.google.com -o /tmp/install-google-cloud-sdk.sh && \
+    bash /tmp/install-google-cloud-sdk.sh --disable-prompts && \
+    rm -f /tmp/install-google-cloud-sdk.sh
+# Make the SDK command-line tools available on PATH.
+ENV PATH=$PATH:/root/google-cloud-sdk/bin
+
+# Python dependencies, installed from /opt/reinforcement:
+#   1. the compliance package from a pinned commit archive of mlperf/training
+#   2. the packages listed in MiniGo's requirements.txt
+WORKDIR /opt/reinforcement
+
+COPY minigo/requirements.txt ./requirements.txt
+RUN pip install --no-cache-dir \
+        https://github.com/mlperf/training/archive/6289993e1e9f0f5c4534336df83ff199bd0cdb75.zip#subdirectory=compliance && \
+    pip install --no-cache-dir \
+        -r requirements.txt
+
+# Copy MiniGo code and build
+# The whole build context is copied into the current WORKDIR.
+COPY . .
+
+# Board size; handed to bazel below as --define=board_size.
+ENV BOARD_SIZE 9
+
+# Build the MiniGo C++ binaries inside the /opt/tensorflow tree. In order:
+#   1. copy the MiniGo cc/ sources into /opt/tensorflow, then copy the
+#      TensorFlow library directory, the TensorFlow headers and /usr/local/mpi
+#      underneath cc/;
+#   2. append the CUDA stubs directory to LD_LIBRARY_PATH (for this RUN only)
+#      and symlink the stub libcuda.so as libcuda.so.1 -- presumably so the
+#      build can link without a GPU driver present; TODO confirm;
+#   3. run nvbuild.sh in --configonly mode, then bazel-build the selfplay and
+#      eval targets together with their MPI variants;
+#   4. copy bazel-bin back into /opt/reinforcement/minigo.
+# NOTE(review): the python3.5 paths and flag are hard-coded; they need to
+# match the Python version shipped in the base image.
+RUN cp -r /opt/reinforcement/minigo/cc /opt/tensorflow && \
+    cp -r /usr/local/lib/tensorflow /opt/tensorflow/cc && \
+    cp -r /usr/local/lib/python3.5/dist-packages/tensorflow/include/* /opt/tensorflow/cc/tensorflow && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs && \
+    ln -fs /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
+    cd /opt/tensorflow && cp -r /usr/local/mpi cc/ && ./nvbuild.sh --python3.5 --configonly && \
+    bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" --define=gpu=1 --define=trt=1 --define=tf=1 --define=board_size=${BOARD_SIZE} cc:selfplay cc:eval cc:selfplay_mpi cc:eval_mpi && \
+    cp -r bazel-bin /opt/reinforcement/minigo
+
+# Runtime environment: unbuffered Python output, the standard system
+# directories placed ahead of the existing PATH, and a UTF-8 locale.
+# A ':' separates /bin from the inherited $PATH (a stray, unterminated double
+# quote used to sit there and broke the instruction).
+ENV PYTHONUNBUFFERED=1 \
+    PATH=/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH \
+    LANG=C.UTF-8
+
+# Interactive tooling for /usr/bin/python3: upgrade pip and setuptools, then
+# install IPython, Pillow, h5py and Jupyter. --no-cache-dir is passed to every
+# pip call so that no download cache is left behind in the layer.
+RUN /usr/bin/python3 -m pip install --no-cache-dir -U pip setuptools && \
+    /usr/bin/python3 -m pip install --no-cache-dir ipython && \
+    /usr/bin/python3 -m pip install --no-cache-dir pillow && \
+    /usr/bin/python3 -m pip install --no-cache-dir h5py && \
+    /usr/bin/python3 -m pip install --no-cache-dir jupyter
+
+# Install ipython kernelspec
+# The kernel is registered under the display name below; the generated
+# kernel.json is then printed into the build log.
+RUN /usr/bin/python3 -m ipykernel install --display-name "benchmark MiniGo worked by NGC-Tensorflow 19.05 on Backend.AI" && \
+    cat /usr/local/share/jupyter/kernels/python3/kernel.json
+
+# Backend.AI specifics
+# policy.yml is installed at the Backend.AI jail policy path.
+COPY policy.yml /etc/backend.ai/jail/policy.yml
+
+# Kernel spec version, features, accelerator and core-count variables.
+LABEL ai.backend.kernelspec="1" \
+      ai.backend.features="batch uid-match" \
+      ai.backend.accelerators="cuda" \
+      ai.backend.envs.corecount="OPENBLAS_NUM_THREADS,OMP_NUM_THREADS,NPROC"
+
+# Minimum resources.
+LABEL ai.backend.resource.min.cpu="1" \
+      ai.backend.resource.min.mem="1g" \
+      ai.backend.resource.min.cuda.device="1" \
+      ai.backend.resource.min.cuda.shares="0.1"
+
+# Base distribution, runtime and service ports.
+LABEL ai.backend.base-distro="ubuntu16.04" \
+      ai.backend.runtime-type="python" \
+      ai.backend.runtime-path="/usr/bin/python3" \
+      ai.backend.service-ports="ipython:pty:3000,tensorboard:http:6006,jupyter:http:8080"
+
+# vim: ft=dockerfile
@@ -0,0 +1,92 @@
+# 1. Problem
+
+This task benchmarks reinforcement learning for the 9x9 version of the boardgame go.
+The model plays games against itself and uses these games to improve play.
+
+## Requirements
+* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
+* [TensorFlow 19.05-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
+
+# 2. Directions
+## Steps to download and verify data
+
+All training data is generated during the selfplay phase of the RL loop.
+
+The only data to be downloaded are the starting checkpoint and the target model. These are downloaded automatically
+before the training starts.
+
+## Steps to launch training
+
+### NVIDIA DGX-1 (single node)
+Launch configuration and system-specific hyperparameters for the NVIDIA DGX-1
+single node submission are in the `config_DGX1.sh` script.
+
+Steps required to launch single node training on NVIDIA DGX-1:
+
+```
+docker build --pull -t mlperf-nvidia:minigo .
+LOGDIR=<path/to/output/dir> CONT=mlperf-nvidia:minigo DGXSYSTEM=DGX1 ./run.sub
+```
+
+### NVIDIA DGX-1 (multi node)
+Launch configuration and system-specific hyperparameters for the NVIDIA DGX-1
+multi node submission are in the `config_DGX1_multi.sh` script.
+
+Steps required to launch multi node training on NVIDIA DGX-1:
+
+1. Build the docker container and push to a docker registry
+```
+docker build --pull -t <docker/registry>/mlperf-nvidia:minigo .
+docker push <docker/registry>/mlperf-nvidia:minigo
+```
+
+2. Launch the training
+```
+source config_DGX1_multi.sh && CONT="<docker/registry>/mlperf-nvidia:minigo" LOGDIR=<path/to/output/dir> DGXSYSTEM=DGX1_multi sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
+```
+
+# 3. Model
+### Publication/Attribution
+
+This benchmark is based on a fork of the minigo project (https://github.com/tensorflow/minigo), which is inspired by the work done by DeepMind with ["Mastering the Game of Go with Deep Neural Networks and
+Tree Search"](https://www.nature.com/articles/nature16961), ["Mastering the Game of Go without Human
+Knowledge"](https://www.nature.com/articles/nature24270), and ["Mastering Chess and Shogi by
+Self-Play with a General Reinforcement Learning
+Algorithm"](https://arxiv.org/abs/1712.01815). Note that minigo is an
+independent effort from AlphaGo, and that this fork of minigo is independent from minigo itself.
+
+
+### Reinforcement Setup
+
+This benchmark includes both the environment and training for 9x9 go. There are three primary phases performed in each iteration:
+
+ - Selfplay: the *current best* model plays games against itself to produce board positions for training.
+ - Training: train the neural networks on selfplay data from several recent models.
+ - Model Evaluation: the *current best* and the most recently trained model play a series of games to establish if the current model should replace the current best.
+ 
+ Target evaluation is performed after completing the training (please see the Quality section below for more details).
+
+### Structure
+
+This task has a non-trivial network structure, including a search tree.
+A good overview of the structure can be found here: https://medium.com/applied-data-science/alphago-zero-explained-in-one-diagram-365f5abf67e0. 
+
+### Weight and bias initialization and Loss Function
+Network weights are initialized with a fixed checkpoint downloaded before the training starts. The loss function is described here:
+["Mastering the Game of Go with Deep Neural Networks and Tree Search"](https://www.nature.com/articles/nature16961)
+
+### Optimizer
+We use a MomentumOptimizer to train the primary network. 
+
+# 4. Quality
+
+### Quality metric
+Quality is measured by the number of games won out of 100 against a fixed target model.
+The target model is downloaded automatically before the training starts.
+
+### Quality target
+The target is to win at least 50 out of 100 games against the target model.
+
+### Evaluation frequency
+Evaluations are performed after completing the training and are not timed.
+Checkpoints from every RL loop iteration are evaluated. 
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Launch configuration and system parameters for a single-node run.
+
+## DL params
+EXTRA_PARAMS=( )
+
+## System run params
+DGXNNODES=1
+# System name derived from this file's own name: config_<NAME>.sh -> <NAME>.
+# The expansions are quoted so a path containing spaces is not word-split.
+DGXSYSTEM=$(basename "$(readlink -f "${BASH_SOURCE[0]}")" | sed -e 's/^config_//' -e 's/\.sh$//')
+WALLTIME=6:00:00
+
+## System config params
+DGXNGPU=8
+DGXSOCKETCORES=20
+DGXNSOCKET=2
+DGXHT=2         # 2 when HT is on, 1 when HT is off
+DGXIBDEVICES=''
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Launch configuration and system parameters for a multi-node run.
+
+## Environment variables for multi node runs
+export HOROVOD_CYCLE_TIME=0.1
+export HOROVOD_FUSION_THRESHOLD=67108864
+
+## DL params
+EXTRA_PARAMS=( )
+
+## System run params
+DGXNNODES=3
+# System name derived from this file's own name: config_<NAME>.sh -> <NAME>.
+# The expansions are quoted so a path containing spaces is not word-split.
+DGXSYSTEM=$(basename "$(readlink -f "${BASH_SOURCE[0]}")" | sed -e 's/^config_//' -e 's/\.sh$//')
+WALLTIME=02:00:00
+
+## System config params
+DGXNGPU=8
+DGXSOCKETCORES=20
+DGXNSOCKET=2
+DGXHT=2         # 2 when HT is on, 1 when HT is off
+# InfiniBand device nodes, written as ready-made --device=... options.
+DGXIBDEVICES='--device=/dev/infiniband --device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/ucm0 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/uverbs0 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1 --device=/dev/infiniband/issm0 --device=/dev/infiniband/umad0'
@@ -0,0 +1,31 @@
+lib
+lib64
+bin
+data
+*__pycache__
+pip-selfcheck.json
+*.pyc
+sgf
+pyvenv.cfg
+# macOS Finder metadata; the real file name is .DS_Store (capital S), which the
+# lower-case pattern does not match on case-sensitive checkouts.
+.DS_Store
+.DS_store
+logs/
+saved_models/
+
+# Vim temp files
+*.swp
+*.swo
+*~
+
+.mypy_cache
+
+# Ignore any staging directory. We use this directory for docker-file creation.
+staging/
+
+bazel-*
+cc/tensorflow/
+
+minigui/static/*.js.map
+
+cluster/cgos/cgosGtp*
+
+.bazelrc