Skip to content

Commit 7feea80

Browse files
committed
PyTorch 1.8 (Python 3.8 & CUDA 11.1)
1 parent a408ade commit 7feea80

File tree

3 files changed

+45
-11
lines changed

3 files changed

+45
-11
lines changed

python-pytorch/Dockerfile.1.8-py38-cuda11.1

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,20 @@
11
FROM lablup/common-base:py38-cuda11.1
2-
# Install PyTorch and MXNet
3-
ENV PYTORCH_VERSION=1.7.1
4-
ENV TORCHVISION_VERSION=0.8.2
5-
ENV TORCHAUDIO_VERSION=0.7.2
6-
ENV TORCHTEXT_VERSION=0.8.1
2+
3+
# Install PyTorch
4+
ENV PYTORCH_VERSION=1.8.1
5+
ENV TORCHVISION_VERSION=0.9.1
6+
ENV TORCHAUDIO_VERSION=0.8.1
7+
ENV TORCHTEXT_VERSION=0.9.1
78
ENV TENSORBOARDX_VERSION=2.1
9+
ENV MXNET_VERSION=1.7.0
810

911
RUN python3 -m pip uninstall -y torch && \
1012
python3 -m pip install --no-cache-dir \
11-
https://download.pytorch.org/whl/cu110/torch-${PYTORCH_VERSION}%2Bcu110-cp38-cp38-linux_x86_64.whl \
12-
https://download.pytorch.org/whl/cu110/torchvision-${TORCHVISION_VERSION}%2Bcu110-cp38-cp38-linux_x86_64.whl \
13+
https://download.pytorch.org/whl/cu111/torch-${PYTORCH_VERSION}%2Bcu111-cp38-cp38-linux_x86_64.whl \
14+
https://download.pytorch.org/whl/cu111/torchvision-${TORCHVISION_VERSION}%2Bcu111-cp38-cp38-linux_x86_64.whl \
1315
https://download.pytorch.org/whl/torchaudio-${TORCHAUDIO_VERSION}-cp38-cp38-linux_x86_64.whl \
16+
https://download.pytorch.org/whl/cu111/torchcsprng-0.2.1%2Bcu111-cp38-cp38-linux_x86_64.whl \
17+
https://download.pytorch.org/whl/torchserve-0.3.0-py2.py3-none-any.whl \
1418
https://download.pytorch.org/whl/torchtext-${TORCHTEXT_VERSION}-cp38-cp38-linux_x86_64.whl && \
1519
python3 -m pip install --no-cache-dir tensorboardX==${TENSORBOARDX_VERSION}
1620

@@ -28,23 +32,25 @@ RUN python3 -m pip install --extra-index-url \
2832
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \
2933
HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL \
3034
HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 \
31-
pip install --no-cache-dir horovod==0.21.1 && \
35+
pip install --no-cache-dir horovod==0.21.3 && \
3236
ldconfig
3337

3438
RUN python3 -m pip install --no-cache-dir \
3539
mpi4py==3.0.3 \
36-
mlflow==1.12.1 \
37-
nni==1.9 \
40+
mlflow==1.15.0 \
41+
nni==2.1 \
3842
scikit-nni==0.2.1
3943

4044
RUN apt autoclean && \
4145
rm -rf /var/lib/apt/lists/* && \
4246
rm -rf /root/.cache && \
4347
rm -rf /tmp/*
48+
4449
COPY ./service-defs /etc/backend.ai/service-defs
50+
COPY ./runner-scripts/bootstrap.sh runner-scripts/setup_multinode.py /opt/container/
4551

4652
# Install ipython kernelspec
47-
Run python3 -m ipykernel install --display-name "PyTorch 1.7.1 on Python 3.8 (CUDA 11.1)" && \
53+
Run python3 -m ipykernel install --display-name "PyTorch 1.8.1 on Python 3.8 (CUDA 11.1)" && \
4854
cat /usr/local/share/jupyter/kernels/python3/kernel.json
4955

5056
# Backend.AI specifics
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Bootstrap hook: derive a TF_CONFIG value for multi-node sessions.
#
# Runs /opt/container/setup_multinode.py and captures whatever it prints.
# If the captured output is non-empty it is echoed and exported as TF_CONFIG;
# otherwise an empty line is printed and TF_CONFIG is left untouched.
#
# NOTE(review): the export is only visible to later processes if this script
# is sourced, or if they are started from it -- confirm how the runner
# invokes this file.
BAI_MULTINODE_CONFIG_TF=$(/opt/backend.ai/bin/python /opt/container/setup_multinode.py)
if [ -z "$BAI_MULTINODE_CONFIG_TF" ]; then
    echo ""
else
    # Quote the expansion: the captured value is expected to be JSON, and an
    # unquoted expansion would be subject to word splitting and glob
    # expansion (characters such as "[", "]" and "*" are pattern syntax).
    echo "${BAI_MULTINODE_CONFIG_TF}"
    export TF_CONFIG="${BAI_MULTINODE_CONFIG_TF}"
fi
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
"""Print a cluster/task description for a multi-container session as JSON.

When the ``BACKENDAI_CLUSTER_*`` environment variables are present, a JSON
document of the form ``{"cluster": {"worker": [...]}, "task": {...}}`` is
written to stdout; otherwise an empty line is printed.
"""
import json
import os

# Port suffix appended to every host listed in BACKENDAI_CLUSTER_HOSTS.
WORKER_PORT = "2220"


def build_multinode_config(environ):
    """Build the cluster/task mapping from ``environ``.

    Args:
        environ: Mapping of environment variable names to string values
            (normally ``os.environ``).

    Returns:
        A dict with ``cluster`` and ``task`` entries, or ``None`` when
        ``BACKENDAI_CLUSTER_HOST`` or ``BACKENDAI_CLUSTER_HOSTS`` is not set.

    Raises:
        KeyError: If ``BACKENDAI_CLUSTER_ROLE`` or ``BACKENDAI_CLUSTER_IDX``
            is missing while the host variables are set.
        ValueError: If the role is ``main`` and ``BACKENDAI_CLUSTER_IDX`` is
            not an integer string.
    """
    # Start multi-instance setup only when the session advertises a cluster.
    if 'BACKENDAI_CLUSTER_HOST' not in environ:
        return None
    # The host list is a different variable from the one tested above, so it
    # is looked up defensively instead of raising KeyError when it is absent.
    hosts = environ.get('BACKENDAI_CLUSTER_HOSTS')
    if hosts is None:
        return None

    workers = [host + ":" + WORKER_PORT for host in hosts.split(",")]

    if environ['BACKENDAI_CLUSTER_ROLE'] == 'main':
        # The task type was "chief", but recent TF versions choose the first
        # worker as chief, so the main container is reported as a worker.
        # Its index is shifted down by one so that it starts from 0.
        index = str(int(environ['BACKENDAI_CLUSTER_IDX']) - 1)
    else:
        index = environ['BACKENDAI_CLUSTER_IDX']

    return {
        'cluster': {'worker': workers},
        'task': {'type': "worker", 'index': index},
    }


def main():
    """Write the JSON configuration, or an empty line, to stdout."""
    config = build_multinode_config(os.environ)
    if config is None:
        print("")
    else:
        print(json.dumps(config))


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)