File tree Expand file tree Collapse file tree 2 files changed +14
-2
lines changed Expand file tree Collapse file tree 2 files changed +14
-2
lines changed Original file line number Diff line number Diff line change 2
2
3
3
set -ex
4
4
5
# Verify that libnccl-net.so is present in the OFI NCCL library directory.
#
# Called at the end of install_efa so the build fails early when the
# library is not where LD_LIBRARY_PATH expects it.
#
# Arguments:
#   $1 - (optional) directory expected to contain libnccl-net.so;
#        defaults to /opt/amazon/ofi-nccl/lib/x86_64-linux-gnu
# Outputs:
#   Writes an error message to stderr when the file is missing.
# Returns:
#   0 if the file exists and is a regular file, 1 otherwise.
function check_libnccl_net_so {
  local ofi_lib_dir="${1:-/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu}"
  local nccl_net_so="${ofi_lib_dir}/libnccl-net.so"

  # No padding inside the quotes: a leading or trailing space would become
  # part of the path and make the test fail even when the file is present.
  if [ ! -f "${nccl_net_so}" ]; then
    echo "ERROR: ${nccl_net_so} does not exist" >&2
    return 1
  fi
}
15
+
5
16
function install_efa {
6
17
EFA_VERSION=$1
7
18
OPEN_MPI_PATH="/opt/amazon/openmpi"
@@ -31,7 +42,7 @@ function install_efa {
31
42
echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf
32
43
echo NCCL_DEBUG=INFO >> /etc/nccl.conf
33
44
echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf
34
-
45
+
35
46
# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation
36
47
apt-get install -y --no-install-recommends \
37
48
openssh-client \
@@ -61,6 +72,7 @@ function install_efa {
61
72
apt-get autoremove -y
62
73
rm -rf /var/lib/apt/lists/*
63
74
ldconfig
75
+ check_libnccl_net_so
64
76
}
65
77
66
78
# idiomatic parameter and option handling in sh
Original file line number Diff line number Diff line change @@ -12,7 +12,7 @@ ENV DEBIAN_FRONTEND=noninteractive \
12
12
PYTHONDONTWRITEBYTECODE=1 \
13
13
PYTHONUNBUFFERED=1 \
14
14
PYTHONIOENCODING=UTF-8 \
15
- LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
15
+ LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
16
16
PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}"
17
17
18
18
WORKDIR /
You can’t perform that action at this time.
0 commit comments