Skip to content

Commit 3622cc2

Browse files
DevakiBolleneni and DevakiBolleneni authored
update EFA version in cu130 base image (#5468)
* update EFA version in base image * rebuild after efa version update * renamed the NCCL OFI plugin library * fix NCCL OFI plugin library issue * temporarily disable NCCL check * Enable NCCL check and build * Enable efa version based NCCL check * fix typo * test build for cuda129 * revert back toml file --------- Co-authored-by: DevakiBolleneni <[email protected]>
1 parent fd8fd7f commit 3622cc2

File tree

3 files changed

+17
-4
lines changed

3 files changed

+17
-4
lines changed

base/x86_64/gpu/cu130/ubuntu22.04/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ ARG PYTHON_VERSION="3.12.10"
33
ARG PYTHON_SHORT_VERSION="3.12"
44
ARG CUDA_MAJOR="13"
55
ARG CUDA_MINOR="0"
6-
ARG EFA_VERSION="1.43.3"
6+
ARG EFA_VERSION="1.44.0"
77
ARG OS_VERSION="ubuntu22.04"
88
FROM nvidia/cuda:13.0.0-base-${OS_VERSION} AS base-builder
99

dlc_developer_config.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ deep_canary_mode = false
3636

3737
[build]
3838
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
39-
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
39+
# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
4040
build_frameworks = []
4141

4242

scripts/install_efa.sh

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,26 @@ case $ARCH in
1717
esac
1818

1919
#######################################
# Compare two dotted numeric versions.
# Each component is compared as a base-10 integer (so 1.100.0 > 1.44.0);
# missing trailing components count as 0.
# Arguments: $1 - version to test, $2 - minimum version
# Returns:   0 if $1 >= $2, 1 otherwise
#######################################
function _efa_version_at_least {
  local -a have want
  local i count
  IFS=. read -r -a have <<< "$1"
  IFS=. read -r -a want <<< "$2"
  count=${#have[@]}
  if (( ${#want[@]} > count )); then
    count=${#want[@]}
  fi
  for (( i = 0; i < count; i++ )); do
    # 10# forces base 10 so a component such as "08" is not parsed as octal.
    if (( 10#${have[i]:-0} > 10#${want[i]:-0} )); then
      return 0
    fi
    if (( 10#${have[i]:-0} < 10#${want[i]:-0} )); then
      return 1
    fi
  done
  return 0
}

#######################################
# Verify that the NCCL OFI plugin shared library is present.
# EFA 1.44.0 and newer use /opt/amazon/ofi-nccl/lib/libnccl-net-ofi.so;
# older versions use /opt/amazon/ofi-nccl/lib/${ARCH_DIR}/libnccl-net.so.
# Globals:   EFA_VERSION (read), ARCH_DIR (read),
#            OFI_LIB_DIR (written), NCCL_NET_SO (written)
# Outputs:   progress and error messages on stdout
# Returns:   0 if the library file exists, 1 otherwise
#######################################
function check_libnccl_net_so {
  local efa_version="${EFA_VERSION:-}"
  local threshold="1.44.0"
  local numeric_re='^[0-9]+(\.[0-9]+)*$'
  local use_new_layout=0

  if [[ "$efa_version" =~ $numeric_re ]]; then
    # Numeric comparison: a plain string comparison would rank
    # 1.100.0 below 1.44.0 and 1.9.0 above it.
    if _efa_version_at_least "$efa_version" "$threshold"; then
      use_new_layout=1
    fi
  elif [[ "$efa_version" > "$threshold" ]]; then
    # Non-numeric values (e.g. "latest") keep the original string ordering.
    use_new_layout=1
  fi

  if (( use_new_layout == 1 )); then
    # Newer EFA version - no ARCH_DIR, different filename
    OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib"
    NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net-ofi.so"
    echo "Using newer EFA path structure"
  else
    # Older EFA version - uses ARCH_DIR
    OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/${ARCH_DIR}"
    NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net.so"
    echo "Using older EFA path structure with ARCH_DIR: $ARCH_DIR"
  fi

  # Check if file exists
  if [ ! -f "$NCCL_NET_SO" ]; then
    echo "ERROR: $NCCL_NET_SO does not exist"
    return 1
  else
    echo "NCCL OFI plugin found at: $NCCL_NET_SO"
    return 0
  fi
}
2942

0 commit comments

Comments
 (0)