File tree Expand file tree Collapse file tree 1 file changed +8
-1
lines changed
docs/software/communication Expand file tree Collapse file tree 1 file changed +8
-1
lines changed Original file line number Diff line number Diff line change @@ -10,11 +10,18 @@ export NCCL_NET="AWS Libfabric"
10 10 # https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-net-gdr-level-formerly-nccl-ib-gdr-level
11 11 export NCCL_NET_GDR_LEVEL=PHB
12 12 export NCCL_CROSS_NIC=1
13+ export NCCL_NET_FORCE_FLUSH=1
13 14 # These `FI` (libfabric) environment variables have been found to give the best
14 15 # performance on the Alps network across a wide range of applications. Specific
15 16 # applications may perform better with other values.
16 17 export FI_CXI_DEFAULT_CQ_SIZE=131072
17 18 export FI_CXI_DEFAULT_TX_SIZE=32768
18 19 export FI_CXI_DISABLE_HOST_REGISTER=1
19- export FI_CXI_RX_MATCH_MODE=software
20+ export FI_CXI_RDZV_EAGER_SIZE=0
21+ export FI_CXI_RDZV_GET_MIN=0
22+ export FI_CXI_RDZV_THRESHOLD=0
23+ export FI_CXI_RX_MATCH_MODE=hardware
24+ export FI_CXI_SAFE_DEVMEM_COPY_THRESHOLD=16777216
25+ export FI_MR_CACHE_MAX_COUNT=524288
26+ export FI_MR_CACHE_MAX_SIZE=-1
20 27 export FI_MR_CACHE_MONITOR=userfaultfd
You can’t perform that action at this time.
0 commit comments