diff --git a/docs/software/communication/nccl_env_vars b/docs/software/communication/nccl_env_vars
index f2843c28..9818f9d8 100644
--- a/docs/software/communication/nccl_env_vars
+++ b/docs/software/communication/nccl_env_vars
@@ -10,11 +10,18 @@ export NCCL_NET="AWS Libfabric"
 # https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-net-gdr-level-formerly-nccl-ib-gdr-level
 export NCCL_NET_GDR_LEVEL=PHB
 export NCCL_CROSS_NIC=1
+export NCCL_NET_FORCE_FLUSH=1
 # These `FI` (libfabric) environment variables have been found to give the best
 # performance on the Alps network across a wide range of applications. Specific
 # applications may perform better with other values.
 export FI_CXI_DEFAULT_CQ_SIZE=131072
 export FI_CXI_DEFAULT_TX_SIZE=32768
 export FI_CXI_DISABLE_HOST_REGISTER=1
-export FI_CXI_RX_MATCH_MODE=software
+export FI_CXI_RDZV_EAGER_SIZE=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RX_MATCH_MODE=hardware
+export FI_CXI_SAFE_DEVMEM_COPY_THRESHOLD=16777216
+export FI_MR_CACHE_MAX_COUNT=524288
+export FI_MR_CACHE_MAX_SIZE=-1
 export FI_MR_CACHE_MONITOR=userfaultfd