diff --git a/.github/workflows/distcheck.yaml b/.github/workflows/distcheck.yaml index 62f34ca83a..30e0da3f29 100644 --- a/.github/workflows/distcheck.yaml +++ b/.github/workflows/distcheck.yaml @@ -38,6 +38,10 @@ env: # environment variable is also now needed, as of july 2024. # ref: https://github.com/actions/runner/issues/2906#issuecomment-2208546951 ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: 'true' + + # APT package list for caching + APT_CACHE_RESTORE_KEYS: | + apt-cache-ubuntu-${{ github.job }} concurrency: group: ${{ github.head_ref || github.run_id }} @@ -190,19 +194,51 @@ jobs: sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 10 fi + - name: Setup APT Cache for Distcheck + id: apt-cache-distcheck + uses: actions/cache@v3 + with: + path: /var/cache/apt/archives + key: apt-cache-ubuntu-distcheck-${{ hashFiles('**/workflows/distcheck.yaml') }} + restore-keys: ${{ env.APT_CACHE_RESTORE_KEYS }} + - name: Install Base Dependencies run: | - sudo apt-get update -y + # Download packages without installing if they're not in cache + sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y) + sudo apt-get install -y --download-only ${{ env.APT_PACKAGES }} || true + # Install from cache sudo apt-get install -y ${{ env.APT_PACKAGES }} + - name: Setup CUDA APT Cache + id: cuda-apt-cache-distcheck + if: matrix.sdk == 'cuda' + uses: actions/cache@v3 + with: + path: /var/cache/apt/archives + key: cuda-apt-cache-distcheck-ubuntu-${{ hashFiles('**/workflows/distcheck.yaml') }} + restore-keys: | + cuda-apt-cache-ubuntu- + - name: Install CUDA SDK if: matrix.sdk == 'cuda' run: | - sudo apt-get update -y && sudo apt-get install -y wget lsb-release + # Install with retry logic + sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y) + sudo apt-get install -y wget lsb-release || (sleep 10 && sudo apt-get install -y wget lsb-release) repo="ubuntu$(lsb_release -r | cut -d':' -f2 | xargs | sed 's/[.]//g')" - wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb + for i in {1..3}; do + if wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb; then + break + fi + echo "Retrying wget in 10 seconds..." + sleep 10 + done sudo dpkg -i cuda-keyring_1.1-1_all.deb - sudo apt-get update -y + sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y) + # Download packages without installing if not in cache + sudo apt-get install -y --download-only cuda-cudart-dev-12-6 cuda-crt-12-6 || true + # Install from cache sudo apt-get install -y cuda-cudart-dev-12-6 cuda-crt-12-6 - name: Install lttng @@ -281,24 +317,58 @@ jobs: - uses: actions/setup-python@v5 with: python-version: '3.9' + + - name: Setup APT Cache for CodeChecker + id: apt-cache-codechecker + uses: actions/cache@v3 + with: + path: /var/cache/apt/archives + key: apt-cache-ubuntu-codechecker-${{ hashFiles('**/workflows/distcheck.yaml') }} + restore-keys: ${{ env.APT_CACHE_RESTORE_KEYS }} - name: Install Base Dependencies run: | - sudo apt-get update -y + # Download packages without installing if they're not in cache + sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y) + sudo apt-get install -y --download-only ${{ env.APT_PACKAGES }} || true + # Install from cache sudo apt-get install -y ${{ env.APT_PACKAGES }} + - name: Setup CUDA APT Cache + id: cuda-apt-cache-codechecker + if: matrix.sdk == 'cuda' + uses: actions/cache@v3 + with: + path: /var/cache/apt/archives + key: cuda-apt-cache-codechecker-ubuntu-${{ hashFiles('**/workflows/distcheck.yaml') }} + restore-keys: | + cuda-apt-cache-ubuntu- + - name: Install CUDA SDK if: matrix.sdk == 'cuda' run: | - sudo apt-get update -y && sudo apt-get install -y wget lsb-release + # Install with retry logic + sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y) + sudo apt-get install -y wget lsb-release || (sleep 10 && sudo apt-get install -y wget lsb-release) repo="ubuntu$(lsb_release -r | cut -d':' -f2 | xargs | sed 's/[.]//g')" - wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb + for i in {1..3}; do + if wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb; then + break + fi + echo "Retrying wget in 10 seconds..." + sleep 10 + done sudo dpkg -i cuda-keyring_1.1-1_all.deb - sudo apt-get update -y + sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y) + # Download packages without installing if not in cache + sudo apt-get install -y --download-only cuda-cudart-dev-12-6 cuda-crt-12-6 || true + # Install from cache sudo apt-get install -y cuda-cudart-dev-12-6 cuda-crt-12-6 - name: Install cppcheck run: | + sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y) + sudo apt-get install -y --download-only cppcheck || true sudo apt-get install -y cppcheck - name: Fetch and Install EFA Installer Dependencies diff --git a/.github/workflows/tag-makedist.yaml b/.github/workflows/tag-makedist.yaml index 43dbf6c33c..c9430df4f3 100644 --- a/.github/workflows/tag-makedist.yaml +++ b/.github/workflows/tag-makedist.yaml @@ -6,6 +6,11 @@ on: - release/** - v** +env: + # For caching APT packages + APT_CACHE_RESTORE_KEYS: | + apt-cache-ubuntu-makedist + jobs: amazonlinux: strategy: @@ -29,27 +34,77 @@ jobs: container: ${{ matrix.container }} name: make dist for tag ${{ github.ref_name }} steps: + - name: Setup YUM Cache + uses: actions/cache@v3 + id: yum-cache + with: + path: /var/cache/yum + key: yum-cache-ubuntu-${{ hashFiles('**/workflows/tag-makedist.yaml') }} + restore-keys: | + yum-cache-ubuntu- + - run: | - yum -y update && yum -y install git tar util-linux findutils yum-utils + # Configure YUM to keep cache + echo 'keepcache=1' | tee -a /etc/yum.conf + # Add retry logic + for i in {1..3}; do + if yum -y update && yum -y install git tar util-linux findutils yum-utils; then + break + fi + echo "Retrying yum in 10 seconds..." + sleep 10 + done - uses: actions/checkout@v4 - name: Fetch and Install EFA Installer Dependencies run: | - curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${{ matrix.efainstaller }}.tar.gz + # Add retry logic for curl + for i in {1..3}; do + if curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${{ matrix.efainstaller }}.tar.gz; then + break + fi + echo "Retrying curl in 10 seconds..." + sleep 10 + done tar -xf aws-efa-installer-*.tar.gz + # Install EFA RPMs from local directory to avoid network issues ( cd aws-efa-installer/RPMS/${{ matrix.efainstallerdir }}/x86_64 ; find . | grep rpm$ | xargs yum -y localinstall ) rm -rf aws-efa-installer* - name: Install hwloc, utilities. run: | - yum -y install hwloc-devel autoconf automake libtool gcc gcc-c++ git make + # Add retry logic for package installation + for i in {1..3}; do + if yum -y install hwloc-devel autoconf automake libtool gcc gcc-c++ git make; then + break + fi + echo "Retrying yum in 10 seconds..." + sleep 10 + done - name: Install CUDA run: | - ${{ matrix.configmanager }} --add-repo \ - http://developer.download.nvidia.com/compute/cuda/repos/${{ matrix.nvidiadistro }}/x86_64/cuda-${{ matrix.nvidiadistro }}.repo \ - --save + # Add retry logic for repo config + for i in {1..3}; do + if ${{ matrix.configmanager }} --add-repo \ + http://developer.download.nvidia.com/compute/cuda/repos/${{ matrix.nvidiadistro }}/x86_64/cuda-${{ matrix.nvidiadistro }}.repo \ + --save; then + break + fi + echo "Retrying repo add in 10 seconds..." + sleep 10 + done + + # Clean and refresh cache yum -y clean expire-cache - yum -y install ${{ matrix.cudapackages }} + + # Add retry logic for package installation + for i in {1..3}; do + if yum -y install ${{ matrix.cudapackages }}; then + break + fi + echo "Retrying CUDA package installation in 10 seconds..." + sleep 10 + done - name: Call `autoreconf -ivf` run: | diff --git a/contrib/scripts/generate_debian_changelog.sh b/contrib/scripts/generate_debian_changelog.sh index ea46069964..22049e3311 100755 --- a/contrib/scripts/generate_debian_changelog.sh +++ b/contrib/scripts/generate_debian_changelog.sh @@ -1,5 +1,15 @@ #!/bin/bash +fmt=" \ +tagname=%(refname:short) \ +tagger_name=%(taggername:mailmap) \ +tagger_email=%(taggeremail:mailmap) \ +tagger_when=%(taggerdate) \ +committer_name=%(committername:mailmap) \ +committer_email=%(committeremail:mailmap) \ +committer_when=%(committerdate) \ +" + if [ $# -eq 0 ]; then tag=HEAD else @@ -12,8 +22,7 @@ if [ $? -ne 0 ]; then exit 0 fi -git log --use-mailmap --no-walk --format="tagname='%D' tagger_name='%aN' tagger_email='<%aE>' tagger_when='%ad' committer_name='%cN' committer_email='<%cE>' committer_when='%cd'" \ - --date="format:%a %b %-d %T %Y %z" $(git tag --merged "$tag") | sed "s/tagname='tag: \([^,']*\)'/tagname='\1'/" | { +git for-each-ref --shell --sort=-v:refname --format "$fmt" --merged "$tag" | { while read line; do eval $line (echo ${tagname} | grep -qE '^v[0-9]') || continue diff --git a/include/nccl_ofi.h b/include/nccl_ofi.h index ebdcd6f38a..b01759d0ed 100644 --- a/include/nccl_ofi.h +++ b/include/nccl_ofi.h @@ -558,7 +558,7 @@ struct nccl_net_ofi_send_comm { */ int (*deregMr)(nccl_net_ofi_send_comm_t *send_comm, nccl_net_ofi_mr_handle_t *mhandle); - int (*send)(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int tag, + int (*send)(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int tag, nccl_net_ofi_mr_handle_t *mhandle, nccl_net_ofi_req_t **req); int (*close)(nccl_net_ofi_send_comm_t *send_comm); @@ -591,7 +591,7 @@ struct nccl_net_ofi_recv_comm { */ int (*deregMr)(nccl_net_ofi_recv_comm_t *recv_comm, nccl_net_ofi_mr_handle_t *mhandle); - int (*recv)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, int *sizes, int *tags, + int (*recv)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, size_t *sizes, int *tags, nccl_net_ofi_mr_handle_t **mhandles, nccl_net_ofi_req_t **req); int (*flush)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, int *sizes, diff --git a/include/nccl_ofi_rdma.h b/include/nccl_ofi_rdma.h index 98690c8374..536c87ca7b 100644 --- a/include/nccl_ofi_rdma.h +++ b/include/nccl_ofi_rdma.h @@ -802,9 +802,6 @@ typedef struct nccl_net_ofi_rdma_device { * and its base struct. */ nccl_net_ofi_device_t base; - /* Message scheduler */ - nccl_net_ofi_scheduler_t *scheduler; - /* Number of rails */ uint16_t num_rails; @@ -850,6 +847,9 @@ typedef struct nccl_net_ofi_rdma_domain { /* List of endpoints and set of addresses they have connections to */ nccl_ofi_ep_addr_list_t *ep_addr_list; + + /* Message scheduler */ + nccl_net_ofi_scheduler_t *scheduler; } nccl_net_ofi_rdma_domain_t; diff --git a/include/tracing_impl/lttng.h b/include/tracing_impl/lttng.h index 893e2b060c..e026fe9d1c 100644 --- a/include/tracing_impl/lttng.h +++ b/include/tracing_impl/lttng.h @@ -52,7 +52,7 @@ LTTNG_UST_TRACEPOINT_EVENT( Send, LTTNG_UST_TP_ARGS( int, dev, - int, size, + size_t, size, void *, comm, uint16_t, msg_seq_num, void *, request, @@ -60,7 +60,7 @@ LTTNG_UST_TRACEPOINT_EVENT( ), LTTNG_UST_TP_FIELDS( lttng_ust_field_integer(int, dev, dev) - lttng_ust_field_integer(int, size, size) + lttng_ust_field_integer(size_t, size, size) lttng_ust_field_integer_hex(uint64_t, comm, (uint64_t)comm) lttng_ust_field_integer(uint16_t, msg_seq_num, msg_seq_num) lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request) @@ -238,14 +238,14 @@ LTTNG_UST_TRACEPOINT_EVENT( LTTNG_UST_TP_ARGS( int, dev, void *, comm, - int, size, + size_t, size, void *, request, void *, nccl_req ), LTTNG_UST_TP_FIELDS( lttng_ust_field_integer(int, dev, dev) lttng_ust_field_integer_hex(uint64_t, comm, (uint64_t)comm) - lttng_ust_field_integer(int, size, size) + lttng_ust_field_integer(size_t, size, size) lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request) lttng_ust_field_integer_hex(uint64_t, nccl_req, (uint64_t)nccl_req) ) diff --git a/src/nccl_ofi_api.cpp b/src/nccl_ofi_api.cpp index eaf1393152..9c459fb5a1 100644 --- a/src/nccl_ofi_api.cpp +++ b/src/nccl_ofi_api.cpp @@ -575,13 +575,19 @@ ncclResult_t nccl_net_ofi_isend_v2(void* sendComm, void* data, int size, } -ncclResult_t nccl_net_ofi_isend_v5(void *sComm, void* data, int size, - int tag, void *mhandle, void** req) +ncclResult_t nccl_net_ofi_isend_v5(void *sendComm, void* data, int size, + int tag, void *mhandle, void** request) +{ + return nccl_net_ofi_isend_v9(sendComm, data, static_cast(size), tag, mhandle, request); +} + +ncclResult_t nccl_net_ofi_isend_v9(void* sendComm, void* data, size_t size, + int tag, void* mhandle, void** request) { nccl_net_ofi_send_comm_t *send_comm = - (nccl_net_ofi_send_comm_t *)sComm; + (nccl_net_ofi_send_comm_t *)sendComm; nccl_net_ofi_mr_handle_t *handle = (nccl_net_ofi_mr_handle_t *)mhandle; - nccl_net_ofi_req_t **base_req = (nccl_net_ofi_req_t **)req; + nccl_net_ofi_req_t **base_req = (nccl_net_ofi_req_t **)request; /* Validate send_comm */ if (OFI_UNLIKELY(send_comm == NULL)) { @@ -604,35 +610,46 @@ ncclResult_t nccl_net_ofi_isend_v5(void *sComm, void* data, int size, return nccl_net_ofi_retval_translate(ret); } - -ncclResult_t nccl_net_ofi_isend_v9(void* sendComm, void* data, size_t size, - int tag, void* mhandle, void** request) +ncclResult_t nccl_net_ofi_irecv_v2(void* recvComm, void* data, int size, + void* mhandle, void** request) { - ncclResult_t validation_result = msg_length_verify_max_size(&size, 1); - if (validation_result != ncclSuccess) { - return check_return(validation_result); - } + int tag = 0; - return nccl_net_ofi_isend_v5(sendComm, data, (int)size, tag, mhandle, request); + return nccl_net_ofi_irecv_v5(recvComm, 1, &data, &size, &tag, &mhandle, request); } -ncclResult_t nccl_net_ofi_irecv_v2(void* recvComm, void* data, int size, - void* mhandle, void** request) +ncclResult_t nccl_net_ofi_irecv_v5(void* recvComm, int n, void** data, int* sizes, + int *tags, void** mhandles, void** request) { - int tag = 0; + size_t castedSizes[NCCL_OFI_MAX_RECVS] = {0}; + for (int i = 0; i < n; i++) { + castedSizes[i] = static_cast(sizes[i]); + } - return nccl_net_ofi_irecv_v5(recvComm, 1, &data, &size, &tag, &mhandle, request); + return nccl_net_ofi_irecv_v9(recvComm, n, data, castedSizes, tags, mhandles, request); } -ncclResult_t nccl_net_ofi_irecv_v5(void* rComm, int n, void** buffers, int* sizes, - int *tags, void** mhandles, void** req) +ncclResult_t nccl_net_ofi_irecv_v9(void* recvComm, int n, void** data, + size_t* sizes, int* tags, void** mhandles, void** request) { + if (OFI_UNLIKELY(recvComm == NULL || data == NULL || + sizes == NULL || tags == NULL || + mhandles == NULL || request == NULL)) { + NCCL_OFI_WARN("Invalid argument: NULL pointer detected"); + return check_return(ncclInvalidArgument); + } + + if (OFI_UNLIKELY(n <= 0 || n > NCCL_OFI_MAX_RECVS)) { + NCCL_OFI_WARN("Invalid number of receives: %d (max: %d)", n, NCCL_OFI_MAX_RECVS); + return check_return(ncclInvalidArgument); + } + nccl_net_ofi_recv_comm_t *recv_comm = - (nccl_net_ofi_recv_comm_t *)rComm; + (nccl_net_ofi_recv_comm_t *)recvComm; nccl_net_ofi_mr_handle_t **handles = (nccl_net_ofi_mr_handle_t **)mhandles; - nccl_net_ofi_req_t **base_req = (nccl_net_ofi_req_t **)req; + nccl_net_ofi_req_t **base_req = (nccl_net_ofi_req_t **)request; if (OFI_UNLIKELY(recv_comm == NULL)) { NCCL_OFI_WARN("Invalid communicator object provided"); @@ -661,40 +678,10 @@ ncclResult_t nccl_net_ofi_irecv_v5(void* rComm, int n, void** buffers, int* size return check_return(ncclInternalError); } - int ret = recv_comm->recv(recv_comm, n, buffers, sizes, tags, handles, base_req); + int ret = recv_comm->recv(recv_comm, n, data, sizes, tags, handles, base_req); return nccl_net_ofi_retval_translate(ret); } - -ncclResult_t nccl_net_ofi_irecv_v9(void* recvComm, int n, void** data, - size_t* sizes, int* tags, void** mhandles, void** request) -{ - if (OFI_UNLIKELY(recvComm == NULL || data == NULL || - sizes == NULL || tags == NULL || - mhandles == NULL || request == NULL)) { - NCCL_OFI_WARN("Invalid argument: NULL pointer detected"); - return check_return(ncclInvalidArgument); - } - - if (OFI_UNLIKELY(n <= 0 || n > NCCL_OFI_MAX_RECVS)) { - NCCL_OFI_WARN("Invalid number of receives: %d (max: %d)", n, NCCL_OFI_MAX_RECVS); - return check_return(ncclInvalidArgument); - } - - ncclResult_t validation_result = msg_length_verify_max_size(sizes, n); - if (validation_result != ncclSuccess) { - return check_return(validation_result); - } - - int sizesInt[NCCL_OFI_MAX_RECVS] = {0}; - for (int i = 0; i < n; i++) { - sizesInt[i] = (int)sizes[i]; - } - - return nccl_net_ofi_irecv_v5(recvComm, n, data, sizesInt, tags, mhandles, request); -} - - ncclResult_t nccl_net_ofi_test_v2(void* req, int* done, int* size) { /* Validate request */ diff --git a/src/nccl_ofi_rdma.cpp b/src/nccl_ofi_rdma.cpp index a9a5ae8e06..2b222acc9b 100644 --- a/src/nccl_ofi_rdma.cpp +++ b/src/nccl_ofi_rdma.cpp @@ -159,21 +159,6 @@ static nccl_net_ofi_rdma_plugin_t *rdma_device_get_plugin(nccl_net_ofi_rdma_devi return (nccl_net_ofi_rdma_plugin_t*)device->base.plugin; } - -static nccl_net_ofi_rdma_ep_t *rdma_req_get_ep(nccl_net_ofi_rdma_req_t *req) -{ - /* TODO: this function doesn't work for rx buffers, which have no - associated comm */ - assert(req->comm); - return (nccl_net_ofi_rdma_ep_t *)req->comm->ep; -} - - -static nccl_net_ofi_rdma_device_t *rdma_req_get_device(nccl_net_ofi_rdma_req_t *req) -{ - return (nccl_net_ofi_rdma_device_t *)rdma_req_get_ep(req)->base.domain->device; -} - /* * @brief Get endpoint communicator with given ID */ @@ -905,7 +890,9 @@ static inline int update_send_data_from_remote(nccl_net_ofi_rdma_send_comm_t *s_ assert(ep != NULL); nccl_net_ofi_rdma_device_t *device = rdma_endpoint_get_device(ep); - nccl_net_ofi_scheduler_t *scheduler = device->scheduler; + nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep); + assert(domain != NULL); + nccl_net_ofi_scheduler_t *scheduler = domain->scheduler; rdma_req_send_data_t *send_data = get_send_data(req); rdma_req_rx_buff_data_t *rx_buff_data = get_rx_buff_data(rx_buff_req); @@ -2200,8 +2187,11 @@ static inline int free_send_req(nccl_net_ofi_rdma_req_t *req, } if (send_data->schedule) { - nccl_net_ofi_rdma_device_t *device = rdma_req_get_device(req); - nccl_net_ofi_release_schedule(device->scheduler, send_data->schedule); + nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)s_comm->base.base.ep; + assert(ep != NULL); + nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep); + assert(domain != NULL); + nccl_net_ofi_release_schedule(domain->scheduler, send_data->schedule); send_data->schedule = NULL; } @@ -2278,8 +2268,11 @@ static inline int free_send_ctrl_req(nccl_net_ofi_rdma_req_t *req, rdma_req_send_ctrl_data_t *send_ctrl_data = get_send_ctrl_data(req); if (send_ctrl_data->ctrl_schedule != NULL) { - nccl_net_ofi_rdma_device_t *device = rdma_req_get_device(req); - nccl_net_ofi_release_schedule(device->scheduler, send_ctrl_data->ctrl_schedule); + nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base.base.ep; + assert(ep != NULL); + nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep); + assert(domain != NULL); + nccl_net_ofi_release_schedule(domain->scheduler, send_ctrl_data->ctrl_schedule); send_ctrl_data->ctrl_schedule = NULL; } @@ -2304,8 +2297,11 @@ static inline int free_send_close_req(nccl_net_ofi_rdma_req_t *req, rdma_req_send_close_data_t *send_close_data = req_get_send_close_data(req); if (send_close_data->ctrl_schedule) { - nccl_net_ofi_rdma_device_t *device = rdma_req_get_device(req); - nccl_net_ofi_release_schedule(device->scheduler, send_close_data->ctrl_schedule); + nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base.base.ep; + assert(ep != NULL); + nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep); + assert(domain != NULL); + nccl_net_ofi_release_schedule(domain->scheduler, send_close_data->ctrl_schedule); send_close_data->ctrl_schedule = NULL; } @@ -3269,8 +3265,10 @@ static inline int insert_send_ctrl_req( nccl_net_ofi_rdma_req_t *recv_req, bool recv_completion_optional) { - nccl_net_ofi_scheduler_t *scheduler = device->scheduler; nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base.base.ep; + nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep); + assert(domain != NULL); + nccl_net_ofi_scheduler_t *scheduler = domain->scheduler; nccl_net_ofi_rdma_req_t *send_ctrl_req = allocate_req(r_comm->nccl_ofi_reqs_fl); if (OFI_UNLIKELY(send_ctrl_req == NULL)) { NCCL_OFI_WARN("Unable to get NCCL OFI send control request for device %d", @@ -3524,7 +3522,7 @@ static int process_cq_if_pending(nccl_net_ofi_rdma_ep_t *ep) } static int recv(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers, - int *sizes, int *tags, nccl_net_ofi_mr_handle_t **mhandles, + size_t *sizes, int *tags, nccl_net_ofi_mr_handle_t **mhandles, nccl_net_ofi_req_t **base_req) { int ret = 0; @@ -5311,7 +5309,9 @@ static int alloc_rdma_send_req(nccl_net_ofi_rdma_send_comm_t *s_comm, { nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)s_comm->base.base.ep; nccl_net_ofi_rdma_device_t *device = rdma_endpoint_get_device(ep); - nccl_net_ofi_scheduler_t *scheduler = device->scheduler; + nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep); + assert(domain != NULL); + nccl_net_ofi_scheduler_t *scheduler = domain->scheduler; *ret_req = NULL; /* Allocate NCCL OFI request */ @@ -5854,7 +5854,7 @@ static inline int check_post_rx_buff_req(nccl_net_ofi_rdma_req_t *rx_buff_req) * @brief Send a message. This "interface function" is called, indirectly, from * the application */ -static int send(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int tag, +static int send(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int tag, nccl_net_ofi_mr_handle_t *mhandle, nccl_net_ofi_req_t **base_req) { int ret = 0; @@ -7397,6 +7397,13 @@ nccl_net_ofi_rdma_domain_free(nccl_net_ofi_domain_t *base_domain) } free(domain->domain_rails); + if (domain->scheduler) { + ret = domain->scheduler->fini(domain->scheduler); + if (ret != 0) { + NCCL_OFI_WARN("Cleanup of device failed, scheduler_fini returned %s", + strerror(-ret)); + } + } if (domain->ep_addr_list) { delete domain->ep_addr_list; domain->ep_addr_list = NULL; @@ -7497,6 +7504,13 @@ static nccl_net_ofi_domain_t *nccl_net_ofi_rdma_device_create_domain(nccl_net_of goto error; } + /* Create scheduler */ + ret = nccl_net_ofi_threshold_scheduler_init(domain->num_rails, &domain->scheduler); + if (ret != 0) { + goto error; + } + assert(domain->scheduler); + error: if (ret != 0) { domain->base.release(&(domain->base), false, false); @@ -7658,17 +7672,6 @@ nccl_net_ofi_rdma_device_release(nccl_net_ofi_device_t *base_device) free(device->device_rails); } - if (device->scheduler) { - ret = device->scheduler->fini(device->scheduler); - if (ret != 0) { - NCCL_OFI_WARN("Cleanup of device failed, scheduler_fini returned %s", - strerror(-ret)); - if (first_error == 0) { - first_error = ret; - } - } - } - if (device->comms) { free(device->comms); device->comms = NULL; @@ -7786,13 +7789,6 @@ static nccl_net_ofi_rdma_device_t *nccl_net_ofi_rdma_device_create( NCCL_OFI_INFO(NCCL_NET, "Created device with %d rails", length); } - /* Create scheduler */ - ret = nccl_net_ofi_threshold_scheduler_init(length, &device->scheduler); - if (ret != 0) { - goto error; - } - assert(device->scheduler); - /* Set NIC information */ device->num_rails = length; device->device_rails = create_device_rail_array(info_list, length); diff --git a/src/nccl_ofi_sendrecv.cpp b/src/nccl_ofi_sendrecv.cpp index 311f073f2d..9032af3fe8 100644 --- a/src/nccl_ofi_sendrecv.cpp +++ b/src/nccl_ofi_sendrecv.cpp @@ -1068,7 +1068,7 @@ static inline nccl_net_ofi_sendrecv_req_t *sendrecv_allocate_req(nccl_ofi_freeli } static int sendrecv_recv_comm_recv(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers, - int *sizes, int *tags, nccl_net_ofi_mr_handle_t **mhandles, + size_t *sizes, int *tags, nccl_net_ofi_mr_handle_t **mhandles, nccl_net_ofi_req_t **base_req) { int ret = 0; @@ -1846,7 +1846,7 @@ static int sendrecv_send_comm_dereg_mr(nccl_net_ofi_send_comm_t *send_comm, domain->base.mr_cache); } -static int sendrecv_send_comm_send(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int tag, +static int sendrecv_send_comm_send(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int tag, nccl_net_ofi_mr_handle_t *mhandle, nccl_net_ofi_req_t **base_req) { int ret = 0; diff --git a/topology/g5.48xl-topo.xml b/topology/g5.48xl-topo.xml index ff3586061e..930ce62022 100644 --- a/topology/g5.48xl-topo.xml +++ b/topology/g5.48xl-topo.xml @@ -21,5 +21,6 @@ virtual machine. +