diff --git a/.github/workflows/distcheck.yaml b/.github/workflows/distcheck.yaml
index 62f34ca83a..30e0da3f29 100644
--- a/.github/workflows/distcheck.yaml
+++ b/.github/workflows/distcheck.yaml
@@ -38,6 +38,10 @@ env:
   # environment variable is also now needed, as of july 2024.
   # ref: https://github.com/actions/runner/issues/2906#issuecomment-2208546951
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: 'true'
+  
+  # APT package list for caching
+  APT_CACHE_RESTORE_KEYS: |
+    apt-cache-ubuntu-${{ github.job }}
 
 concurrency:
   group: ${{ github.head_ref || github.run_id }}
@@ -190,19 +194,51 @@ jobs:
             sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 10
           fi
 
+      - name: Setup APT Cache for Distcheck
+        id: apt-cache-distcheck
+        uses: actions/cache@v3
+        with:
+          path: /var/cache/apt/archives
+          key: apt-cache-ubuntu-distcheck-${{ hashFiles('**/workflows/distcheck.yaml') }}
+          restore-keys: ${{ env.APT_CACHE_RESTORE_KEYS }}
+
       - name: Install Base Dependencies
         run: |
-          sudo apt-get update -y
+          # Download packages without installing if they're not in cache
+          sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
+          sudo apt-get install -y --download-only ${{ env.APT_PACKAGES }} || true
+          # Install from cache
           sudo apt-get install -y ${{ env.APT_PACKAGES }}
 
+      - name: Setup CUDA APT Cache
+        id: cuda-apt-cache-distcheck
+        if: matrix.sdk == 'cuda'
+        uses: actions/cache@v3
+        with:
+          path: /var/cache/apt/archives
+          key: cuda-apt-cache-distcheck-ubuntu-${{ hashFiles('**/workflows/distcheck.yaml') }}
+          restore-keys: |
+            cuda-apt-cache-ubuntu-
+      
       - name: Install CUDA SDK
         if: matrix.sdk == 'cuda'
         run: |
-          sudo apt-get update -y && sudo apt-get install -y wget lsb-release
+          # Install with retry logic
+          sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
+          sudo apt-get install -y wget lsb-release || (sleep 10 && sudo apt-get install -y wget lsb-release)
           repo="ubuntu$(lsb_release -r | cut -d':' -f2 | xargs | sed 's/[.]//g')"
-          wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb
+          for i in {1..3}; do
+            if wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb; then
+              break
+            fi
+            echo "Retrying wget in 10 seconds..."
+            sleep 10
+          done
           sudo dpkg -i cuda-keyring_1.1-1_all.deb
-          sudo apt-get update -y
+          sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
+          # Download packages without installing if not in cache
+          sudo apt-get install -y --download-only cuda-cudart-dev-12-6 cuda-crt-12-6 || true
+          # Install from cache
           sudo apt-get install -y cuda-cudart-dev-12-6 cuda-crt-12-6
 
       - name: Install lttng
@@ -281,24 +317,58 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: '3.9'
+          
+      - name: Setup APT Cache for CodeChecker
+        id: apt-cache-codechecker
+        uses: actions/cache@v3
+        with:
+          path: /var/cache/apt/archives
+          key: apt-cache-ubuntu-codechecker-${{ hashFiles('**/workflows/distcheck.yaml') }}
+          restore-keys: ${{ env.APT_CACHE_RESTORE_KEYS }}
 
       - name: Install Base Dependencies
         run: |
-          sudo apt-get update -y
+          # Download packages without installing if they're not in cache
+          sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
+          sudo apt-get install -y --download-only ${{ env.APT_PACKAGES }} || true
+          # Install from cache
           sudo apt-get install -y ${{ env.APT_PACKAGES }}
 
+      - name: Setup CUDA APT Cache
+        id: cuda-apt-cache-codechecker
+        if: matrix.sdk == 'cuda'
+        uses: actions/cache@v3
+        with:
+          path: /var/cache/apt/archives
+          key: cuda-apt-cache-codechecker-ubuntu-${{ hashFiles('**/workflows/distcheck.yaml') }}
+          restore-keys: |
+            cuda-apt-cache-ubuntu-
+      
       - name: Install CUDA SDK
         if: matrix.sdk == 'cuda'
         run: |
-          sudo apt-get update -y && sudo apt-get install -y wget lsb-release
+          # Install with retry logic
+          sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
+          sudo apt-get install -y wget lsb-release || (sleep 10 && sudo apt-get install -y wget lsb-release)
           repo="ubuntu$(lsb_release -r | cut -d':' -f2 | xargs | sed 's/[.]//g')"
-          wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb
+          for i in {1..3}; do
+            if wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb; then
+              break
+            fi
+            echo "Retrying wget in 10 seconds..."
+            sleep 10
+          done
           sudo dpkg -i cuda-keyring_1.1-1_all.deb
-          sudo apt-get update -y
+          sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
+          # Download packages without installing if not in cache
+          sudo apt-get install -y --download-only cuda-cudart-dev-12-6 cuda-crt-12-6 || true
+          # Install from cache
           sudo apt-get install -y cuda-cudart-dev-12-6 cuda-crt-12-6
 
       - name: Install cppcheck
         run: |
+          sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
+          sudo apt-get install -y --download-only cppcheck || true
           sudo apt-get install -y cppcheck
 
       - name: Fetch and Install EFA Installer Dependencies
diff --git a/.github/workflows/tag-makedist.yaml b/.github/workflows/tag-makedist.yaml
index 43dbf6c33c..c9430df4f3 100644
--- a/.github/workflows/tag-makedist.yaml
+++ b/.github/workflows/tag-makedist.yaml
@@ -6,6 +6,11 @@ on:
       - release/**
       - v**
 
+env:
+  # For caching APT packages
+  APT_CACHE_RESTORE_KEYS: |
+    apt-cache-ubuntu-makedist
+
 jobs:
   amazonlinux:
     strategy:
@@ -29,27 +34,77 @@ jobs:
     container: ${{ matrix.container }}
     name: make dist for tag ${{ github.ref_name }}
     steps:
+      - name: Setup YUM Cache
+        uses: actions/cache@v3
+        id: yum-cache
+        with:
+          path: /var/cache/yum
+          key: yum-cache-ubuntu-${{ hashFiles('**/workflows/tag-makedist.yaml') }}
+          restore-keys: |
+            yum-cache-ubuntu-
+      
       - run: |
-          yum -y update && yum -y install git tar util-linux findutils yum-utils
+          # Configure YUM to keep cache
+          echo 'keepcache=1' | tee -a /etc/yum.conf
+          # Add retry logic
+          for i in {1..3}; do
+            if yum -y update && yum -y install git tar util-linux findutils yum-utils; then
+              break
+            fi
+            echo "Retrying yum in 10 seconds..."
+            sleep 10
+          done
       - uses: actions/checkout@v4
       - name: Fetch and Install EFA Installer Dependencies
         run: |
-          curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${{ matrix.efainstaller }}.tar.gz
+          # Add retry logic for curl
+          for i in {1..3}; do
+            if curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${{ matrix.efainstaller }}.tar.gz; then
+              break
+            fi
+            echo "Retrying curl in 10 seconds..."
+            sleep 10
+          done
           tar -xf aws-efa-installer-*.tar.gz
+          # Install EFA RPMs from local directory to avoid network issues
           ( cd aws-efa-installer/RPMS/${{ matrix.efainstallerdir }}/x86_64 ; find . | grep rpm$ | xargs yum -y localinstall )
           rm -rf aws-efa-installer*
 
       - name: Install hwloc, utilities.
         run: |
-          yum -y install hwloc-devel autoconf automake libtool gcc gcc-c++ git make
+          # Add retry logic for package installation
+          for i in {1..3}; do
+            if yum -y install hwloc-devel autoconf automake libtool gcc gcc-c++ git make; then
+              break
+            fi
+            echo "Retrying yum in 10 seconds..."
+            sleep 10
+          done
 
       - name: Install CUDA
         run: |
-          ${{ matrix.configmanager }} --add-repo \
-             http://developer.download.nvidia.com/compute/cuda/repos/${{ matrix.nvidiadistro }}/x86_64/cuda-${{ matrix.nvidiadistro }}.repo \
-             --save
+          # Add retry logic for repo config
+          for i in {1..3}; do
+            if ${{ matrix.configmanager }} --add-repo \
+               http://developer.download.nvidia.com/compute/cuda/repos/${{ matrix.nvidiadistro }}/x86_64/cuda-${{ matrix.nvidiadistro }}.repo \
+               --save; then
+              break
+            fi
+            echo "Retrying repo add in 10 seconds..."
+            sleep 10
+          done
+          
+          # Clean and refresh cache
           yum -y clean expire-cache
-          yum -y install ${{ matrix.cudapackages }}
+          
+          # Add retry logic for package installation
+          for i in {1..3}; do
+            if yum -y install ${{ matrix.cudapackages }}; then
+              break
+            fi
+            echo "Retrying CUDA package installation in 10 seconds..."
+            sleep 10
+          done
 
       - name: Call `autoreconf -ivf`
         run: |
diff --git a/contrib/scripts/generate_debian_changelog.sh b/contrib/scripts/generate_debian_changelog.sh
index ea46069964..22049e3311 100755
--- a/contrib/scripts/generate_debian_changelog.sh
+++ b/contrib/scripts/generate_debian_changelog.sh
@@ -1,5 +1,15 @@
 #!/bin/bash
 
+fmt=" \
+tagname=%(refname:short) \
+tagger_name=%(taggername:mailmap) \
+tagger_email=%(taggeremail:mailmap) \
+tagger_when=%(taggerdate) \
+committer_name=%(committername:mailmap) \
+committer_email=%(committeremail:mailmap) \
+committer_when=%(committerdate) \
+"
+
 if [ $# -eq 0 ]; then
 	tag=HEAD
 else
@@ -12,8 +22,7 @@ if [ $? -ne 0 ]; then
 	exit 0
 fi
 
-git log --use-mailmap --no-walk --format="tagname='%D' tagger_name='%aN' tagger_email='<%aE>' tagger_when='%ad' committer_name='%cN' committer_email='<%cE>' committer_when='%cd'" \
-    --date="format:%a %b %-d %T %Y %z" $(git tag --merged "$tag") | sed "s/tagname='tag: \([^,']*\)'/tagname='\1'/" | {
+git for-each-ref --shell --sort=-v:refname --format "$fmt" --merged "$tag" | {
 	while read line; do
 		eval $line
 		(echo ${tagname} | grep -qE '^v[0-9]') || continue
diff --git a/include/nccl_ofi.h b/include/nccl_ofi.h
index ebdcd6f38a..b01759d0ed 100644
--- a/include/nccl_ofi.h
+++ b/include/nccl_ofi.h
@@ -558,7 +558,7 @@ struct nccl_net_ofi_send_comm {
 	 */
 	int (*deregMr)(nccl_net_ofi_send_comm_t *send_comm, nccl_net_ofi_mr_handle_t *mhandle);
 
-	int (*send)(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int tag,
+	int (*send)(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int tag,
 			     nccl_net_ofi_mr_handle_t *mhandle, nccl_net_ofi_req_t **req);
 
 	int (*close)(nccl_net_ofi_send_comm_t *send_comm);
@@ -591,7 +591,7 @@ struct nccl_net_ofi_recv_comm {
 	 */
 	int (*deregMr)(nccl_net_ofi_recv_comm_t *recv_comm, nccl_net_ofi_mr_handle_t *mhandle);
 
-	int (*recv)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, int *sizes, int *tags,
+	int (*recv)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, size_t *sizes, int *tags,
 			     nccl_net_ofi_mr_handle_t **mhandles, nccl_net_ofi_req_t **req);
 
 	int (*flush)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, int *sizes,
diff --git a/include/nccl_ofi_rdma.h b/include/nccl_ofi_rdma.h
index 98690c8374..536c87ca7b 100644
--- a/include/nccl_ofi_rdma.h
+++ b/include/nccl_ofi_rdma.h
@@ -802,9 +802,6 @@ typedef struct nccl_net_ofi_rdma_device {
 	 * and its base struct. */
 	nccl_net_ofi_device_t base;
 
-	/* Message scheduler */
-	nccl_net_ofi_scheduler_t *scheduler;
-
 	/* Number of rails */
 	uint16_t num_rails;
 
@@ -850,6 +847,9 @@ typedef struct nccl_net_ofi_rdma_domain {
 
 	/* List of endpoints and set of addresses they have connections to */
 	nccl_ofi_ep_addr_list_t *ep_addr_list;
+
+	/* Message scheduler */
+	nccl_net_ofi_scheduler_t *scheduler;
 } nccl_net_ofi_rdma_domain_t;
 
 
diff --git a/include/tracing_impl/lttng.h b/include/tracing_impl/lttng.h
index 893e2b060c..e026fe9d1c 100644
--- a/include/tracing_impl/lttng.h
+++ b/include/tracing_impl/lttng.h
@@ -52,7 +52,7 @@ LTTNG_UST_TRACEPOINT_EVENT(
     Send,
     LTTNG_UST_TP_ARGS(
             int, dev,
-            int, size,
+            size_t, size,
             void *, comm,
             uint16_t, msg_seq_num,
             void *, request,
@@ -60,7 +60,7 @@ LTTNG_UST_TRACEPOINT_EVENT(
     ),
     LTTNG_UST_TP_FIELDS(
             lttng_ust_field_integer(int, dev, dev)
-            lttng_ust_field_integer(int, size, size)
+            lttng_ust_field_integer(size_t, size, size)
             lttng_ust_field_integer_hex(uint64_t, comm, (uint64_t)comm)
             lttng_ust_field_integer(uint16_t, msg_seq_num, msg_seq_num)
             lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request)
@@ -238,14 +238,14 @@ LTTNG_UST_TRACEPOINT_EVENT(
     LTTNG_UST_TP_ARGS(
             int, dev,
             void *, comm,
-            int, size,
+            size_t, size,
             void *, request,
             void *, nccl_req
     ),
     LTTNG_UST_TP_FIELDS(
             lttng_ust_field_integer(int, dev, dev)
             lttng_ust_field_integer_hex(uint64_t, comm, (uint64_t)comm)
-            lttng_ust_field_integer(int, size, size)
+            lttng_ust_field_integer(size_t, size, size)
             lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request)
             lttng_ust_field_integer_hex(uint64_t, nccl_req, (uint64_t)nccl_req)
     )
diff --git a/src/nccl_ofi_api.cpp b/src/nccl_ofi_api.cpp
index eaf1393152..9c459fb5a1 100644
--- a/src/nccl_ofi_api.cpp
+++ b/src/nccl_ofi_api.cpp
@@ -575,13 +575,19 @@ ncclResult_t nccl_net_ofi_isend_v2(void* sendComm, void* data, int size,
 }
 
 
-ncclResult_t nccl_net_ofi_isend_v5(void *sComm, void* data, int size,
-				   int tag, void *mhandle, void** req)
+ncclResult_t nccl_net_ofi_isend_v5(void *sendComm, void* data, int size,
+				   int tag, void *mhandle, void** request)
+{
+	return nccl_net_ofi_isend_v9(sendComm, data, static_cast<size_t>(size), tag, mhandle, request);
+}
+
+ncclResult_t nccl_net_ofi_isend_v9(void* sendComm, void* data, size_t size,
+				   int tag, void* mhandle, void** request)
 {
 	nccl_net_ofi_send_comm_t *send_comm =
-		(nccl_net_ofi_send_comm_t *)sComm;
+		(nccl_net_ofi_send_comm_t *)sendComm;
 	nccl_net_ofi_mr_handle_t *handle = (nccl_net_ofi_mr_handle_t *)mhandle;
-	nccl_net_ofi_req_t **base_req = (nccl_net_ofi_req_t **)req;
+	nccl_net_ofi_req_t **base_req = (nccl_net_ofi_req_t **)request;
 
 	/* Validate send_comm */
 	if (OFI_UNLIKELY(send_comm == NULL)) {
@@ -604,35 +610,46 @@ ncclResult_t nccl_net_ofi_isend_v5(void *sComm, void* data, int size,
 	return nccl_net_ofi_retval_translate(ret);
 }
 
-
-ncclResult_t nccl_net_ofi_isend_v9(void* sendComm, void* data, size_t size,
-				   int tag, void* mhandle, void** request)
+ncclResult_t nccl_net_ofi_irecv_v2(void* recvComm, void* data, int size,
+				   void* mhandle, void** request)
 {
-	ncclResult_t validation_result = msg_length_verify_max_size(&size, 1);
-	if (validation_result != ncclSuccess) {
-		return check_return(validation_result);
-	}
+	int tag = 0;
 
-	return nccl_net_ofi_isend_v5(sendComm, data, (int)size, tag, mhandle, request);
+	return nccl_net_ofi_irecv_v5(recvComm, 1, &data, &size, &tag, &mhandle, request);
 }
 
 
-ncclResult_t nccl_net_ofi_irecv_v2(void* recvComm, void* data, int size,
-				   void* mhandle, void** request)
+ncclResult_t nccl_net_ofi_irecv_v5(void* recvComm, int n, void** data, int* sizes,
+				   int *tags, void** mhandles, void** request)
 {
-	int tag = 0;
+	size_t castedSizes[NCCL_OFI_MAX_RECVS] = {0};
+	for (int i = 0; i < n; i++) {
+		castedSizes[i] = static_cast<size_t>(sizes[i]);
+	}
 
-	return nccl_net_ofi_irecv_v5(recvComm, 1, &data, &size, &tag, &mhandle, request);
+	return nccl_net_ofi_irecv_v9(recvComm, n, data, castedSizes, tags, mhandles, request);
 }
 
 
-ncclResult_t nccl_net_ofi_irecv_v5(void* rComm, int n, void** buffers, int* sizes,
-				   int *tags, void** mhandles, void** req)
+ncclResult_t nccl_net_ofi_irecv_v9(void* recvComm, int n, void** data,
+				   size_t* sizes, int* tags, void** mhandles, void** request)
 {
+	if (OFI_UNLIKELY(recvComm == NULL || data == NULL ||
+				  sizes == NULL || tags == NULL ||
+				  mhandles == NULL || request == NULL)) {
+		NCCL_OFI_WARN("Invalid argument: NULL pointer detected");
+		return check_return(ncclInvalidArgument);
+	}
+
+	if (OFI_UNLIKELY(n <= 0 || n > NCCL_OFI_MAX_RECVS)) {
+		NCCL_OFI_WARN("Invalid number of receives: %d (max: %d)", n, NCCL_OFI_MAX_RECVS);
+		return check_return(ncclInvalidArgument);
+	}
+
 	nccl_net_ofi_recv_comm_t *recv_comm =
-		(nccl_net_ofi_recv_comm_t *)rComm;
+		(nccl_net_ofi_recv_comm_t *)recvComm;
 	nccl_net_ofi_mr_handle_t **handles = (nccl_net_ofi_mr_handle_t **)mhandles;
-	nccl_net_ofi_req_t **base_req = (nccl_net_ofi_req_t **)req;
+	nccl_net_ofi_req_t **base_req = (nccl_net_ofi_req_t **)request;
 
 	if (OFI_UNLIKELY(recv_comm == NULL)) {
 		NCCL_OFI_WARN("Invalid communicator object provided");
@@ -661,40 +678,10 @@ ncclResult_t nccl_net_ofi_irecv_v5(void* rComm, int n, void** buffers, int* size
 		return check_return(ncclInternalError);
 	}
 
-	int ret = recv_comm->recv(recv_comm, n, buffers, sizes, tags, handles, base_req);
+	int ret = recv_comm->recv(recv_comm, n, data, sizes, tags, handles, base_req);
 	return nccl_net_ofi_retval_translate(ret);
 }
 
-
-ncclResult_t nccl_net_ofi_irecv_v9(void* recvComm, int n, void** data,
-				   size_t* sizes, int* tags, void** mhandles, void** request)
-{
-	if (OFI_UNLIKELY(recvComm == NULL || data == NULL ||
-					sizes == NULL || tags == NULL ||
-					mhandles == NULL || request == NULL)) {
-		NCCL_OFI_WARN("Invalid argument: NULL pointer detected");
-		return check_return(ncclInvalidArgument);
-	}
-
-	if (OFI_UNLIKELY(n <= 0 || n > NCCL_OFI_MAX_RECVS)) {
-		NCCL_OFI_WARN("Invalid number of receives: %d (max: %d)", n, NCCL_OFI_MAX_RECVS);
-		return check_return(ncclInvalidArgument);
-	}
-
-	ncclResult_t validation_result = msg_length_verify_max_size(sizes, n);
-	if (validation_result != ncclSuccess) {
-		return check_return(validation_result);
-	}
-
-	int sizesInt[NCCL_OFI_MAX_RECVS] = {0};
-	for (int i = 0; i < n; i++) {
-		sizesInt[i] = (int)sizes[i];
-	}
-
-	return nccl_net_ofi_irecv_v5(recvComm, n, data, sizesInt, tags, mhandles, request);
-}
-
-
 ncclResult_t nccl_net_ofi_test_v2(void* req, int* done, int* size)
 {
 	/* Validate request */
diff --git a/src/nccl_ofi_rdma.cpp b/src/nccl_ofi_rdma.cpp
index a9a5ae8e06..2b222acc9b 100644
--- a/src/nccl_ofi_rdma.cpp
+++ b/src/nccl_ofi_rdma.cpp
@@ -159,21 +159,6 @@ static nccl_net_ofi_rdma_plugin_t *rdma_device_get_plugin(nccl_net_ofi_rdma_devi
 	return (nccl_net_ofi_rdma_plugin_t*)device->base.plugin;
 }
 
-
-static nccl_net_ofi_rdma_ep_t *rdma_req_get_ep(nccl_net_ofi_rdma_req_t *req)
-{
-	/* TODO: this function doesn't work for rx buffers, which have no
-	   associated comm */
-	assert(req->comm);
-	return (nccl_net_ofi_rdma_ep_t *)req->comm->ep;
-}
-
-
-static nccl_net_ofi_rdma_device_t *rdma_req_get_device(nccl_net_ofi_rdma_req_t *req)
-{
-	return (nccl_net_ofi_rdma_device_t *)rdma_req_get_ep(req)->base.domain->device;
-}
-
 /*
  * @brief	Get endpoint communicator with given ID
  */
@@ -905,7 +890,9 @@ static inline int update_send_data_from_remote(nccl_net_ofi_rdma_send_comm_t *s_
 	assert(ep != NULL);
 
 	nccl_net_ofi_rdma_device_t *device = rdma_endpoint_get_device(ep);
-	nccl_net_ofi_scheduler_t *scheduler = device->scheduler;
+	nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
+	assert(domain != NULL);
+	nccl_net_ofi_scheduler_t *scheduler = domain->scheduler;
 
 	rdma_req_send_data_t *send_data = get_send_data(req);
 	rdma_req_rx_buff_data_t *rx_buff_data = get_rx_buff_data(rx_buff_req);
@@ -2200,8 +2187,11 @@ static inline int free_send_req(nccl_net_ofi_rdma_req_t *req,
 	}
 
 	if (send_data->schedule) {
-		nccl_net_ofi_rdma_device_t *device = rdma_req_get_device(req);
-		nccl_net_ofi_release_schedule(device->scheduler, send_data->schedule);
+		nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)s_comm->base.base.ep;
+		assert(ep != NULL);
+		nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
+		assert(domain != NULL);
+		nccl_net_ofi_release_schedule(domain->scheduler, send_data->schedule);
 		send_data->schedule = NULL;
 	}
 
@@ -2278,8 +2268,11 @@ static inline int free_send_ctrl_req(nccl_net_ofi_rdma_req_t *req,
 	rdma_req_send_ctrl_data_t *send_ctrl_data = get_send_ctrl_data(req);
 
 	if (send_ctrl_data->ctrl_schedule != NULL) {
-		nccl_net_ofi_rdma_device_t *device = rdma_req_get_device(req);
-		nccl_net_ofi_release_schedule(device->scheduler, send_ctrl_data->ctrl_schedule);
+		nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base.base.ep;
+		assert(ep != NULL);
+		nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
+		assert(domain != NULL);
+		nccl_net_ofi_release_schedule(domain->scheduler, send_ctrl_data->ctrl_schedule);
 		send_ctrl_data->ctrl_schedule = NULL;
 	}
 
@@ -2304,8 +2297,11 @@ static inline int free_send_close_req(nccl_net_ofi_rdma_req_t *req,
 	rdma_req_send_close_data_t *send_close_data = req_get_send_close_data(req);
 
 	if (send_close_data->ctrl_schedule) {
-		nccl_net_ofi_rdma_device_t *device = rdma_req_get_device(req);
-		nccl_net_ofi_release_schedule(device->scheduler, send_close_data->ctrl_schedule);
+		nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base.base.ep;
+		assert(ep != NULL);
+		nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
+		assert(domain != NULL);
+		nccl_net_ofi_release_schedule(domain->scheduler, send_close_data->ctrl_schedule);
 		send_close_data->ctrl_schedule = NULL;
 	}
 
@@ -3269,8 +3265,10 @@ static inline int insert_send_ctrl_req(
 				nccl_net_ofi_rdma_req_t *recv_req,
 				bool recv_completion_optional)
 {
-	nccl_net_ofi_scheduler_t *scheduler = device->scheduler;
 	nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base.base.ep;
+	nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
+	assert(domain != NULL);
+	nccl_net_ofi_scheduler_t *scheduler = domain->scheduler;
 	nccl_net_ofi_rdma_req_t *send_ctrl_req = allocate_req(r_comm->nccl_ofi_reqs_fl);
 	if (OFI_UNLIKELY(send_ctrl_req == NULL)) {
 		NCCL_OFI_WARN("Unable to get NCCL OFI send control request for device %d",
@@ -3524,7 +3522,7 @@ static int process_cq_if_pending(nccl_net_ofi_rdma_ep_t *ep)
 }
 
 static int recv(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers,
-			 int *sizes, int *tags, nccl_net_ofi_mr_handle_t **mhandles,
+			 size_t *sizes, int *tags, nccl_net_ofi_mr_handle_t **mhandles,
 			 nccl_net_ofi_req_t **base_req)
 {
 	int ret = 0;
@@ -5311,7 +5309,9 @@ static int alloc_rdma_send_req(nccl_net_ofi_rdma_send_comm_t *s_comm,
 {
 	nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)s_comm->base.base.ep;
 	nccl_net_ofi_rdma_device_t *device = rdma_endpoint_get_device(ep);
-	nccl_net_ofi_scheduler_t *scheduler = device->scheduler;
+	nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
+	assert(domain != NULL);
+	nccl_net_ofi_scheduler_t *scheduler = domain->scheduler;
 	*ret_req = NULL;
 
 	/* Allocate NCCL OFI request */
@@ -5854,7 +5854,7 @@ static inline int check_post_rx_buff_req(nccl_net_ofi_rdma_req_t *rx_buff_req)
  * @brief	Send a message. This "interface function" is called, indirectly, from
  *       	the application
  */
-static int send(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int tag,
+static int send(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int tag,
 			 nccl_net_ofi_mr_handle_t *mhandle, nccl_net_ofi_req_t **base_req)
 {
 	int ret = 0;
@@ -7397,6 +7397,13 @@ nccl_net_ofi_rdma_domain_free(nccl_net_ofi_domain_t *base_domain)
 	}
 	free(domain->domain_rails);
 
+	if (domain->scheduler) {
+		ret = domain->scheduler->fini(domain->scheduler);
+		if (ret != 0) {
+			NCCL_OFI_WARN("Cleanup of device failed, scheduler_fini returned %s",
+					strerror(-ret));
+		}
+	}
 	if (domain->ep_addr_list) {
 		delete domain->ep_addr_list;
 		domain->ep_addr_list = NULL;
@@ -7497,6 +7504,13 @@ static nccl_net_ofi_domain_t *nccl_net_ofi_rdma_device_create_domain(nccl_net_of
 		goto error;
 	}
 
+	/* Create scheduler */
+	ret = nccl_net_ofi_threshold_scheduler_init(domain->num_rails, &domain->scheduler);
+	if (ret != 0) {
+		goto error;
+	}
+	assert(domain->scheduler);
+
 error:
 	if (ret != 0) {
 		domain->base.release(&(domain->base), false, false);
@@ -7658,17 +7672,6 @@ nccl_net_ofi_rdma_device_release(nccl_net_ofi_device_t *base_device)
 		free(device->device_rails);
 	}
 
-	if (device->scheduler) {
-		ret = device->scheduler->fini(device->scheduler);
-		if (ret != 0) {
-			NCCL_OFI_WARN("Cleanup of device failed, scheduler_fini returned %s",
-				      strerror(-ret));
-			if (first_error == 0) {
-				first_error = ret;
-			}
-		}
-	}
-
 	if (device->comms) {
 		free(device->comms);
 		device->comms = NULL;
@@ -7786,13 +7789,6 @@ static nccl_net_ofi_rdma_device_t *nccl_net_ofi_rdma_device_create(
 		NCCL_OFI_INFO(NCCL_NET, "Created device with %d rails", length);
 	}
 
-	/* Create scheduler */
-	ret = nccl_net_ofi_threshold_scheduler_init(length, &device->scheduler);
-	if (ret != 0) {
-		goto error;
-	}
-	assert(device->scheduler);
-
 	/* Set NIC information */
 	device->num_rails = length;
 	device->device_rails = create_device_rail_array(info_list, length);
diff --git a/src/nccl_ofi_sendrecv.cpp b/src/nccl_ofi_sendrecv.cpp
index 311f073f2d..9032af3fe8 100644
--- a/src/nccl_ofi_sendrecv.cpp
+++ b/src/nccl_ofi_sendrecv.cpp
@@ -1068,7 +1068,7 @@ static inline nccl_net_ofi_sendrecv_req_t *sendrecv_allocate_req(nccl_ofi_freeli
 }
 
 static int sendrecv_recv_comm_recv(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers,
-				   int *sizes, int *tags, nccl_net_ofi_mr_handle_t **mhandles,
+				   size_t *sizes, int *tags, nccl_net_ofi_mr_handle_t **mhandles,
 				   nccl_net_ofi_req_t **base_req)
 {
 	int ret = 0;
@@ -1846,7 +1846,7 @@ static int sendrecv_send_comm_dereg_mr(nccl_net_ofi_send_comm_t *send_comm,
 				  domain->base.mr_cache);
 }
 
-static int sendrecv_send_comm_send(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int tag,
+static int sendrecv_send_comm_send(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int tag,
 				   nccl_net_ofi_mr_handle_t *mhandle, nccl_net_ofi_req_t **base_req)
 {
 	int ret = 0;
diff --git a/topology/g5.48xl-topo.xml b/topology/g5.48xl-topo.xml
index ff3586061e..930ce62022 100644
--- a/topology/g5.48xl-topo.xml
+++ b/topology/g5.48xl-topo.xml
@@ -21,5 +21,6 @@ virtual machine.
     <pci busid="0000:00:1b.0"/>
     <pci busid="0000:00:1c.0"/>
     <pci busid="0000:00:1d.0"/>
+    <pci busid="0000:00:15.0"/>
   </cpu>
 </system>