Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 78 additions & 8 deletions .github/workflows/distcheck.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ env:
# environment variable is also now needed, as of July 2024.
# ref: https://github.com/actions/runner/issues/2906#issuecomment-2208546951
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: 'true'

# APT package list for caching
APT_CACHE_RESTORE_KEYS: |
apt-cache-ubuntu-${{ github.job }}

concurrency:
group: ${{ github.head_ref || github.run_id }}
Expand Down Expand Up @@ -190,19 +194,51 @@ jobs:
sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 10
fi

# Cache apt's downloaded .deb archives between runs of the distcheck job.
# NOTE(review): /var/cache/apt/archives is normally root-owned and the install
# steps need sudo; confirm the cache action can read and write this path.
- id: apt-cache-distcheck
  name: Setup APT Cache for Distcheck
  uses: actions/cache@v3
  with:
    # Directory the apt-get steps in this job download package archives into.
    path: /var/cache/apt/archives
    # Exact-match key; changes whenever the workflow file's hash changes.
    key: apt-cache-ubuntu-distcheck-${{ hashFiles('**/workflows/distcheck.yaml') }}
    # Fallback key prefixes are taken from the workflow-level env block.
    restore-keys: ${{ env.APT_CACHE_RESTORE_KEYS }}

# Install the base package set listed in the APT_PACKAGES env variable.
- name: Install Base Dependencies
  run: |
    # Refresh package lists with back-off (10 s, then 30 s). This must be the
    # first apt-get call in the step: the step shell runs with -e, so an
    # unguarded "apt-get update" ahead of it would abort the step on a
    # transient mirror error before any retry could happen.
    sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
    # Best-effort pre-fetch of archives that are not already in the cache
    # directory; errors are ignored because the install below is authoritative.
    sudo apt-get install -y --download-only ${{ env.APT_PACKAGES }} || true
    # Install, reusing whatever archives are already present locally.
    sudo apt-get install -y ${{ env.APT_PACKAGES }}

# Cache apt archives for the CUDA packages (CUDA matrix entries only).
# NOTE(review): this uses the same directory as the base APT cache step, so the
# saved archive also contains whatever that step restored or downloaded.
- name: Setup CUDA APT Cache
  id: cuda-apt-cache-distcheck
  if: matrix.sdk == 'cuda'
  uses: actions/cache@v3
  with:
    path: /var/cache/apt/archives
    key: cuda-apt-cache-distcheck-ubuntu-${{ hashFiles('**/workflows/distcheck.yaml') }}
    # restore-keys are matched as prefixes of saved keys, so the fallback has
    # to be a prefix of `key` above. The previous value
    # ("cuda-apt-cache-ubuntu-") could never match a cache saved by this step.
    restore-keys: |
      cuda-apt-cache-distcheck-ubuntu-

# Install the CUDA runtime development packages from NVIDIA's apt repository.
- name: Install CUDA SDK
  if: matrix.sdk == 'cuda'
  run: |
    # Install with retry logic. Every network command below is either retried
    # or fails the step explicitly; nothing unguarded runs ahead of its retry.
    sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
    sudo apt-get install -y wget lsb-release || (sleep 10 && sudo apt-get install -y wget lsb-release)
    # Build NVIDIA's repo directory name from the release number with the
    # dots removed (e.g. 22.04 gives "ubuntu2204").
    repo="ubuntu$(lsb_release -r | cut -d':' -f2 | xargs | sed 's/[.]//g')"
    keyring=cuda-keyring_1.1-1_all.deb
    # Plain word list instead of a brace range so the loop also iterates
    # three times under a POSIX sh.
    for attempt in 1 2 3; do
      # -O overwrites the target on every attempt. Without it a retry after a
      # partial download is saved as "<name>.1" and dpkg would install the
      # truncated first file.
      if wget -O "${keyring}" "https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/${keyring}"; then
        break
      fi
      if [ "${attempt}" -eq 3 ]; then
        echo "Failed to download ${keyring} after ${attempt} attempts" >&2
        exit 1
      fi
      echo "Retrying wget in 10 seconds..."
      sleep 10
    done
    sudo dpkg -i "${keyring}"
    # Pick up the package lists of the newly registered CUDA repository.
    sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
    # Download packages without installing if not in cache (best effort).
    sudo apt-get install -y --download-only cuda-cudart-dev-12-6 cuda-crt-12-6 || true
    # Install from cache
    sudo apt-get install -y cuda-cudart-dev-12-6 cuda-crt-12-6

- name: Install lttng
Expand Down Expand Up @@ -281,24 +317,58 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: '3.9'

# Cache apt's downloaded .deb archives between runs of the CodeChecker job.
# NOTE(review): /var/cache/apt/archives is normally root-owned and the install
# steps need sudo; confirm the cache action can read and write this path.
- id: apt-cache-codechecker
  name: Setup APT Cache for CodeChecker
  uses: actions/cache@v3
  with:
    # Directory the apt-get steps in this job download package archives into.
    path: /var/cache/apt/archives
    # Exact-match key; changes whenever the workflow file's hash changes.
    key: apt-cache-ubuntu-codechecker-${{ hashFiles('**/workflows/distcheck.yaml') }}
    # Fallback key prefixes are taken from the workflow-level env block.
    restore-keys: ${{ env.APT_CACHE_RESTORE_KEYS }}

# Install the base package set listed in the APT_PACKAGES env variable.
- name: Install Base Dependencies
  run: |
    # Refresh package lists with back-off (10 s, then 30 s). This must be the
    # first apt-get call in the step: the step shell runs with -e, so an
    # unguarded "apt-get update" ahead of it would abort the step on a
    # transient mirror error before any retry could happen.
    sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
    # Best-effort pre-fetch of archives that are not already in the cache
    # directory; errors are ignored because the install below is authoritative.
    sudo apt-get install -y --download-only ${{ env.APT_PACKAGES }} || true
    # Install, reusing whatever archives are already present locally.
    sudo apt-get install -y ${{ env.APT_PACKAGES }}

# Cache apt archives for the CUDA packages (CUDA matrix entries only).
# NOTE(review): this uses the same directory as the base APT cache step, so the
# saved archive also contains whatever that step restored or downloaded.
- name: Setup CUDA APT Cache
  id: cuda-apt-cache-codechecker
  if: matrix.sdk == 'cuda'
  uses: actions/cache@v3
  with:
    path: /var/cache/apt/archives
    key: cuda-apt-cache-codechecker-ubuntu-${{ hashFiles('**/workflows/distcheck.yaml') }}
    # restore-keys are matched as prefixes of saved keys, so the fallback has
    # to be a prefix of `key` above. The previous value
    # ("cuda-apt-cache-ubuntu-") could never match a cache saved by this step.
    restore-keys: |
      cuda-apt-cache-codechecker-ubuntu-

# Install the CUDA runtime development packages from NVIDIA's apt repository.
- name: Install CUDA SDK
  if: matrix.sdk == 'cuda'
  run: |
    # Install with retry logic. Every network command below is either retried
    # or fails the step explicitly; nothing unguarded runs ahead of its retry.
    sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
    sudo apt-get install -y wget lsb-release || (sleep 10 && sudo apt-get install -y wget lsb-release)
    # Build NVIDIA's repo directory name from the release number with the
    # dots removed (e.g. 22.04 gives "ubuntu2204").
    repo="ubuntu$(lsb_release -r | cut -d':' -f2 | xargs | sed 's/[.]//g')"
    keyring=cuda-keyring_1.1-1_all.deb
    # Plain word list instead of a brace range so the loop also iterates
    # three times under a POSIX sh.
    for attempt in 1 2 3; do
      # -O overwrites the target on every attempt. Without it a retry after a
      # partial download is saved as "<name>.1" and dpkg would install the
      # truncated first file.
      if wget -O "${keyring}" "https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/${keyring}"; then
        break
      fi
      if [ "${attempt}" -eq 3 ]; then
        echo "Failed to download ${keyring} after ${attempt} attempts" >&2
        exit 1
      fi
      echo "Retrying wget in 10 seconds..."
      sleep 10
    done
    sudo dpkg -i "${keyring}"
    # Pick up the package lists of the newly registered CUDA repository.
    sudo apt-get update -y || (sleep 10 && sudo apt-get update -y) || (sleep 30 && sudo apt-get update -y)
    # Download packages without installing if not in cache (best effort).
    sudo apt-get install -y --download-only cuda-cudart-dev-12-6 cuda-crt-12-6 || true
    # Install from cache
    sudo apt-get install -y cuda-cudart-dev-12-6 cuda-crt-12-6

# Install the cppcheck static analyser from the distribution repositories.
- name: Install cppcheck
  run: |
    # Refresh package lists. On failure wait 10 s and try again, then wait
    # 30 s and try a final time; a third failure fails the step.
    if ! sudo apt-get update -y; then
      sleep 10
      if ! sudo apt-get update -y; then
        sleep 30
        sudo apt-get update -y
      fi
    fi
    # Best-effort pre-fetch; any error here is ignored.
    sudo apt-get install -y --download-only cppcheck || true
    # Authoritative install.
    sudo apt-get install -y cppcheck

- name: Fetch and Install EFA Installer Dependencies
Expand Down
69 changes: 62 additions & 7 deletions .github/workflows/tag-makedist.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ on:
- release/**
- v**

# Workflow-level environment shared by every job in this workflow.
env:
# For caching APT packages
# Fallback key prefix intended for an actions/cache `restore-keys` input.
# NOTE(review): the steps visible in this workflow install with yum and give
# their restore-keys inline; confirm this variable is referenced somewhere,
# otherwise it is dead configuration.
APT_CACHE_RESTORE_KEYS: |
apt-cache-ubuntu-makedist

jobs:
amazonlinux:
strategy:
Expand All @@ -29,27 +34,77 @@ jobs:
container: ${{ matrix.container }}
name: make dist for tag ${{ github.ref_name }}
steps:
# Cache yum's downloaded packages between runs, one cache line per container.
# NOTE(review): this step runs before the step that installs tar; confirm the
# container image already provides what actions/cache needs to unpack a hit.
- name: Setup YUM Cache
  uses: actions/cache@v3
  id: yum-cache
  with:
    path: /var/cache/yum
    # This step runs before actions/checkout, so hashFiles() on a workspace
    # path has nothing to hash and the old key was the same for every run and
    # every matrix entry. Use values that exist without a checkout instead:
    # the container image keeps distros apart and the commit SHA makes each
    # run save a fresh cache.
    key: yum-cache-${{ matrix.container }}-${{ github.sha }}
    # Fall back to the most recent cache saved for the same container image.
    restore-keys: |
      yum-cache-${{ matrix.container }}-

# Bootstrap step: install the tools later steps (including checkout) rely on.
- run: |
    # Configure YUM to keep cache. This has to happen before the first yum
    # call, otherwise the packages it downloads are not retained.
    echo 'keepcache=1' | tee -a /etc/yum.conf
    # Add retry logic. Plain word list instead of a brace range so the loop
    # also iterates three times under a POSIX sh.
    for attempt in 1 2 3; do
      if yum -y update && yum -y install git tar util-linux findutils yum-utils; then
        break
      fi
      # Fail the step once retries are exhausted. Without this the loop ends
      # on a successful "sleep" and the step is reported as passed.
      if [ "${attempt}" -eq 3 ]; then
        echo "yum update/install failed after ${attempt} attempts" >&2
        exit 1
      fi
      echo "Retrying yum in 10 seconds..."
      sleep 10
    done
- uses: actions/checkout@v4
# Download the EFA installer bundle and install the RPMs it ships.
- name: Fetch and Install EFA Installer Dependencies
  run: |
    efa_tarball=aws-efa-installer-${{ matrix.efainstaller }}.tar.gz
    # Add retry logic for curl
    for attempt in 1 2 3; do
      # --fail turns HTTP error responses into a non-zero exit status, so they
      # are retried instead of an error page being saved as the tarball.
      if curl --fail -O "https://efa-installer.amazonaws.com/${efa_tarball}"; then
        break
      fi
      # Fail the step once retries are exhausted rather than falling through
      # to tar with a missing or incomplete download.
      if [ "${attempt}" -eq 3 ]; then
        echo "Failed to download ${efa_tarball} after ${attempt} attempts" >&2
        exit 1
      fi
      echo "Retrying curl in 10 seconds..."
      sleep 10
    done
    tar -xf "${efa_tarball}"
    # Install EFA RPMs from local directory to avoid network issues
    ( cd aws-efa-installer/RPMS/${{ matrix.efainstallerdir }}/x86_64 ; find . | grep rpm$ | xargs yum -y localinstall )
    # Remove the tarball and the unpacked tree once the RPMs are installed.
    rm -rf aws-efa-installer*

# Install hwloc headers and the autotools/compiler toolchain.
- name: Install hwloc, utilities.
  run: |
    # Add retry logic for package installation. Plain word list instead of a
    # brace range so the loop also iterates three times under a POSIX sh.
    for attempt in 1 2 3; do
      if yum -y install hwloc-devel autoconf automake libtool gcc gcc-c++ git make; then
        break
      fi
      # Fail the step once retries are exhausted. Without this the loop ends
      # on a successful "sleep" and the step is reported as passed.
      if [ "${attempt}" -eq 3 ]; then
        echo "yum install failed after ${attempt} attempts" >&2
        exit 1
      fi
      echo "Retrying yum in 10 seconds..."
      sleep 10
    done

# Register NVIDIA's CUDA yum repository and install the matrix's CUDA packages.
- name: Install CUDA
  run: |
    # Add retry logic for repo config. The repo file is fetched over HTTPS so
    # the repository definition cannot be altered in transit.
    for attempt in 1 2 3; do
      if ${{ matrix.configmanager }} --add-repo \
        https://developer.download.nvidia.com/compute/cuda/repos/${{ matrix.nvidiadistro }}/x86_64/cuda-${{ matrix.nvidiadistro }}.repo \
        --save; then
        break
      fi
      # Fail the step once retries are exhausted rather than carrying on
      # without the repository.
      if [ "${attempt}" -eq 3 ]; then
        echo "Adding the CUDA repository failed after ${attempt} attempts" >&2
        exit 1
      fi
      echo "Retrying repo add in 10 seconds..."
      sleep 10
    done

    # Clean and refresh cache
    yum -y clean expire-cache

    # Add retry logic for package installation
    for attempt in 1 2 3; do
      if yum -y install ${{ matrix.cudapackages }}; then
        break
      fi
      # Without this check the loop ends on a successful "sleep" and the
      # step is reported as passed although nothing was installed.
      if [ "${attempt}" -eq 3 ]; then
        echo "CUDA package installation failed after ${attempt} attempts" >&2
        exit 1
      fi
      echo "Retrying CUDA package installation in 10 seconds..."
      sleep 10
    done

- name: Call `autoreconf -ivf`
run: |
Expand Down
13 changes: 11 additions & 2 deletions contrib/scripts/generate_debian_changelog.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
#!/bin/bash

fmt=" \
tagname=%(refname:short) \
tagger_name=%(taggername:mailmap) \
tagger_email=%(taggeremail:mailmap) \
tagger_when=%(taggerdate) \
committer_name=%(committername:mailmap) \
committer_email=%(committeremail:mailmap) \
committer_when=%(committerdate) \
"

if [ $# -eq 0 ]; then
tag=HEAD
else
Expand All @@ -12,8 +22,7 @@ if [ $? -ne 0 ]; then
exit 0
fi

git log --use-mailmap --no-walk --format="tagname='%D' tagger_name='%aN' tagger_email='<%aE>' tagger_when='%ad' committer_name='%cN' committer_email='<%cE>' committer_when='%cd'" \
--date="format:%a %b %-d %T %Y %z" $(git tag --merged "$tag") | sed "s/tagname='tag: \([^,']*\)'/tagname='\1'/" | {
git for-each-ref --shell --sort=-v:refname --format "$fmt" --merged "$tag" | {
while read line; do
eval $line
(echo ${tagname} | grep -qE '^v[0-9]') || continue
Expand Down
4 changes: 2 additions & 2 deletions include/nccl_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ struct nccl_net_ofi_send_comm {
*/
int (*deregMr)(nccl_net_ofi_send_comm_t *send_comm, nccl_net_ofi_mr_handle_t *mhandle);

int (*send)(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int tag,
int (*send)(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int tag,
nccl_net_ofi_mr_handle_t *mhandle, nccl_net_ofi_req_t **req);

int (*close)(nccl_net_ofi_send_comm_t *send_comm);
Expand Down Expand Up @@ -591,7 +591,7 @@ struct nccl_net_ofi_recv_comm {
*/
int (*deregMr)(nccl_net_ofi_recv_comm_t *recv_comm, nccl_net_ofi_mr_handle_t *mhandle);

int (*recv)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, int *sizes, int *tags,
int (*recv)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, size_t *sizes, int *tags,
nccl_net_ofi_mr_handle_t **mhandles, nccl_net_ofi_req_t **req);

int (*flush)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, int *sizes,
Expand Down
6 changes: 3 additions & 3 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -802,9 +802,6 @@ typedef struct nccl_net_ofi_rdma_device {
* and its base struct. */
nccl_net_ofi_device_t base;

/* Message scheduler */
nccl_net_ofi_scheduler_t *scheduler;

/* Number of rails */
uint16_t num_rails;

Expand Down Expand Up @@ -850,6 +847,9 @@ typedef struct nccl_net_ofi_rdma_domain {

/* List of endpoints and set of addresses they have connections to */
nccl_ofi_ep_addr_list_t *ep_addr_list;

/* Message scheduler */
nccl_net_ofi_scheduler_t *scheduler;
} nccl_net_ofi_rdma_domain_t;


Expand Down
8 changes: 4 additions & 4 deletions include/tracing_impl/lttng.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,15 @@ LTTNG_UST_TRACEPOINT_EVENT(
Send,
LTTNG_UST_TP_ARGS(
int, dev,
int, size,
size_t, size,
void *, comm,
uint16_t, msg_seq_num,
void *, request,
void *, nccl_req
),
LTTNG_UST_TP_FIELDS(
lttng_ust_field_integer(int, dev, dev)
lttng_ust_field_integer(int, size, size)
lttng_ust_field_integer(size_t, size, size)
lttng_ust_field_integer_hex(uint64_t, comm, (uint64_t)comm)
lttng_ust_field_integer(uint16_t, msg_seq_num, msg_seq_num)
lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request)
Expand Down Expand Up @@ -238,14 +238,14 @@ LTTNG_UST_TRACEPOINT_EVENT(
LTTNG_UST_TP_ARGS(
int, dev,
void *, comm,
int, size,
size_t, size,
void *, request,
void *, nccl_req
),
LTTNG_UST_TP_FIELDS(
lttng_ust_field_integer(int, dev, dev)
lttng_ust_field_integer_hex(uint64_t, comm, (uint64_t)comm)
lttng_ust_field_integer(int, size, size)
lttng_ust_field_integer(size_t, size, size)
lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request)
lttng_ust_field_integer_hex(uint64_t, nccl_req, (uint64_t)nccl_req)
)
Expand Down
Loading
Loading