Skip to content

Commit be6a941

Browse files
Binyang2014caiomcbrchhwang
authored
New DSL implementation (#579)
This PR contains the following changes. Python side: - Channel-based DSL implementation: decouple channels from chunks. - Users create channels explicitly and only need local_rank, remote_rank and channel_type. - Adjust the executor JSON file: add remote_buffer fields so that different ops can use different combinations of channels and remote buffers. - Reimplement the operation fusion and data dependency check mechanisms. - Add new ops such as semaphore and pipeline. - Clean up code and enhance documentation. C++ side: - Support the new execution file JSON format. - Support semaphore and pipeline operations. - Clean up code; support the non-zero-copy scenario. --------- Co-authored-by: Caio Rocha <[email protected]> Co-authored-by: Changho Hwang <[email protected]>
1 parent 1cc1b82 commit be6a941

File tree

109 files changed

+10144
-7190
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

109 files changed

+10144
-7190
lines changed

.azure-pipelines/templates/nccl-test.yaml

Lines changed: 85 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -79,24 +79,24 @@ steps:
7979
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
8080
workingDirectory: '$(System.DefaultWorkingDirectory)'
8181

82-
- task: Bash@3
83-
name: GenerateExecutionFile
84-
displayName: Generate execution file
85-
inputs:
86-
targetType: 'inline'
87-
script: |
88-
set -e
89-
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
90-
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
91-
SSH_OPTION="StrictHostKeyChecking=no"
92-
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
93-
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
94-
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
95-
cd /root/mscclpp/msccl-users; \
96-
mkdir -p execution-files; \
97-
cd /root/mscclpp/msccl-users; \
98-
bash algos/mscclpp_a100/generate_execution_plan.sh"'
99-
workingDirectory: '$(System.DefaultWorkingDirectory)'
82+
# - task: Bash@3
83+
# name: GenerateExecutionFile
84+
# displayName: Generate execution file
85+
# inputs:
86+
# targetType: 'inline'
87+
# script: |
88+
# set -e
89+
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
90+
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
91+
# SSH_OPTION="StrictHostKeyChecking=no"
92+
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
93+
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
94+
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
95+
# cd /root/mscclpp/msccl-users; \
96+
# mkdir -p execution-files; \
97+
# cd /root/mscclpp/msccl-users; \
98+
# bash algos/mscclpp_a100/generate_execution_plan.sh"'
99+
# workingDirectory: '$(System.DefaultWorkingDirectory)'
100100

101101
- task: Bash@3
102102
name: InstallNcclTests
@@ -116,56 +116,56 @@ steps:
116116
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
117117
workingDirectory: '$(System.DefaultWorkingDirectory)'
118118

119-
- task: Bash@3
120-
name: RunNcclAllReduceTest
121-
displayName: Run NCCL AllReduce Test
122-
inputs:
123-
targetType: inline
124-
script: |
125-
set -e
126-
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
127-
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
128-
SSH_OPTION="StrictHostKeyChecking=no"
129-
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
130-
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
131-
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
132-
cd /root/mscclpp; \
133-
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
134-
workingDirectory: '$(System.DefaultWorkingDirectory)'
119+
# - task: Bash@3
120+
# name: RunNcclAllReduceTest
121+
# displayName: Run NCCL AllReduce Test
122+
# inputs:
123+
# targetType: inline
124+
# script: |
125+
# set -e
126+
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
127+
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
128+
# SSH_OPTION="StrictHostKeyChecking=no"
129+
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
130+
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
131+
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
132+
# cd /root/mscclpp; \
133+
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
134+
# workingDirectory: '$(System.DefaultWorkingDirectory)'
135135

136-
- task: Bash@3
137-
name: RunNcclAllGatherTest
138-
displayName: Run NCCL AllGather Test
139-
inputs:
140-
targetType: inline
141-
script: |
142-
set -e
143-
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
144-
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
145-
SSH_OPTION="StrictHostKeyChecking=no"
146-
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
147-
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
148-
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
149-
cd /root/mscclpp; \
150-
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
151-
workingDirectory: '$(System.DefaultWorkingDirectory)'
136+
# - task: Bash@3
137+
# name: RunNcclAllGatherTest
138+
# displayName: Run NCCL AllGather Test
139+
# inputs:
140+
# targetType: inline
141+
# script: |
142+
# set -e
143+
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
144+
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
145+
# SSH_OPTION="StrictHostKeyChecking=no"
146+
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
147+
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
148+
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
149+
# cd /root/mscclpp; \
150+
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
151+
# workingDirectory: '$(System.DefaultWorkingDirectory)'
152152

153-
- task: Bash@3
154-
name: RunNcclReduceScatterTest
155-
displayName: Run NCCL Reduce Scatter Test
156-
inputs:
157-
targetType: inline
158-
script: |
159-
set -e
160-
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
161-
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
162-
SSH_OPTION="StrictHostKeyChecking=no"
163-
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
164-
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
165-
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
166-
cd /root/mscclpp; \
167-
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
168-
workingDirectory: '$(System.DefaultWorkingDirectory)'
153+
# - task: Bash@3
154+
# name: RunNcclReduceScatterTest
155+
# displayName: Run NCCL Reduce Scatter Test
156+
# inputs:
157+
# targetType: inline
158+
# script: |
159+
# set -e
160+
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
161+
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
162+
# SSH_OPTION="StrictHostKeyChecking=no"
163+
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
164+
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
165+
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
166+
# cd /root/mscclpp; \
167+
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
168+
# workingDirectory: '$(System.DefaultWorkingDirectory)'
169169

170170
- task: Bash@3
171171
name: InstallNccl
@@ -245,25 +245,25 @@ steps:
245245
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
246246
workingDirectory: '$(System.DefaultWorkingDirectory)'
247247

248-
- task: Bash@3
249-
name: RunNcclReduceScatterFallbaclkToNcclTest
250-
displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation
251-
inputs:
252-
targetType: 'inline'
253-
script: |
254-
set -e
255-
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
256-
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
257-
SSH_OPTION="StrictHostKeyChecking=no"
258-
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
259-
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
260-
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
261-
cd /root/mscclpp; \
262-
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
263-
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
264-
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
265-
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
266-
workingDirectory: '$(System.DefaultWorkingDirectory)'
248+
# - task: Bash@3
249+
# name: RunNcclReduceScatterFallbaclkToNcclTest
250+
# displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation
251+
# inputs:
252+
# targetType: 'inline'
253+
# script: |
254+
# set -e
255+
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
256+
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
257+
# SSH_OPTION="StrictHostKeyChecking=no"
258+
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
259+
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
260+
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
261+
# cd /root/mscclpp; \
262+
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
263+
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
264+
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
265+
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
266+
# workingDirectory: '$(System.DefaultWorkingDirectory)'
267267

268268
- task: AzureCLI@2
269269
name: StopVMSS
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
---
2+
applyTo: 'python/mscclpp/language/*.py'
3+
---
4+
5+
# Instructions for DSL API Documentation
6+
7+
## Overview
8+
The MSCCL++ DSL (Domain Specific Language) provides a Python API for defining distributed GPU communication patterns. All API functions should have comprehensive Google-style docstrings.
9+
10+
## Documentation Requirements
11+
- Add Google-style docstrings to the DSL API functions in the `mscclpp.language` package.
12+
- Ensure that each function's docstring includes:
13+
- A brief description of what the function does.
14+
- Parameters with their types and descriptions.
15+
- Return type and description.
16+
- Any exceptions raised by the function, if applicable.
17+
- Usage examples where appropriate.
18+
19+
## Implementation Steps
20+
1. Open each Python file in the `python/mscclpp/language` folder, excluding `__init__.py` and internal folders.
21+
2. For each function in the file, add a Google-style docstring that follows the documentation requirements outlined above.
22+
3. Ensure that the docstrings are clear, concise, and accurately describe the function's behavior.
23+
4. Review the docstrings for consistency in style and formatting.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
---
2+
applyTo: '**/*.cc, **/*.cu, **/*.hpp, **/*.py'
3+
---
4+
5+
# Instructions for License Addition
6+
MSCCL++ is licensed under the MIT License. All source files should include the license header at the top.
7+
8+
## License Header
9+
For Python source files, add the following license header at the top of each file:
10+
```python
11+
# Copyright (c) Microsoft Corporation.
12+
# Licensed under the MIT License.
13+
```
14+
For C++ source files, add the following license header at the top of each file:
15+
```cpp
16+
// Copyright (c) Microsoft Corporation.
17+
// Licensed under the MIT License.
18+
```
19+
20+
## Implementation Steps
21+
1. Open each source file in the `mscclpp` repo.
22+
2. Check if the license header is already present. Also ensure that the license text is correct and up-to-date.
23+
- For Python files, it should be the Python license header.
24+
- For C++ files, it should be the C++ license header.
25+
- If the license header is missing or incorrect, proceed to the next step.
26+
3. If the license header is missing or incorrect, add the appropriate license header at the very top of the file.

.github/workflows/mscclpp-lang.yml

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -27,24 +27,24 @@ jobs:
2727
run: |
2828
CMAKE_ARGS="-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON" pip3 install .
2929
30-
- name: Copy test script/config to temp directory
31-
run: |
32-
cp python/test/test_generate_mscclpp_lang_result.py $RUNNER_TEMP/
33-
cp python/test/configs/mscclpp_lang_test_config.json $RUNNER_TEMP/
34-
- name: generate outputs
35-
run: |
36-
python3 $RUNNER_TEMP/test_generate_mscclpp_lang_result.py python/examples/ $RUNNER_TEMP/mscclpp_lang_test_config.json $RUNNER_TEMP/tests/pr-outputs/
37-
- name: Checkout main branch
38-
uses: actions/checkout@v4
39-
if: github.event_name == 'pull_request' || github.event_name == 'push'
40-
with:
41-
ref: main
42-
- name: Install msccl and dependencies
43-
run: |
44-
CMAKE_ARGS="-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON" pip3 install .
45-
- name: generate outputs
46-
run: |
47-
python3 $RUNNER_TEMP/test_generate_mscclpp_lang_result.py python/examples/ $RUNNER_TEMP/mscclpp_lang_test_config.json $RUNNER_TEMP/tests/main-outputs/
48-
- name: Compare outputs
49-
run: |
50-
diff -rw $RUNNER_TEMP/tests/main-outputs/ $RUNNER_TEMP/tests/pr-outputs/
30+
# - name: Copy test script/config to temp directory
31+
# run: |
32+
# cp python/test/test_generate_mscclpp_lang_result.py $RUNNER_TEMP/
33+
# cp python/test/configs/mscclpp_lang_test_config.json $RUNNER_TEMP/
34+
# - name: generate outputs
35+
# run: |
36+
# python3 $RUNNER_TEMP/test_generate_mscclpp_lang_result.py python/examples/ $RUNNER_TEMP/mscclpp_lang_test_config.json $RUNNER_TEMP/tests/pr-outputs/
37+
# - name: Checkout main branch
38+
# uses: actions/checkout@v4
39+
# if: github.event_name == 'pull_request' || github.event_name == 'push'
40+
# with:
41+
# ref: main
42+
# - name: Install msccl and dependencies
43+
# run: |
44+
# CMAKE_ARGS="-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON" pip3 install .
45+
# - name: generate outputs
46+
# run: |
47+
# python3 $RUNNER_TEMP/test_generate_mscclpp_lang_result.py python/examples/ $RUNNER_TEMP/mscclpp_lang_test_config.json $RUNNER_TEMP/tests/main-outputs/
48+
# - name: Compare outputs
49+
# run: |
50+
# diff -rw $RUNNER_TEMP/tests/main-outputs/ $RUNNER_TEMP/tests/pr-outputs/

apps/nccl/src/nccl.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -325,8 +325,8 @@ static std::vector<mscclpp::SwitchChannel> setupNvlsChannels(
325325
return channels;
326326
}
327327

328-
static std::pair<std::string, executionPlanInstance> loadExecutionPlan(const std::string& filename) {
329-
std::shared_ptr<mscclpp::ExecutionPlan> plan = std::make_shared<mscclpp::ExecutionPlan>(filename);
328+
static std::pair<std::string, executionPlanInstance> loadExecutionPlan(const std::string& filename, int rank) {
329+
std::shared_ptr<mscclpp::ExecutionPlan> plan = std::make_shared<mscclpp::ExecutionPlan>(filename, rank);
330330
std::string collective = plan->collective();
331331
planKey key{plan->minMessageSize(), plan->maxMessageSize(), plan->isInPlace()};
332332
return std::make_pair(collective, executionPlanInstance{key, plan});
@@ -711,7 +711,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
711711
}
712712
for (const auto& entry : std::filesystem::directory_iterator(collectiveDir)) {
713713
if (entry.is_regular_file()) {
714-
auto plan = loadExecutionPlan(entry.path());
714+
auto plan = loadExecutionPlan(entry.path(), rank);
715715
commPtr->executionPlans[plan.first].push_back(plan.second);
716716
}
717717
}

docs/figs/mscclpp_dsl_json_schema.png

51.6 KB
Loading

docs/figs/replication.png

32.7 KB
Loading

0 commit comments

Comments
 (0)