Skip to content

Commit 2b92b69

Browse files
[RCCL] Implicit launch order test for MPI unit test.
* Added ImplicitLaunchOrder Test * Added multiple communicators to implcit order test * using base class create communicator function * Added Implicit launch order test case to test_runner * Used MPI test framerwork APIs - to handle resources - used macros to get rid of if/else conditions - removed redundant hipDeviceSynchronize and MPI_Barrier() [rocm-systems] ROCm/rocm-systems#3822 (commit 8af4186)
1 parent 212d826 commit 2b92b69

File tree

3 files changed

+247
-0
lines changed

3 files changed

+247
-0
lines changed

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ if(BUILD_TESTS)
229229
transport/NetMPITests.cpp
230230
transport/ShmMPITests.cpp
231231
transport/NetIbMPITests.cpp
232+
ImplicitLaunchOrderMPITests.cpp
232233
)
233234

234235
# Create the MPI test executable
Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
/*************************************************************************
2+
* Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#include "DeviceBufferHelpers.hpp"
8+
#include "MPITestBase.hpp"
9+
#include "ResourceGuards.hpp"
10+
#include "TestChecks.hpp"
11+
12+
#include <cstdlib>
13+
#include <vector>
14+
15+
#ifdef MPI_TESTS_ENABLED
16+
17+
using namespace MPITestConstants;
18+
using namespace RCCLTestGuards;
19+
using namespace RCCLTestHelpers;
20+
21+
namespace ImplicitLaunchOrderConstants
22+
{
23+
constexpr size_t kBufferElements = 64 * 1024;
24+
constexpr size_t kBufferSize = kBufferElements * sizeof(float);
25+
constexpr int kNumCommunicators = 4;
26+
constexpr int kIterations = 20;
27+
constexpr float kValidationEpsilon = 1e-3f;
28+
} // namespace ImplicitLaunchOrderConstants
29+
30+
using namespace ImplicitLaunchOrderConstants;
31+
32+
class ImplicitLaunchOrderMPITest : public MPITestBase
33+
{
34+
protected:
35+
std::vector<NcclCommAutoGuard> comm_guards_;
36+
std::vector<HipStreamAutoGuard> stream_guards_;
37+
std::vector<DeviceBufferAutoGuard> buffer_guards_;
38+
39+
void SetUp() override
40+
{
41+
MPITestBase::SetUp();
42+
comm_guards_.clear();
43+
stream_guards_.clear();
44+
buffer_guards_.clear();
45+
}
46+
47+
void TearDown() override
48+
{
49+
// Destroy child comms before parent (cleaned up by base class)
50+
comm_guards_.clear();
51+
buffer_guards_.clear();
52+
stream_guards_.clear();
53+
MPITestBase::TearDown();
54+
}
55+
56+
ncclResult_t allocateStreams(int num_streams)
57+
{
58+
stream_guards_.reserve(num_streams);
59+
for(int i = 0; i < num_streams; i++)
60+
{
61+
hipStream_t stream{};
62+
HIPCHECK(hipStreamCreate(&stream));
63+
stream_guards_.push_back(makeStreamAutoGuard(stream));
64+
}
65+
return ncclSuccess;
66+
}
67+
68+
ncclResult_t allocateBuffers(int num_buffers)
69+
{
70+
buffer_guards_.reserve(num_buffers);
71+
for(int i = 0; i < num_buffers; i++)
72+
{
73+
void* buf = nullptr;
74+
HIPCHECK(hipMalloc(&buf, kBufferSize));
75+
buffer_guards_.push_back(makeDeviceBufferAutoGuard(buf));
76+
}
77+
return ncclSuccess;
78+
}
79+
80+
ncclResult_t splitCommunicators(int num_comms)
81+
{
82+
comm_guards_.reserve(num_comms);
83+
ncclComm_t parent = getActiveCommunicator();
84+
int rank = MPIEnvironment::world_rank;
85+
86+
for(int i = 0; i < num_comms; i++)
87+
{
88+
ncclComm_t comm{};
89+
RCCL_TEST_CHECK(ncclCommSplit(parent, 0, rank, &comm, nullptr));
90+
comm_guards_.push_back(makeCommAutoGuard(comm));
91+
}
92+
return ncclSuccess;
93+
}
94+
95+
static bool isImplicitLaunchOrderEnabled()
96+
{
97+
const char* env = getenv("NCCL_LAUNCH_ORDER_IMPLICIT");
98+
return env != nullptr && atoi(env) != 0;
99+
}
100+
101+
ncclResult_t runMultiCommChain()
102+
{
103+
HIPCHECK(initializeBufferWithPattern<float>(
104+
buffer_guards_[0].get(),
105+
kBufferElements,
106+
[rank = MPIEnvironment::world_rank](size_t) {
107+
return static_cast<float>(rank + 1);
108+
}));
109+
110+
for(int i = 1; i <= kNumCommunicators; i++)
111+
{
112+
HIPCHECK(zeroInitializeBuffer<float>(buffer_guards_[i].get(), kBufferElements));
113+
}
114+
115+
for(int i = 0; i < kNumCommunicators; i++)
116+
{
117+
RCCL_TEST_CHECK(ncclAllReduce(buffer_guards_[i].get(),
118+
buffer_guards_[i + 1].get(),
119+
kBufferElements,
120+
getNcclDataType<float>(),
121+
ncclSum,
122+
comm_guards_[i].get(),
123+
stream_guards_[i].get()));
124+
}
125+
126+
for(int i = 0; i < kNumCommunicators; i++)
127+
{
128+
HIPCHECK(hipStreamSynchronize(stream_guards_[i].get()));
129+
}
130+
131+
return ncclSuccess;
132+
}
133+
};
134+
135+
TEST_F(ImplicitLaunchOrderMPITest, MultiCommunicatorChain)
136+
{
137+
ASSERT_TRUE(validateTestPrerequisites(kMinProcessesForMPI,
138+
kNoProcessLimit,
139+
kNoPowerOfTwoRequired,
140+
1,
141+
kRequireSingleNode))
142+
<< "Test requirements not met";
143+
144+
bool implicit_order_enabled = isImplicitLaunchOrderEnabled();
145+
146+
TEST_INFO("NCCL_LAUNCH_ORDER_IMPLICIT=%s", implicit_order_enabled ? "1" : "0");
147+
TEST_INFO("Communicators: %d, Buffer: %zu KB, Iterations: %d",
148+
kNumCommunicators, kBufferSize / 1024, kIterations);
149+
150+
ASSERT_MPI_EQ(ncclSuccess, allocateStreams(kNumCommunicators));
151+
ASSERT_MPI_EQ(ncclSuccess, allocateBuffers(kNumCommunicators + 1));
152+
ASSERT_MPI_EQ(ncclSuccess, createTestCommunicator());
153+
ASSERT_MPI_EQ(ncclSuccess, splitCommunicators(kNumCommunicators));
154+
155+
int nranks = MPIEnvironment::world_size;
156+
157+
// Expected: sum(1..nranks) * nranks^(numComms-1)
158+
double expected_value = static_cast<double>(nranks * (nranks + 1) / 2);
159+
for(int i = 1; i < kNumCommunicators; i++)
160+
{
161+
expected_value *= static_cast<double>(nranks);
162+
}
163+
164+
float expected_f = static_cast<float>(expected_value);
165+
166+
int correct_count = 0;
167+
int wrong_count = 0;
168+
bool all_same = true;
169+
float first_result = 0.0f;
170+
171+
for(int iter = 0; iter < kIterations; iter++)
172+
{
173+
ASSERT_MPI_EQ(ncclSuccess, runMultiCommChain());
174+
175+
bool correct = verifyBufferData<float>(
176+
buffer_guards_[kNumCommunicators].get(),
177+
kBufferElements,
178+
[expected_f](size_t) { return expected_f; },
179+
0,
180+
static_cast<double>(kValidationEpsilon * expected_value));
181+
182+
if(correct)
183+
correct_count++;
184+
else
185+
wrong_count++;
186+
187+
auto [dl_err, host_data] = downloadBuffer<float>(
188+
buffer_guards_[kNumCommunicators].get(), 1);
189+
ASSERT_MPI_EQ(dl_err, hipSuccess);
190+
191+
if(iter == 0)
192+
first_result = host_data[0];
193+
else if(std::abs(host_data[0] - first_result) > kValidationEpsilon * expected_value)
194+
all_same = false;
195+
}
196+
197+
TEST_INFO("Expected: %.0f, Correct: %d/%d, Consistent: %s",
198+
expected_value, correct_count, kIterations, all_same ? "yes" : "no");
199+
200+
if(implicit_order_enabled)
201+
{
202+
EXPECT_EQ(correct_count, kIterations)
203+
<< "With NCCL_LAUNCH_ORDER_IMPLICIT=1, all iterations should be correct";
204+
EXPECT_TRUE(all_same)
205+
<< "With NCCL_LAUNCH_ORDER_IMPLICIT=1, results should be consistent";
206+
}
207+
else
208+
{
209+
if(wrong_count > 0 || !all_same)
210+
{
211+
TEST_INFO("Race detected: %d wrong, %s",
212+
wrong_count, all_same ? "consistent" : "inconsistent");
213+
}
214+
else
215+
{
216+
TEST_INFO("No race detected (non-deterministic)");
217+
}
218+
}
219+
}
220+
221+
#endif // MPI_TESTS_ENABLED

tools/scripts/test_runner/configs/mi300x_mellanox_ib.json

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,25 @@
433433
"test_filter": "AltRsmiTest.*"
434434
}
435435
]
436+
},
437+
"implicit_launch_order": {
438+
"extends": "default",
439+
"is_gtest": true,
440+
"binary": "rccl-UnitTestsMPI",
441+
"num_ranks": 8,
442+
"num_nodes": 1,
443+
"num_gpus": 8,
444+
"timeout": 300,
445+
"env_variables": {
446+
"NCCL_LAUNCH_ORDER_IMPLICIT": "1"
447+
},
448+
"tests": [
449+
{
450+
"name": "ImplicitLaunchOrder_MultiCommunicatorChain",
451+
"description": "Test NCCL_LAUNCH_ORDER_IMPLICIT serialization across multiple communicators",
452+
"test_filter": "ImplicitLaunchOrderMPITest.MultiCommunicatorChain"
453+
}
454+
]
436455
}
437456
},
438457
"test_suites": [
@@ -501,6 +520,12 @@
501520
"description": "All Alternative RSMI tests using public API only",
502521
"config": "alt_rsmi_tests",
503522
"enabled": true
523+
},
524+
{
525+
"name": "Implicit Launch Order Tests",
526+
"description": "Test NCCL_LAUNCH_ORDER_IMPLICIT for multi-communicator serialization",
527+
"config": "implicit_launch_order",
528+
"enabled": true
504529
}
505530
]
506531
}

0 commit comments

Comments
 (0)