Skip to content

Commit 3f01d89

Browse files
authored
Multi-Node rocshmem_finalize() bug (#138)
1 parent ca5fdd4 commit 3f01d89

18 files changed

+182
-244
lines changed

examples/rocshmem_allreduce_test.cc

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -54,19 +54,9 @@
5454
5555
*/
5656

57-
#include <iostream>
58-
59-
#include <hip/hip_runtime_api.h>
60-
#include <hip/hip_runtime.h>
6157
#include <rocshmem/rocshmem.hpp>
6258

63-
#define CHECK_HIP(condition) { \
64-
hipError_t error = condition; \
65-
if(error != hipSuccess){ \
66-
fprintf(stderr,"HIP error: %d line: %d\n", error, __LINE__); \
67-
MPI_Abort(MPI_COMM_WORLD, error); \
68-
} \
69-
}
59+
#include "util.h"
7060

7161
using namespace rocshmem;
7262

@@ -123,16 +113,13 @@ int main (int argc, char **argv)
123113
nelem = atoi(argv[1]);
124114
}
125115

126-
int my_pe = rocshmem_my_pe();
127-
int npes = rocshmem_n_pes();
128-
129-
int ndevices, my_device = 0;
130-
CHECK_HIP(hipGetDeviceCount(&ndevices));
131-
my_device = my_pe % ndevices;
132-
CHECK_HIP(hipSetDevice(my_device));
116+
CHECK_HIP(hipSetDevice(get_launcher_local_rank()));
133117

134118
rocshmem_init();
135119

120+
int my_pe = rocshmem_my_pe();
121+
int npes = rocshmem_n_pes();
122+
136123
int *source = (int *)rocshmem_malloc(nelem * sizeof(int));
137124
int *dest = (int *)rocshmem_malloc(nelem * sizeof(int));
138125
if (NULL == source || NULL == dest) {

examples/rocshmem_alltoall_test.cc

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -54,19 +54,9 @@
5454
5555
*/
5656

57-
#include <iostream>
58-
59-
#include <hip/hip_runtime_api.h>
60-
#include <hip/hip_runtime.h>
6157
#include <rocshmem/rocshmem.hpp>
6258

63-
#define CHECK_HIP(condition) { \
64-
hipError_t error = condition; \
65-
if(error != hipSuccess){ \
66-
fprintf(stderr,"HIP error: %d line: %d\n", error, __LINE__); \
67-
MPI_Abort(MPI_COMM_WORLD, error); \
68-
} \
69-
}
59+
#include "util.h"
7060

7161
using namespace rocshmem;
7262

@@ -128,16 +118,13 @@ int main (int argc, char **argv)
128118
nelem = atoi(argv[1]);
129119
}
130120

131-
int my_pe = rocshmem_my_pe();
132-
int npes = rocshmem_n_pes();
133-
134-
int ndevices, my_device = 0;
135-
CHECK_HIP(hipGetDeviceCount(&ndevices));
136-
my_device = my_pe % ndevices;
137-
CHECK_HIP(hipSetDevice(my_device));
121+
CHECK_HIP(hipSetDevice(get_launcher_local_rank()));
138122

139123
rocshmem_init();
140124

125+
int my_pe = rocshmem_my_pe();
126+
int npes = rocshmem_n_pes();
127+
141128
int *source = (int *)rocshmem_malloc(nelem * npes * sizeof(int));
142129
int *dest = (int *)rocshmem_malloc(nelem * npes * sizeof(int));
143130
if (NULL == source || NULL == dest) {

examples/rocshmem_broadcast_test.cc

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -54,19 +54,9 @@
5454
5555
*/
5656

57-
#include <iostream>
58-
59-
#include <hip/hip_runtime_api.h>
60-
#include <hip/hip_runtime.h>
6157
#include <rocshmem/rocshmem.hpp>
6258

63-
#define CHECK_HIP(condition) { \
64-
hipError_t error = condition; \
65-
if(error != hipSuccess){ \
66-
fprintf(stderr,"HIP error: %d line: %d\n", error, __LINE__); \
67-
MPI_Abort(MPI_COMM_WORLD, error); \
68-
} \
69-
}
59+
#include "util.h"
7060

7161
using namespace rocshmem;
7262

@@ -121,16 +111,13 @@ int main(int argc, char **argv)
121111
nelem = atoi(argv[1]);
122112
}
123113

124-
int my_pe = rocshmem_my_pe();
125-
int npes = rocshmem_n_pes();
126-
127-
int ndevices, my_device = 0;
128-
CHECK_HIP(hipGetDeviceCount(&ndevices));
129-
my_device = my_pe % ndevices;
130-
CHECK_HIP(hipSetDevice(my_device));
114+
CHECK_HIP(hipSetDevice(get_launcher_local_rank()));
131115

132116
rocshmem_init();
133117

118+
int my_pe = rocshmem_my_pe();
119+
int npes = rocshmem_n_pes();
120+
134121
int *source = (int *)rocshmem_malloc(nelem * sizeof(int));
135122
int *dest = (int *)rocshmem_malloc(nelem * sizeof(int));
136123
if (NULL == source || NULL == dest) {

examples/rocshmem_getmem_test.cc

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -54,19 +54,9 @@
5454
5555
*/
5656

57-
#include <iostream>
58-
59-
#include <hip/hip_runtime_api.h>
60-
#include <hip/hip_runtime.h>
6157
#include <rocshmem/rocshmem.hpp>
6258

63-
#define CHECK_HIP(condition) { \
64-
hipError_t error = condition; \
65-
if(error != hipSuccess){ \
66-
fprintf(stderr,"HIP error: %d line: %d\n", error, __LINE__); \
67-
MPI_Abort(MPI_COMM_WORLD, error); \
68-
} \
69-
}
59+
#include "util.h"
7060

7161
using namespace rocshmem;
7262

@@ -76,8 +66,8 @@ __global__ void simple_getmem_test(int *src, int *dst, size_t nelem)
7666

7767
int threadId = blockIdx.x * blockDim.x + threadIdx.x;
7868
if (threadId == 0) {
79-
int rank = rocshmem_my_pe();
80-
int peer = rank ? 0 : 1;
69+
int my_pe = rocshmem_my_pe();
70+
int peer = my_pe ? 0 : 1;
8171
rocshmem_getmem(dst, src, nelem * sizeof(int), peer);
8272
rocshmem_quiet();
8373
}
@@ -90,19 +80,19 @@ __global__ void simple_getmem_test(int *src, int *dst, size_t nelem)
9080

9181
int main (int argc, char **argv)
9282
{
93-
int rank = rocshmem_my_pe();
94-
int ndevices, my_device = 0;
95-
CHECK_HIP(hipGetDeviceCount(&ndevices));
96-
my_device = rank % ndevices;
97-
CHECK_HIP(hipSetDevice(my_device));
9883
int nelem = MAX_ELEM;
9984

10085
if (argc > 1) {
10186
nelem = atoi(argv[1]);
10287
}
10388

89+
CHECK_HIP(hipSetDevice(get_launcher_local_rank()));
90+
10491
rocshmem_init();
92+
93+
int my_pe = rocshmem_my_pe();
10594
int npes = rocshmem_n_pes();
95+
10696
int *src = (int *)rocshmem_malloc(nelem * sizeof(int));
10797
int *dst = (int *)rocshmem_malloc(nelem * sizeof(int));
10898
if (NULL == src || NULL == dst) {
@@ -128,7 +118,7 @@ int main (int argc, char **argv)
128118
if (dst[i] != 0) {
129119
pass = false;
130120
#if VERBOSE
131-
printf("[%d] Error in element %d expected 0 got %d\n", rank, i, dst[i]);
121+
printf("[%d] Error in element %d expected 0 got %d\n", my_pe, i, dst[i]);
132122
#endif
133123
}
134124
}

examples/rocshmem_init_attr_test.cc

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -54,19 +54,9 @@
5454
5555
*/
5656

57-
#include <iostream>
58-
59-
#include <hip/hip_runtime_api.h>
60-
#include <hip/hip_runtime.h>
6157
#include <rocshmem/rocshmem.hpp>
6258

63-
#define CHECK_HIP(condition) { \
64-
hipError_t error = condition; \
65-
if(error != hipSuccess){ \
66-
fprintf(stderr,"HIP error: %d line: %d\n", error, __LINE__); \
67-
MPI_Abort(MPI_COMM_WORLD, error); \
68-
} \
69-
}
59+
#include "util.h"
7060

7161
using namespace rocshmem;
7262

examples/rocshmem_put_signal_test.cc

Lines changed: 11 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -54,19 +54,9 @@
5454
5555
*/
5656

57-
#include <iostream>
58-
59-
#include <hip/hip_runtime_api.h>
60-
#include <hip/hip_runtime.h>
6157
#include <rocshmem/rocshmem.hpp>
6258

63-
#define CHECK_HIP(condition) { \
64-
hipError_t error = condition; \
65-
if(error != hipSuccess){ \
66-
fprintf(stderr,"HIP error: %d line: %d\n", error, __LINE__); \
67-
MPI_Abort(MPI_COMM_WORLD, error); \
68-
} \
69-
}
59+
#include "util.h"
7060

7161
using namespace rocshmem;
7262

@@ -95,20 +85,20 @@ __global__ void simple_put_signal_test(uint64_t *data, uint64_t *message, size_t
9585

9686
int main (int argc, char **argv)
9787
{
98-
int rank = rocshmem_my_pe();
99-
int ndevices, my_device = 0;
100-
CHECK_HIP(hipGetDeviceCount(&ndevices));
101-
my_device = rank % ndevices;
102-
CHECK_HIP(hipSetDevice(my_device));
10388
int nelem = MAX_ELEM;
10489

10590
if (argc > 1) {
10691
nelem = atoi(argv[1]);
10792
}
10893

94+
CHECK_HIP(hipSetDevice(get_launcher_local_rank()));
95+
10996
rocshmem_init();
97+
98+
int my_pe = rocshmem_my_pe();
11099
int npes = rocshmem_n_pes();
111-
int dst_pe = (rank + 1) % npes;
100+
101+
int dst_pe = (my_pe + 1) % npes;
112102
uint64_t *message = (uint64_t*)rocshmem_malloc(nelem * sizeof(uint64_t));
113103
uint64_t *data = (uint64_t*)rocshmem_malloc(nelem * sizeof(uint64_t));
114104
uint64_t *sig_addr = (uint64_t*)rocshmem_malloc(sizeof(uint64_t));
@@ -123,14 +113,14 @@ int main (int argc, char **argv)
123113
}
124114

125115
for (int i=0; i<nelem; i++) {
126-
message[i] = rank;
116+
message[i] = my_pe;
127117
}
128118

129119
CHECK_HIP(hipMemset(data, 0, (nelem * sizeof(uint64_t))));
130120
CHECK_HIP(hipDeviceSynchronize());
131121

132122
int threadsPerBlock=256;
133-
simple_put_signal_test<<<dim3(1), dim3(threadsPerBlock), 0, 0>>>(data, message, nelem, sig_addr, rank, dst_pe);
123+
simple_put_signal_test<<<dim3(1), dim3(threadsPerBlock), 0, 0>>>(data, message, nelem, sig_addr, my_pe, dst_pe);
134124
rocshmem_barrier_all();
135125
CHECK_HIP(hipDeviceSynchronize());
136126

@@ -139,11 +129,11 @@ int main (int argc, char **argv)
139129
if (data[i] != 0) {
140130
pass = false;
141131
#if VERBOSE
142-
printf("[%d] Error in element %d expected 0 got %d\n", rank, i, dst[i]);
132+
printf("[%d] Error in element %d expected 0 got %d\n", my_pe, i, dst[i]);
143133
#endif
144134
}
145135
}
146-
printf("[%d] Test %s \t %s\n", rank, argv[0], pass ? "[PASS]" : "[FAIL]");
136+
printf("[%d] Test %s \t %s\n", my_pe, argv[0], pass ? "[PASS]" : "[FAIL]");
147137

148138
rocshmem_free(data);
149139
rocshmem_free(message);

examples/util.h

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/******************************************************************************
2+
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to
8+
* deal in the Software without restriction, including without limitation the
9+
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10+
* sell copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22+
* IN THE SOFTWARE.
23+
*****************************************************************************/
24+
25+
#ifndef __ROCSHMEM_EXAMPLES_UTIL_H__
26+
#define __ROCSHMEM_EXAMPLES_UTIL_H__
27+
28+
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
32+
33+
/**
 * Evaluate a HIP runtime call and terminate the job if it did not succeed.
 *
 * `condition` is evaluated exactly once. When the result is not hipSuccess,
 * the numeric code, its hipGetErrorString() text and the call site (file and
 * line) are written to stderr. The job is then torn down with MPI_Abort() on
 * MPI_COMM_WORLD, using the HIP error code as the abort code.
 *
 * The examples invoke this macro before rocshmem_init(), so MPI may not be
 * initialized yet at that point. MPI_Abort() is therefore only called while
 * MPI is initialized and not finalized; otherwise the process exits with
 * EXIT_FAILURE.
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK_HIP(call);` is
 * exactly one statement and stays correct inside an unbraced if/else. The
 * locals carry a `check_hip_` prefix to avoid clashing with caller names.
 */
#define CHECK_HIP(condition)                                                 \
  do {                                                                       \
    const hipError_t check_hip_status_ = (condition);                        \
    if (check_hip_status_ != hipSuccess) {                                   \
      std::fprintf(stderr, "HIP error: %d (%s) file: %s line: %d\n",         \
                   static_cast<int>(check_hip_status_),                      \
                   hipGetErrorString(check_hip_status_), __FILE__,           \
                   __LINE__);                                                \
      int check_hip_mpi_initialized_ = 0;                                    \
      int check_hip_mpi_finalized_ = 0;                                      \
      MPI_Initialized(&check_hip_mpi_initialized_);                          \
      MPI_Finalized(&check_hip_mpi_finalized_);                              \
      if (check_hip_mpi_initialized_ && !check_hip_mpi_finalized_) {         \
        MPI_Abort(MPI_COMM_WORLD, static_cast<int>(check_hip_status_));      \
      }                                                                      \
      std::exit(EXIT_FAILURE);                                               \
    }                                                                        \
  } while (0)
40+
41+
/**
 * Return the node-local rank that the job launcher assigned to this process.
 *
 * The rank is read from the environment, so it is available before
 * rocshmem_init(); the examples pass it straight to hipSetDevice().
 *
 * Several launcher-specific variables are probed in a fixed order. Open MPI's
 * variable is probed first, so a value that was accepted before is still the
 * one returned. A variable is skipped when it is unset, empty, does not start
 * with a decimal number, or holds a negative or out-of-range value.
 *
 * @return the local rank (>= 0), or -1 when no probed variable holds a usable
 *         value (for example when the binary is started without a launcher).
 */
static inline int get_launcher_local_rank() {
  static const char *const local_rank_vars[] = {
      "OMPI_COMM_WORLD_LOCAL_RANK",  // Open MPI mpirun / mpiexec
      "MV2_COMM_WORLD_LOCAL_RANK",   // MVAPICH2
      "MPI_LOCALRANKID",             // MPICH Hydra / Intel MPI
      "SLURM_LOCALID",               // Slurm srun
  };

  for (const char *var_name : local_rank_vars) {
    const char *text = std::getenv(var_name);
    if (nullptr == text) {
      continue;
    }

    char *end = nullptr;
    const long parsed = std::strtol(text, &end, 10);
    const int rank = static_cast<int>(parsed);

    // end == text means no digits were consumed; the round-trip comparison
    // rejects values that do not fit in an int.
    if (end != text && parsed >= 0 && static_cast<long>(rank) == parsed) {
      return rank;
    }
  }

  return -1;
}
51+
52+
#endif /* __ROCSHMEM_EXAMPLES_UTIL_H__ */

src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ target_sources(
3232
backend_bc.cpp
3333
context_host.cpp
3434
context_device.cpp
35-
mpi_init_singleton.cpp
35+
mpi_instance.cpp
3636
rocshmem_gpu.cpp
3737
rocshmem.cpp
3838
team.cpp

src/backend_bc.cpp

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -86,18 +86,7 @@ Backend::Backend(MPI_Comm comm) : heap{comm} {
8686
}
8787

8888
void Backend::init_mpi_once(MPI_Comm comm) {
89-
int init_done{};
90-
NET_CHECK(MPI_Initialized(&init_done));
91-
92-
int provided{};
93-
if (!init_done) {
94-
NET_CHECK(MPI_Init_thread(0, 0, MPI_THREAD_MULTIPLE, &provided));
95-
if (provided != MPI_THREAD_MULTIPLE) {
96-
fprintf(stderr, "MPI_THREAD_MULTIPLE support disabled.\n");
97-
}
98-
}
9989
if (comm == MPI_COMM_NULL) comm = MPI_COMM_WORLD;
100-
10190
NET_CHECK(MPI_Comm_dup(comm, &backend_comm));
10291
NET_CHECK(MPI_Comm_size(backend_comm, &num_pes));
10392
NET_CHECK(MPI_Comm_rank(backend_comm, &my_pe));

src/memory/remote_heap_info.hpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,15 +55,8 @@ class CommunicatorMPI {
5555
CommunicatorMPI(char* heap_base, size_t heap_size,
5656
MPI_Comm comm = MPI_COMM_WORLD)
5757
: comm_{comm} {
58-
int initialized;
59-
MPI_Initialized(&initialized);
60-
if (!initialized) {
61-
int provided;
62-
MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &provided);
63-
}
6458
MPI_Comm_rank(comm_, &my_pe_);
6559
MPI_Comm_size(comm_, &num_pes_);
66-
6760
heap_window_info_ = WindowInfo(comm_, heap_base, heap_size);
6861
}
6962

0 commit comments

Comments
 (0)