Skip to content

Commit 0bbbc47

Browse files
author
Mathieu Taillefumier
committed
Unified memory can now be controlled at runtime via the environment variable COSMA_GPU_UNIFIED_MEMORY; it is off by default.
1 parent 9b2735a commit 0bbbc47

File tree

7 files changed

+276
-250
lines changed

7 files changed

+276
-250
lines changed

src/cosma/aligned_allocator.hpp

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
#include <mpi.h>
44

55
#include <cassert>
6+
#include <cosma/environment_variables.hpp>
7+
#include <cosma/math_utils.hpp>
68
#include <exception>
79
#include <iostream>
810
#include <limits>
9-
#include <cosma/math_utils.hpp>
10-
#include <cosma/environment_variables.hpp>
1111

1212
/*
1313
* A custom allocator that:
@@ -18,7 +18,7 @@
1818
namespace cosma {
1919
template <typename T>
2020
class aligned_allocator {
21-
public:
21+
public:
2222
using value_type = T;
2323
using pointer = value_type *;
2424
using const_pointer = const value_type *;
@@ -38,10 +38,10 @@ class aligned_allocator {
3838

3939
// the minimum alignment for given type T
4040
std::size_t min_alignment() {
41-
return std::max(math_utils::next_power_of_2(sizeof(T)), sizeof(void*));
41+
return std::max(math_utils::next_power_of_2(sizeof(T)), sizeof(void *));
4242
}
4343

44-
// Calculate how many additional elements we have to allocate for an array
44+
// Calculate how many additional elements we have to allocate for an array
4545
// of length n and data type T.
4646
static std::size_t get_alignment_padding(std::size_t n) {
4747
auto alignment = get_alignment();
@@ -50,34 +50,35 @@ class aligned_allocator {
5050
auto remainder = (n * sizeof(T)) % alignment;
5151

5252
// Convert the padding from bytes to the number of elements
53-
remainder = remainder!=0 ? (alignment - remainder) / sizeof(T) : 0;
53+
remainder = remainder != 0 ? (alignment - remainder) / sizeof(T) : 0;
5454

55-
// std::cout << "For size " << n << ", reminder = " << remainder << std::endl;
56-
// std::cout << "sizeof(T) = " << sizeof(T) << std::endl;
55+
// std::cout << "For size " << n << ", reminder = " << remainder <<
56+
// std::endl; std::cout << "sizeof(T) = " << sizeof(T) << std::endl;
5757
return remainder;
5858
}
5959

6060
// allocate memory with alignment specified as a template parameter
6161
// returns nullptr on failure
62-
T* aligned_malloc(std::size_t size) {
62+
T *aligned_malloc(std::size_t size) {
6363
auto alignment = get_alignment();
6464
// if alignment is disabled, use the standard malloc
6565
if (alignment <= 0) {
66-
return reinterpret_cast<T*>(malloc(size*sizeof(T)));
66+
return reinterpret_cast<T *>(malloc(size * sizeof(T)));
6767
}
6868
// check if the requested size is a multiple of the alignment
6969
assert(get_alignment_padding(size) == 0);
7070
// check if the alignment is >= min_alignment for this data type T
7171
assert(alignment >= min_alignment());
72-
// check if the alignment is a power of 2 and a multiple of sizeof(void*).
72+
// check if the alignment is a power of 2 and a multiple of
73+
// sizeof(void*).
7374
assert(math_utils::is_power_of_2(alignment));
7475
// "Memory alignment must be a power of 2.");
7576
// This is required for the posix_memalign function.
76-
assert(alignment % sizeof(void*) == 0);
77+
assert(alignment % sizeof(void *) == 0);
7778
// "Memory alignment must be a multiple of sizeof(void*)");
7879
void *ptr;
79-
if (posix_memalign(&ptr, alignment, size*sizeof(T)) == 0) {
80-
return reinterpret_cast<T*>(ptr);
80+
if (posix_memalign(&ptr, alignment, size * sizeof(T)) == 0) {
81+
return reinterpret_cast<T *>(ptr);
8182
}
8283
return nullptr;
8384
}
@@ -94,38 +95,37 @@ class aligned_allocator {
9495
pointer allocate(size_type cnt,
9596
typename std::allocator<void>::const_pointer = 0) {
9697
if (cnt > 0) {
97-
#if !defined(COSMA_USE_UNIFIED_MEMORY)
98-
pointer ptr = aligned_malloc(cnt);
98+
pointer ptr;
99+
if (!cosma::get_unified_memory()) {
100+
ptr = aligned_malloc(cnt);
101+
#if defined(COSMA_USE_UNIFIED_MEMORY)
102+
} else {
103+
hipMalloc(&ptr, cnt * sizeof(T));
99104
#else
100-
pointer ptr;
101-
hipMalloc(&ptr, cnt*sizeof(T));
102-
//hipHostMalloc(&ptr, cnt*sizeof(T), hipHostMallocDefault);
103-
//hipMallocManaged(&ptr, cnt*sizeof(T), hipMemAttachGlobal);
104105
#endif
107+
}
105108
return ptr;
106109
}
107110
return nullptr;
108111
}
109112

110113
void deallocate(pointer p, size_type cnt) {
111114
if (p) {
112-
#if !defined(COSMA_USE_UNIFIED_MEMORY)
113-
std::free(p);
114-
#else
115-
hipFree(p);
116-
//hipHostFree(p);
115+
if (!cosma::get_unified_memory())
116+
std::free(p);
117+
#if defined(COSMA_USE_UNIFIED_MEMORY)
118+
else
119+
hipFree(p);
117120
#endif
118-
119121
}
120122
}
121123

122124
size_type max_size() const {
123125
return std::numeric_limits<size_type>::max() / sizeof(T);
124126
}
125127

126-
void construct(pointer p, const T &t) {
127-
new (p) T(t);
128-
}
128+
void construct(pointer p, const T &t) { new (p) T(t); }
129129

130130
void destroy(pointer p) {
131131
if (p) {

src/cosma/context.cpp

Lines changed: 54 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
namespace cosma {
1010
#ifdef COSMA_HAVE_GPU
1111
template <typename Scalar>
12-
gpu::mm_handle<Scalar>* cosma_context<Scalar>::get_gpu_context() {
12+
gpu::mm_handle<Scalar> *cosma_context<Scalar>::get_gpu_context() {
1313
return gpu_ctx_.get();
1414
}
1515
#endif
@@ -21,26 +21,29 @@ cosma_context<Scalar>::cosma_context() {
2121
overlap_comm_and_comp = get_overlap_comm_and_comp();
2222
pin_host_buffers = get_memory_pinning();
2323
#ifdef COSMA_HAVE_GPU
24-
gpu_ctx_ = gpu::make_context<Scalar>(gpu_streams(),
25-
gpu_max_tile_m(),
26-
gpu_max_tile_n(),
27-
gpu_max_tile_k());
24+
gpu_ctx_ = gpu::make_context<Scalar>(
25+
gpu_streams(), gpu_max_tile_m(), gpu_max_tile_n(), gpu_max_tile_k());
2826
#endif
2927
}
3028

3129
template <typename Scalar>
32-
cosma_context<Scalar>::cosma_context(size_t cpu_mem_limit, int streams, int tile_m, int tile_n, int tile_k) {
33-
cpu_memory_limit = (long long) cpu_mem_limit;
30+
cosma_context<Scalar>::cosma_context(size_t cpu_mem_limit,
31+
int streams,
32+
int tile_m,
33+
int tile_n,
34+
int tile_k) {
35+
cpu_memory_limit = (long long)cpu_mem_limit;
3436
adapt_to_scalapack_strategy = get_adapt_strategy();
3537
overlap_comm_and_comp = get_overlap_comm_and_comp();
3638
pin_host_buffers = get_memory_pinning();
3739
memory_pool_.amortization = get_memory_pool_amortization();
3840
// do not reserve nor resize the memory pool
3941
// let this just serve as the upper bound when creating a strategy
40-
// because otherwise, it might reserve/resize to much more than the problem requires
41-
// memory_pool_.resize(cpu_mem_limit);
42+
// because otherwise, it might reserve/resize to much more than the problem
43+
// requires memory_pool_.resize(cpu_mem_limit);
4244
#ifdef COSMA_HAVE_GPU
4345
gpu_ctx_ = gpu::make_context<Scalar>(streams, tile_m, tile_n, tile_k);
46+
gpu_ctx_->use_unified_memory_ = cosma::get_unified_memory();
4447
#else
4548
std::cout << "Ignoring parameters in make_context. These parameters only "
4649
"used in the CPU version."
@@ -59,24 +62,30 @@ cosma_context<Scalar>::~cosma_context() {
5962
}
6063

6164
template <typename Scalar>
62-
memory_pool<Scalar>& cosma_context<Scalar>::get_memory_pool() {
65+
memory_pool<Scalar> &cosma_context<Scalar>::get_memory_pool() {
6366
return memory_pool_;
6467
}
6568

69+
template <typename Scalar>
70+
bool cosma_context<Scalar>::unified_memory() {
71+
return unified_memory_;
72+
}
73+
6674
template <typename Scalar>
6775
long long cosma_context<Scalar>::get_cpu_memory_limit() {
6876
return cpu_memory_limit;
6977
}
7078

7179
template <typename Scalar>
72-
cosma::communicator* cosma_context<Scalar>::get_cosma_comm() {
80+
cosma::communicator *cosma_context<Scalar>::get_cosma_comm() {
7381
return prev_cosma_comm.get();
7482
}
7583

7684
template <typename Scalar>
7785
void cosma_context<Scalar>::register_state(MPI_Comm comm,
7886
const Strategy strategy) {
79-
if (comm == MPI_COMM_NULL) return;
87+
if (comm == MPI_COMM_NULL)
88+
return;
8089

8190
int same_comm = 0;
8291

@@ -90,38 +99,31 @@ void cosma_context<Scalar>::register_state(MPI_Comm comm,
9099
MPI_Comm prev_comm = prev_cosma_comm->full_comm();
91100
int comm_compare;
92101
MPI_Comm_compare(prev_comm, comm, &comm_compare);
93-
same_comm = comm_compare == MPI_CONGRUENT ||
94-
comm_compare == MPI_IDENT;
102+
same_comm = comm_compare == MPI_CONGRUENT || comm_compare == MPI_IDENT;
95103

96-
bool same_strategy = strategy == prev_strategy;
104+
bool same_strategy = strategy == prev_strategy;
97105

98106
// if same_comm and same strategy -> reuse the communicators
99107
if (!same_comm || !same_strategy) {
100108
prev_strategy = strategy;
101109

102110
PE(preprocessing_communicators);
103-
prev_cosma_comm = std::make_unique<cosma::communicator>(strategy, comm);
111+
prev_cosma_comm =
112+
std::make_unique<cosma::communicator>(strategy, comm);
104113
PL();
105114

106-
memory_pool_.unpin_all();
107-
memory_pool_.already_pinned = false;
108-
memory_pool_.resized = false;
115+
memory_pool_.unpin_all();
116+
memory_pool_.already_pinned = false;
117+
memory_pool_.resized = false;
109118
}
110119
}
111120

112121
// if this rank is not taking part in multiply, return
113122
// if (prev_cosma_comm->is_idle()) return;
114123

115124
#ifdef COSMA_HAVE_GPU
116-
if (
117-
!prev_cosma_comm->is_idle()
118-
&&
119-
!memory_pool_.resized
120-
&&
121-
same_comm
122-
&&
123-
strategy == prev_strategy
124-
) {
125+
if (!prev_cosma_comm->is_idle() && !memory_pool_.resized && same_comm &&
126+
strategy == prev_strategy) {
125127
memory_pool_.already_pinned = true;
126128
}
127129
#endif
@@ -139,8 +141,13 @@ context<Scalar> make_context() {
139141
}
140142

141143
template <typename Scalar>
142-
context<Scalar> make_context(size_t cpu_mem_limit, int streams, int tile_m, int tile_n, int tile_k) {
143-
return std::make_unique<cosma_context<Scalar>>(cpu_mem_limit, streams, tile_m, tile_n, tile_k);
144+
context<Scalar> make_context(size_t cpu_mem_limit,
145+
int streams,
146+
int tile_m,
147+
int tile_n,
148+
int tile_k) {
149+
return std::make_unique<cosma_context<Scalar>>(
150+
cpu_mem_limit, streams, tile_m, tile_n, tile_k);
144151
}
145152

146153
// Meyer's singleton, thread-safe in C++11, but not in C++03.
@@ -171,29 +178,29 @@ template context<zfloat> make_context();
171178
template context<zdouble> make_context();
172179

173180
template context<float> make_context(size_t cpu_mem_limit,
174-
int streams,
175-
int tile_m,
176-
int tile_n,
177-
int tile_k);
181+
int streams,
182+
int tile_m,
183+
int tile_n,
184+
int tile_k);
178185
template context<double> make_context(size_t cpu_mem_limit,
179-
int streams,
180-
int tile_m,
181-
int tile_n,
182-
int tile_k);
186+
int streams,
187+
int tile_m,
188+
int tile_n,
189+
int tile_k);
183190
template context<zfloat> make_context(size_t cpu_mem_limit,
184-
int streams,
185-
int tile_m,
186-
int tile_n,
187-
int tile_k);
191+
int streams,
192+
int tile_m,
193+
int tile_n,
194+
int tile_k);
188195
template context<zdouble> make_context(size_t cpu_mem_limit,
189-
int streams,
190-
int tile_m,
191-
int tile_n,
192-
int tile_k);
196+
int streams,
197+
int tile_m,
198+
int tile_n,
199+
int tile_k);
193200

194201
// template instantiation for get_context_instance
195202
template global_context<float> get_context_instance();
196203
template global_context<double> get_context_instance();
197204
template global_context<zfloat> get_context_instance();
198205
template global_context<zdouble> get_context_instance();
199-
}
206+
} // namespace cosma

0 commit comments

Comments
 (0)