Skip to content

Commit 0bbbc47

Browse files
author
Mathieu Taillefumier
committed
Unified memory can now be controlled at runtime via the environment variable COSMA_GPU_UNIFIED_MEMORY; it is off by default.
1 parent 9b2735a commit 0bbbc47

File tree

7 files changed

+276
-250
lines changed

7 files changed

+276
-250
lines changed

src/cosma/aligned_allocator.hpp

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
#include <mpi.h>
44

55
#include <cassert>
6+
#include <cosma/environment_variables.hpp>
7+
#include <cosma/math_utils.hpp>
68
#include <exception>
79
#include <iostream>
810
#include <limits>
9-
#include <cosma/math_utils.hpp>
10-
#include <cosma/environment_variables.hpp>
1111

1212
/*
1313
* A custom allocator that:
@@ -18,7 +18,7 @@
1818
namespace cosma {
1919
template <typename T>
2020
class aligned_allocator {
21-
public:
21+
public:
2222
using value_type = T;
2323
using pointer = value_type *;
2424
using const_pointer = const value_type *;
@@ -38,10 +38,10 @@ class aligned_allocator {
3838

3939
// the minimum alignment for given type T
4040
std::size_t min_alignment() {
41-
return std::max(math_utils::next_power_of_2(sizeof(T)), sizeof(void*));
41+
return std::max(math_utils::next_power_of_2(sizeof(T)), sizeof(void *));
4242
}
4343

44-
// Calculate how many additional elements we have to allocate for an array
44+
// Calculate how many additional elements we have to allocate for an array
4545
// of length n and data type T.
4646
static std::size_t get_alignment_padding(std::size_t n) {
4747
auto alignment = get_alignment();
@@ -50,34 +50,35 @@ class aligned_allocator {
5050
auto remainder = (n * sizeof(T)) % alignment;
5151

5252
// Convert the padding from bytes to the number of elements
53-
remainder = remainder!=0 ? (alignment - remainder) / sizeof(T) : 0;
53+
remainder = remainder != 0 ? (alignment - remainder) / sizeof(T) : 0;
5454

55-
// std::cout << "For size " << n << ", reminder = " << remainder << std::endl;
56-
// std::cout << "sizeof(T) = " << sizeof(T) << std::endl;
55+
// std::cout << "For size " << n << ", reminder = " << remainder <<
56+
// std::endl; std::cout << "sizeof(T) = " << sizeof(T) << std::endl;
5757
return remainder;
5858
}
5959

6060
// allocate memory with alignment specified as a template parameter
6161
// returns nullptr on failure
62-
T* aligned_malloc(std::size_t size) {
62+
T *aligned_malloc(std::size_t size) {
6363
auto alignment = get_alignment();
6464
// if alignment is disabled, use the standard malloc
6565
if (alignment <= 0) {
66-
return reinterpret_cast<T*>(malloc(size*sizeof(T)));
66+
return reinterpret_cast<T *>(malloc(size * sizeof(T)));
6767
}
6868
// check if the requested size is a multiple of the alignment
6969
assert(get_alignment_padding(size) == 0);
7070
// check if the alignment is >= min_alignment for this data type T
7171
assert(alignment >= min_alignment());
72-
// check if the alignment is a power of 2 and a multiple of sizeof(void*).
72+
// check if the alignment is a power of 2 and a multiple of
73+
// sizeof(void*).
7374
assert(math_utils::is_power_of_2(alignment));
7475
// "Memory alignment must be a power of 2.");
7576
// This is required for the posix_memalign function.
76-
assert(alignment % sizeof(void*) == 0);
77+
assert(alignment % sizeof(void *) == 0);
7778
// "Memory alignment must be a multiple of sizeof(void*)");
7879
void *ptr;
79-
if (posix_memalign(&ptr, alignment, size*sizeof(T)) == 0) {
80-
return reinterpret_cast<T*>(ptr);
80+
if (posix_memalign(&ptr, alignment, size * sizeof(T)) == 0) {
81+
return reinterpret_cast<T *>(ptr);
8182
}
8283
return nullptr;
8384
}
@@ -94,38 +95,37 @@ class aligned_allocator {
9495
pointer allocate(size_type cnt,
9596
typename std::allocator<void>::const_pointer = 0) {
9697
if (cnt > 0) {
97-
#if !defined(COSMA_USE_UNIFIED_MEMORY)
98-
pointer ptr = aligned_malloc(cnt);
98+
pointer ptr;
99+
if (!cosma::get_unified_memory()) {
100+
ptr = aligned_malloc(cnt);
101+
#if defined(COSMA_USE_UNIFIED_MEMORY)
102+
} else {
103+
hipMalloc(&ptr, cnt * sizeof(T));
99104
#else
100-
pointer ptr;
101-
hipMalloc(&ptr, cnt*sizeof(T));
102-
//hipHostMalloc(&ptr, cnt*sizeof(T), hipHostMallocDefault);
103-
//hipMallocManaged(&ptr, cnt*sizeof(T), hipMemAttachGlobal);
104105
#endif
107+
}
105108
return ptr;
106109
}
107110
return nullptr;
108111
}
109112

110113
void deallocate(pointer p, size_type cnt) {
111114
if (p) {
112-
#if !defined(COSMA_USE_UNIFIED_MEMORY)
113-
std::free(p);
114-
#else
115-
hipFree(p);
116-
//hipHostFree(p);
115+
if (!cosma::get_unified_memory())
116+
std::free(p);
117+
#if defined(COSMA_USE_UNIFIED_MEMORY)
118+
else
119+
hipFree(p);
117120
#endif
118-
119121
}
120122
}
121123

122124
size_type max_size() const {
123125
return std::numeric_limits<size_type>::max() / sizeof(T);
124126
}
125127

126-
void construct(pointer p, const T &t) {
127-
new (p) T(t);
128-
}
128+
void construct(pointer p, const T &t) { new (p) T(t); }
129129

130130
void destroy(pointer p) {
131131
if (p) {

src/cosma/context.cpp

Lines changed: 54 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
namespace cosma {
1010
#ifdef COSMA_HAVE_GPU
1111
template <typename Scalar>
12-
gpu::mm_handle<Scalar>* cosma_context<Scalar>::get_gpu_context() {
12+
gpu::mm_handle<Scalar> *cosma_context<Scalar>::get_gpu_context() {
1313
return gpu_ctx_.get();
1414
}
1515
#endif
@@ -21,26 +21,29 @@ cosma_context<Scalar>::cosma_context() {
2121
overlap_comm_and_comp = get_overlap_comm_and_comp();
2222
pin_host_buffers = get_memory_pinning();
2323
#ifdef COSMA_HAVE_GPU
24-
gpu_ctx_ = gpu::make_context<Scalar>(gpu_streams(),
25-
gpu_max_tile_m(),
26-
gpu_max_tile_n(),
27-
gpu_max_tile_k());
24+
gpu_ctx_ = gpu::make_context<Scalar>(
25+
gpu_streams(), gpu_max_tile_m(), gpu_max_tile_n(), gpu_max_tile_k());
2826
#endif
2927
}
3028

3129
template <typename Scalar>
32-
cosma_context<Scalar>::cosma_context(size_t cpu_mem_limit, int streams, int tile_m, int tile_n, int tile_k) {
33-
cpu_memory_limit = (long long) cpu_mem_limit;
30+
cosma_context<Scalar>::cosma_context(size_t cpu_mem_limit,
31+
int streams,
32+
int tile_m,
33+
int tile_n,
34+
int tile_k) {
35+
cpu_memory_limit = (long long)cpu_mem_limit;
3436
adapt_to_scalapack_strategy = get_adapt_strategy();
3537
overlap_comm_and_comp = get_overlap_comm_and_comp();
3638
pin_host_buffers = get_memory_pinning();
3739
memory_pool_.amortization = get_memory_pool_amortization();
3840
// do not reserve nor resize the memory pool
3941
// let this just serve as the upper bound when creating a strategy
40-
// because otherwise, it might reserve/resize to much more than the problem requires
41-
// memory_pool_.resize(cpu_mem_limit);
42+
// because otherwise, it might reserve/resize to much more than the problem
43+
// requires memory_pool_.resize(cpu_mem_limit);
4244
#ifdef COSMA_HAVE_GPU
4345
gpu_ctx_ = gpu::make_context<Scalar>(streams, tile_m, tile_n, tile_k);
46+
gpu_ctx_->use_unified_memory_ = cosma::get_unified_memory();
4447
#else
4548
std::cout << "Ignoring parameters in make_context. These parameters only "
4649
"used in the CPU version."
@@ -59,24 +62,30 @@ cosma_context<Scalar>::~cosma_context() {
5962
}
6063

6164
template <typename Scalar>
62-
memory_pool<Scalar>& cosma_context<Scalar>::get_memory_pool() {
65+
memory_pool<Scalar> &cosma_context<Scalar>::get_memory_pool() {
6366
return memory_pool_;
6467
}
6568

69+
template <typename Scalar>
70+
bool cosma_context<Scalar>::unified_memory() {
71+
return unified_memory_;
72+
}
73+
6674
template <typename Scalar>
6775
long long cosma_context<Scalar>::get_cpu_memory_limit() {
6876
return cpu_memory_limit;
6977
}
7078

7179
template <typename Scalar>
72-
cosma::communicator* cosma_context<Scalar>::get_cosma_comm() {
80+
cosma::communicator *cosma_context<Scalar>::get_cosma_comm() {
7381
return prev_cosma_comm.get();
7482
}
7583

7684
template <typename Scalar>
7785
void cosma_context<Scalar>::register_state(MPI_Comm comm,
7886
const Strategy strategy) {
79-
if (comm == MPI_COMM_NULL) return;
87+
if (comm == MPI_COMM_NULL)
88+
return;
8089

8190
int same_comm = 0;
8291

@@ -90,38 +99,31 @@ void cosma_context<Scalar>::register_state(MPI_Comm comm,
9099
MPI_Comm prev_comm = prev_cosma_comm->full_comm();
91100
int comm_compare;
92101
MPI_Comm_compare(prev_comm, comm, &comm_compare);
93-
same_comm = comm_compare == MPI_CONGRUENT ||
94-
comm_compare == MPI_IDENT;
102+
same_comm = comm_compare == MPI_CONGRUENT || comm_compare == MPI_IDENT;
95103

96-
bool same_strategy = strategy == prev_strategy;
104+
bool same_strategy = strategy == prev_strategy;
97105

98106
// if same_comm and same strategy -> reuse the communicators
99107
if (!same_comm || !same_strategy) {
100108
prev_strategy = strategy;
101109

102110
PE(preprocessing_communicators);
103-
prev_cosma_comm = std::make_unique<cosma::communicator>(strategy, comm);
111+
prev_cosma_comm =
112+
std::make_unique<cosma::communicator>(strategy, comm);
104113
PL();
105114

106-
memory_pool_.unpin_all();
107-
memory_pool_.already_pinned = false;
108-
memory_pool_.resized = false;
115+
memory_pool_.unpin_all();
116+
memory_pool_.already_pinned = false;
117+
memory_pool_.resized = false;
109118
}
110119
}
111120

112121
// if this rank is not taking part in multiply, return
113122
// if (prev_cosma_comm->is_idle()) return;
114123

115124
#ifdef COSMA_HAVE_GPU
116-
if (
117-
!prev_cosma_comm->is_idle()
118-
&&
119-
!memory_pool_.resized
120-
&&
121-
same_comm
122-
&&
123-
strategy == prev_strategy
124-
) {
125+
if (!prev_cosma_comm->is_idle() && !memory_pool_.resized && same_comm &&
126+
strategy == prev_strategy) {
125127
memory_pool_.already_pinned = true;
126128
}
127129
#endif
@@ -139,8 +141,13 @@ context<Scalar> make_context() {
139141
}
140142

141143
template <typename Scalar>
142-
context<Scalar> make_context(size_t cpu_mem_limit, int streams, int tile_m, int tile_n, int tile_k) {
143-
return std::make_unique<cosma_context<Scalar>>(cpu_mem_limit, streams, tile_m, tile_n, tile_k);
144+
context<Scalar> make_context(size_t cpu_mem_limit,
145+
int streams,
146+
int tile_m,
147+
int tile_n,
148+
int tile_k) {
149+
return std::make_unique<cosma_context<Scalar>>(
150+
cpu_mem_limit, streams, tile_m, tile_n, tile_k);
144151
}
145152

146153
// Meyer's singleton, thread-safe in C++11, but not in C++03.
@@ -171,29 +178,29 @@ template context<zfloat> make_context();
171178
template context<zdouble> make_context();
172179

173180
template context<float> make_context(size_t cpu_mem_limit,
174-
int streams,
175-
int tile_m,
176-
int tile_n,
177-
int tile_k);
181+
int streams,
182+
int tile_m,
183+
int tile_n,
184+
int tile_k);
178185
template context<double> make_context(size_t cpu_mem_limit,
179-
int streams,
180-
int tile_m,
181-
int tile_n,
182-
int tile_k);
186+
int streams,
187+
int tile_m,
188+
int tile_n,
189+
int tile_k);
183190
template context<zfloat> make_context(size_t cpu_mem_limit,
184-
int streams,
185-
int tile_m,
186-
int tile_n,
187-
int tile_k);
191+
int streams,
192+
int tile_m,
193+
int tile_n,
194+
int tile_k);
188195
template context<zdouble> make_context(size_t cpu_mem_limit,
189-
int streams,
190-
int tile_m,
191-
int tile_n,
192-
int tile_k);
196+
int streams,
197+
int tile_m,
198+
int tile_n,
199+
int tile_k);
193200

194201
// template instantiation for get_context_instance
195202
template global_context<float> get_context_instance();
196203
template global_context<double> get_context_instance();
197204
template global_context<zfloat> get_context_instance();
198205
template global_context<zdouble> get_context_instance();
199-
}
206+
} // namespace cosma

0 commit comments

Comments
 (0)