diff --git a/benchmarks/access_overhead/access_overhead.cpp b/benchmarks/access_overhead/access_overhead.cpp index 15ba67a..aa8ad2c 100644 --- a/benchmarks/access_overhead/access_overhead.cpp +++ b/benchmarks/access_overhead/access_overhead.cpp @@ -117,13 +117,13 @@ struct Access #include -#define LDC_LEAGUE_SIZE 4096 +#define LDC_LEAGUE_SIZE 2 #define LDC_TEAM_SIZE 1 -//#define CHECK_FOR_CORRECTNESS + +#define CHECK_FOR_CORRECTNESS using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; using RemoteView_t = Kokkos::View; -using PlainView_t = Kokkos::View; +using PlainView_t = Kokkos::View; using UnmanagedView_t = Kokkos::View>; using HostView_t = typename RemoteView_t::HostMirror; @@ -53,8 +54,8 @@ using policy_check_t = Kokkos::RangePolicy; // Default values #define default_Mode 0 -#define default_N 134217728 -#define default_Iters 3 +#define default_N 128 +#define default_Iters 1 #define default_RmaOp RMA_GET #define TAG 0 diff --git a/benchmarks/access_overhead/access_overhead_p2p_kernelconf.cpp b/benchmarks/access_overhead/access_overhead_p2p_kernelconf.cpp new file mode 100644 index 0000000..fcba6d7 --- /dev/null +++ b/benchmarks/access_overhead/access_overhead_p2p_kernelconf.cpp @@ -0,0 +1,729 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_FOR_CORRECTNESS + +#define LEAGUE_SIZE 1024 +#define TEAM_SIZE 32 + +#define LDC_LEAGUE_SIZE 4096 +#define LDC_TEAM_SIZE 1 + +using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; +using RemoteView_t = Kokkos::View; +using PlainView_t = Kokkos::View; +using UnmanagedView_t = + Kokkos::View>; +using HostView_t = typename RemoteView_t::HostMirror; + +// Tags +struct InitTag {}; +struct UpdateTag {}; +struct UpdateTag_put {}; +struct UpdateTag_get {}; +struct CheckTag {}; + +// Exec policies +using policy_init_t = Kokkos::RangePolicy; +using policy_update_t = Kokkos::RangePolicy; +using policy_update_put_t = Kokkos::RangePolicy; +using policy_update_get_t = Kokkos::RangePolicy; +using team_policy_update_t = Kokkos::TeamPolicy; +using team_policy_get_update_t = Kokkos::TeamPolicy; +using team_policy_put_update_t = Kokkos::TeamPolicy; +using policy_check_t = Kokkos::RangePolicy; + +// Default values +#define default_Mode 0 +#define default_N 4096 +#define default_Iters 3 +#define default_RmaOp RMA_GET +#define default_ts TEAM_SIZE +#define default_ls LEAGUE_SIZE +#define TAG 0 + +std::string modes[4] = {"Kokkos::View", "Kokkos::View-MPIIsCudaAware", + "Kokkos::RemoteView", + "Kokkos::RemoteViewBlockTransfer"}; + +enum { RMA_GET, RMA_PUT }; + +struct Args_t { + int mode = default_Mode; + int N = default_N; + int iters = default_Iters; + int rma_op = default_RmaOp; + int ts = default_ts; + int ls = default_ls; +}; + +void print_help() { + printf("Options (default):\n"); + printf(" -N IARG: (%i) num elements in the vector\n", default_N); + printf(" -I IARG: (%i) num repititions\n", default_Iters); + printf(" -M IARG: (%i) mode (view type)\n", default_Mode); + printf(" -T IARG: (%i) teams ize\n", default_Mode); + printf(" -L IARG: (%i) league size\n", default_Mode); + printf(" -O IARG: (%i) rma operation (0...get, 1...put)\n", default_RmaOp); + printf(" modes:\n"); + printf(" 0: Kokkos (Normal) View\n"); + printf(" 1: Kokkos Remote View\n"); +} + +// read command line args +bool read_args(int argc, char *argv[], Args_t &args) { + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-h") == 0) { + print_help(); + return false; + } + } + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-N") == 0) args.N = atoi(argv[i + 1]); + if (strcmp(argv[i], "-I") == 0) args.iters = atoi(argv[i + 1]); + if (strcmp(argv[i], "-M") == 0) args.mode = atoi(argv[i + 1]); + if (strcmp(argv[i], "-T") == 0) args.ts = atoi(argv[i + 1]); + if (strcmp(argv[i], "-L") == 0) args.ls = atoi(argv[i + 1]); + if (strcmp(argv[i], "-O") == 0) args.rma_op = atoi(argv[i + 1]); + } + return true; +} + +template +struct Access; + +template +struct Access::value>> { + size_t N; /* size of vector */ + int iters; /* number of iterations */ + int mode; /* View type */ + + int my_rank, other_rank, num_ranks = 0; + + ViewType_t v; + ViewType_t v_tmp; + + int iters_per_team; + int iters_per_team_mod; + + Access(Args_t args) + : N(args.N), + iters(args.iters), + v(std::string(typeid(v).name()), args.N), + v_tmp(std::string(typeid(v).name()) + "_tmp", args.N), + mode(args.mode) { + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + other_rank = my_rank ^ 1; + assert(num_ranks == 2); + iters_per_team = args.N / LEAGUE_SIZE; + iters_per_team_mod = args.N % LEAGUE_SIZE; + }; + + KOKKOS_FUNCTION + void operator()(const InitTag &, const size_t i) const { v(i) = my_rank + 1; } + + KOKKOS_FUNCTION + void operator()(const UpdateTag &, typename team_policy_update_t::member_type team) const { + int team_id = team.league_rank(); + int start = team_id * iters_per_team; + int end = start + iters_per_team; + int mod = (team_id == LEAGUE_SIZE - 1) ? iters_per_team_mod : 0; + Kokkos::parallel_for( Kokkos::TeamThreadRange(team, start, end + mod), [&](const int i){ + v(i) += v_tmp(i); + }); + } + + KOKKOS_FUNCTION + void operator()(const CheckTag &, const size_t i) const { + assert(v(i) == typename ViewType_t::traits::value_type( + iters * (other_rank + 1) + (my_rank + 1))); + } + + // run copy benchmark + void run() { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + + Kokkos::parallel_for("access_overhead-init", policy_init_t(0, N), *this); + Kokkos::fence(); + MPI_Barrier(MPI_COMM_WORLD); + + for (int i = 0; i < iters; i++) { + time_a = timer.seconds(); + + if (my_rank == 1) { + auto v_tmp_host = Kokkos::create_mirror_view(v_tmp); + Kokkos::deep_copy(v_tmp_host, v); + MPI_Send(v_tmp_host.data(), N, MPI_DOUBLE, other_rank, TAG, + MPI_COMM_WORLD); + } else { + auto v_tmp_host = Kokkos::create_mirror_view(v_tmp); + MPI_Recv(v_tmp_host.data(), N, MPI_DOUBLE, other_rank, TAG, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + Kokkos::deep_copy(v_tmp, v_tmp_host); + Kokkos::parallel_for( + "access_overhead", + team_policy_update_t(LEAGUE_SIZE, TEAM_SIZE), *this); + Kokkos::fence(); + time_b = timer.seconds(); + time += time_b - time_a; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + if (my_rank == 0) { +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for("access_overhead-check", policy_check_t(0, N), + *this); + Kokkos::fence(); +#endif + + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + printf("access_overhead_p2p,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); + } + } +}; + +template +struct Access_CudaAware; + +template +struct Access_CudaAware< + ViewType_t, + typename std::enable_if_t::value>> { + size_t N; /* size of vector */ + int iters; /* number of iterations */ + int mode; /* View type */ + + int my_rank, other_rank, num_ranks = 0; + + ViewType_t v; + ViewType_t v_tmp; + + int iters_per_team; + int iters_per_team_mod; + + Access_CudaAware(Args_t args) + : N(args.N), + iters(args.iters), + v(std::string(typeid(v).name()), args.N), + v_tmp(std::string(typeid(v).name()) + "_tmp", args.N), + mode(args.mode) { + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + other_rank = my_rank ^ 1; + assert(num_ranks == 2); + iters_per_team = args.N / LEAGUE_SIZE; + iters_per_team_mod = args.N % LEAGUE_SIZE; + }; + + KOKKOS_FUNCTION + void operator()(const InitTag &, const size_t i) const { v(i) = my_rank + 1; } + + KOKKOS_FUNCTION + void operator()(const UpdateTag &, typename team_policy_update_t::member_type team) const { + int team_id = team.league_rank(); + int start = team_id * iters_per_team; + int end = start + iters_per_team; + int mod = (team_id == LEAGUE_SIZE - 1) ? iters_per_team_mod : 0; + Kokkos::parallel_for( Kokkos::TeamThreadRange(team, start, end + mod), [&](const int i){ + v(i) += v_tmp(i); + }); + } + + KOKKOS_FUNCTION + void operator()(const CheckTag &, const size_t i) const { + assert(v(i) == typename ViewType_t::traits::value_type( + iters * (other_rank + 1) + (my_rank + 1))); + } + + // run copy benchmark + void run() { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + + Kokkos::parallel_for("access_overhead-init", policy_init_t(0, N), *this); + Kokkos::fence(); + MPI_Barrier(MPI_COMM_WORLD); + + for (int i = 0; i < iters; i++) { + time_a = timer.seconds(); + if (my_rank == 1) { + MPI_Send(v.data(), N, MPI_DOUBLE, other_rank, TAG, MPI_COMM_WORLD); + + } else { + MPI_Recv(v_tmp.data(), N, MPI_DOUBLE, other_rank, TAG, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::parallel_for( + "access_overhead", + team_policy_update_t(LEAGUE_SIZE, TEAM_SIZE), *this); + Kokkos::fence(); + time_b = timer.seconds(); + time += time_b - time_a; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + if (my_rank == 0) { +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for("access_overhead-check", policy_check_t(0, N), + *this); + Kokkos::fence(); +#endif + + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + printf("access_overhead_p2p,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + (modes[mode]).c_str(), N, size, iters, time, gups, bw); + } + } +}; + +template +struct Access::value>> { + size_t N; /* size of vector */ + int iters; /* number of iterations */ + int mode; /* View type */ + int rma_op; + + int my_rank, other_rank, num_ranks; + + ViewType_t v; + + int iters_per_team; + int iters_per_team_mod; + + Access(Args_t args) + : N(args.N), iters(args.iters), mode(args.mode), rma_op(args.rma_op) { + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + assert(num_ranks == 2); + other_rank = my_rank ^ 1; + v = ViewType_t(std::string(typeid(v).name()), num_ranks * args.N); + auto local_range = + Kokkos::Experimental::get_local_range(num_ranks * args.N); + iters_per_team = (local_range.second - local_range.first) / LEAGUE_SIZE; + iters_per_team_mod = (local_range.second - local_range.first) % LEAGUE_SIZE; + }; + + KOKKOS_FUNCTION + void operator()(const InitTag &, const size_t i) const { v(i) = my_rank + 1; } + + KOKKOS_FUNCTION + void operator()(const UpdateTag_get &, const size_t i) const { + v(i) += v(other_rank * N + i); + } + + KOKKOS_FUNCTION + void operator()(const UpdateTag_get &, typename team_policy_update_t::member_type team) const { + int team_id = team.league_rank(); + int start = team_id * iters_per_team; + int end = start + iters_per_team; + int mod = (team_id == LEAGUE_SIZE - 1) ? iters_per_team_mod : 0; + Kokkos::parallel_for( Kokkos::TeamThreadRange(team, start, end + mod), [&](const int i){ + v(i) += v(other_rank * N + i); + }); + } + + + + KOKKOS_FUNCTION + void operator()(const UpdateTag_put &, typename team_policy_update_t::member_type team) const { + int team_id = team.league_rank(); + int start = team_id * iters_per_team; + int end = start + iters_per_team; + int mod = (team_id == LEAGUE_SIZE - 1) ? iters_per_team_mod : 0; + Kokkos::parallel_for( Kokkos::TeamThreadRange(team, start, end + mod), [&](const int i){ + v(other_rank * N + i) = v(i); + }); + } + + + KOKKOS_FUNCTION + void operator()(const CheckTag &, const size_t i) const { + assert(v(i) == typename ViewType_t::traits::value_type( + iters * (other_rank + 1) + (my_rank + 1))); + } + + // run copy benchmark + void run() { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + + auto local_range = + Kokkos::Experimental::get_local_range(num_ranks * v.size()); + Kokkos::parallel_for("access_overhead-init", + policy_init_t(local_range.first, local_range.second), + *this); + RemoteSpace_t().fence(); + + if (rma_op == RMA_GET) { + for (int i = 0; i < iters; i++) { + if (my_rank == 0) { + time_a = timer.seconds(); + Kokkos::parallel_for( + "access_overhead", + team_policy_get_update_t(LEAGUE_SIZE, TEAM_SIZE), *this); + Kokkos::fence(); + RemoteSpace_t().fence(); + time_b = timer.seconds(); + time += time_b - time_a; + } else { + RemoteSpace_t().fence(); + } + } + } else if (rma_op == RMA_PUT) { + for (int i = 0; i < iters; i++) { + if (my_rank == 0) { + time_a = timer.seconds(); + Kokkos::parallel_for( + "access_overhead", + team_policy_put_update_t(local_range.first, local_range.second), + *this); + Kokkos::fence(); + RemoteSpace_t().fence(); + time_b = timer.seconds(); + time += time_b - time_a; + } else { + RemoteSpace_t().fence(); + } + } + } else { + printf("What rma_op is this? Exiting.\n"); + exit(1); + } + + if (rma_op == RMA_GET) { + // check on rank 0 + if (my_rank == 0) { +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for( + "access_overhead-check", + policy_check_t(local_range.first, local_range.second), *this); + Kokkos::fence(); +#endif + } + } else { + // check on rank 1 + if (my_rank == 1) { +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for( + "access_overhead-check", + policy_check_t(local_range.first, local_range.second), *this); + Kokkos::fence(); +#endif + } + } + + if (my_rank == 0) { + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + if (rma_op == RMA_GET) { + printf("access_overhead_p2p,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); + } else { + printf("access_overhead_p2p_put,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); + } + } + } +}; + +template +struct Access_LDC; + +template +struct Access_LDC< + ViewType_t, + typename std::enable_if_t::value>> { + size_t N; /* size of vector */ + int iters; /* number of iterations */ + int mode; /* View type */ + int rma_op; + + int my_rank, other_rank, num_ranks; + + ViewType_t v, v_tmp; + ViewType_t v_subview_remote; + + int iters_per_team; + int iters_per_team_mod; + + + Access_LDC(Args_t args) + : N(args.N), iters(args.iters), mode(args.mode), rma_op(args.rma_op) { + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + assert(num_ranks == 2); + other_rank = my_rank ^ 1; + v = ViewType_t(std::string(typeid(v).name()), num_ranks * args.N); + v_tmp = ViewType_t(std::string(typeid(v).name()), num_ranks * args.N); + auto local_range = + Kokkos::Experimental::get_local_range(num_ranks * args.N); + iters_per_team = (local_range.second - local_range.first) / LEAGUE_SIZE; + iters_per_team_mod = (local_range.second - local_range.first) % LEAGUE_SIZE; + }; + + KOKKOS_FUNCTION + void operator()(const size_t i) const { + double val1 = v_tmp(i); + double val2 = v(i); + printf("debug: %li, %.2f, %.2f\n", i, val1, val2); + } + + KOKKOS_FUNCTION + void operator()(const InitTag &, const size_t i) const { v(i) = my_rank + 1; } + + KOKKOS_FUNCTION + void operator()(const CheckTag &, const size_t i) const { + assert(v(i) == typename ViewType_t::traits::value_type( + iters * (other_rank + 1) + (my_rank + 1))); + } + + KOKKOS_FUNCTION + void operator()(const UpdateTag &, typename team_policy_update_t::member_type team) const { + int team_id = team.league_rank(); + int start = team_id * iters_per_team; + int end = start + iters_per_team; + int mod = (team_id == LEAGUE_SIZE - 1) ? iters_per_team_mod : 0; + Kokkos::parallel_for( Kokkos::TeamThreadRange(team, start, end + mod), [&](const int i){ + v(i) += v_tmp(i); + }); + } + + KOKKOS_FUNCTION + void operator()(const UpdateTag_get &, + typename team_policy_get_update_t::member_type team) const { + auto local_range = Kokkos::Experimental::get_local_range(num_ranks * N); + auto remote_range = + Kokkos::Experimental::get_range(num_ranks * N, other_rank); + auto v_subview_remote = Kokkos::subview(v, remote_range); + auto v_tmp_subview_local = Kokkos::subview(v_tmp, local_range); + Kokkos::Experimental::RemoteSpaces::local_deep_copy( + team, v_tmp_subview_local, v_subview_remote); + } + + KOKKOS_FUNCTION + void operator()(const UpdateTag_put &, + typename team_policy_put_update_t::member_type team) const { + auto local_range = Kokkos::Experimental::get_local_range(num_ranks * N); + auto remote_range = + Kokkos::Experimental::get_range(num_ranks * N, other_rank); + auto v_subview_remote = Kokkos::subview(v_tmp, remote_range); + auto v_tmp_subview_local = Kokkos::subview(v, local_range); + Kokkos::Experimental::RemoteSpaces::local_deep_copy(team, v_subview_remote, + v_tmp_subview_local); + } + + // run copy benchmark + void run() { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + auto local_range = Kokkos::Experimental::get_local_range(num_ranks * N); + auto remote_range = + Kokkos::Experimental::get_range(num_ranks * N, other_rank); + + Kokkos::parallel_for("access_overhead-init", + policy_init_t(local_range.first, local_range.second), + *this); + Kokkos::fence(); + RemoteSpace_t().fence(); + + if (rma_op == RMA_GET) { + for (int i = 0; i < iters; i++) { + if (my_rank == 0) { + time_a = timer.seconds(); + Kokkos::parallel_for( + "block_transfer", + team_policy_get_update_t(LDC_LEAGUE_SIZE, LDC_TEAM_SIZE), *this); + + Kokkos::fence(); +#if defined(KOKKOS_REMOTE_SPACES_ENABLE_DEBUG) + Kokkos::parallel_for( + "printf values for debugging", + Kokkos::RangePolicy(local_range.first, local_range.second), + *this); + Kokkos::fence(); +#endif + Kokkos::parallel_for( + "access_overhead", + team_policy_update_t(LEAGUE_SIZE, TEAM_SIZE), *this); + Kokkos::fence(); + RemoteSpace_t().fence(); + time_b = timer.seconds(); + time += time_b - time_a; + } else { + RemoteSpace_t().fence(); + } + } + + } else if (rma_op == RMA_PUT) { + for (int i = 0; i < iters; i++) { + if (my_rank == 0) { + time_a = timer.seconds(); + Kokkos::parallel_for( + "block_transfer", + team_policy_put_update_t(LDC_LEAGUE_SIZE, LDC_TEAM_SIZE), *this); + Kokkos::fence(); + RemoteSpace_t().fence(); +#if defined(KOKKOS_REMOTE_SPACES_ENABLE_DEBUG) + Kokkos::parallel_for( + "printf values for debugging", + Kokkos::RangePolicy(local_range.first, local_range.second), + *this); +#endif + Kokkos::parallel_for( + "access_overhead", + team_policy_update_t(LEAGUE_SIZE, TEAM_SIZE), *this); + Kokkos::fence(); + RemoteSpace_t().fence(); + time_b = timer.seconds(); + time += time_b - time_a; + + } else { + RemoteSpace_t().fence(); + } + } + } else { + printf("What rma_op is this? Exiting.\n"); + exit(1); + } + +#ifdef CHECK_FOR_CORRECTNESS + if (rma_op == RMA_GET) { + // check on rank 0 + if (my_rank == 0) { + Kokkos::parallel_for( + "access_overhead-check", + policy_check_t(local_range.first, local_range.second), *this); + Kokkos::fence(); + } + } else { + // check on rank 1 + if (my_rank == 1) { + Kokkos::parallel_for( + "access_overhead-check", + policy_check_t(local_range.first, local_range.second), *this); + Kokkos::fence(); + } + } +#endif + + if (my_rank == 0) { + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + if (rma_op == RMA_GET) { + printf("access_overhead_p2p,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); + } else { + printf("access_overhead_p2p_put,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); + } + } + } +}; + +int main(int argc, char *argv[]) { + int mpi_thread_level_available; + int mpi_thread_level_required = MPI_THREAD_MULTIPLE; + +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL + mpi_thread_level_required = MPI_THREAD_SINGLE; +#endif + + MPI_Init_thread(&argc, &argv, mpi_thread_level_required, + &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); + +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); +#endif + +#ifdef KRS_ENABLE_NVSHMEMSPACE + MPI_Comm mpi_comm; + nvshmemx_init_attr_t attr; + mpi_comm = MPI_COMM_WORLD; + attr.mpi_comm = &mpi_comm; + nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); +#endif + + Kokkos::initialize(argc, argv); + + do { + Args_t args; + if (!read_args(argc, argv, args)) { + break; + }; + if (args.mode == 0) { + Access s(args); + s.run(); + } else if (args.mode == 1) { + Access_CudaAware s(args); + s.run(); + } else if (args.mode == 2) { + Access s(args); + s.run(); + } else if (args.mode == 3) { + Access_LDC s(args); + s.run(); + } else { + printf("invalid mode selected (%d)\n", args.mode); + } + } while (false); + + Kokkos::fence(); + + Kokkos::finalize(); +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_finalize(); +#endif +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_finalize(); +#endif + MPI_Finalize(); + return 0; +} + +#undef CHECK_FOR_CORRECTNESS diff --git a/benchmarks/access_overhead/scripts/run_over_size.sh b/benchmarks/access_overhead/scripts/run_over_size.sh index 9569dd0..b6ceecc 100644 --- a/benchmarks/access_overhead/scripts/run_over_size.sh +++ b/benchmarks/access_overhead/scripts/run_over_size.sh @@ -15,11 +15,13 @@ FILENAME="${BENCHMARK}_${HASH}.res" echo $FILENAME echo "name,type,N,size,iters,time,gups,bw" | tee $FILENAME +export NVSHMEM_SYMMETRIC_SIZE=12884901888 + #run test over size SIZE=$DEFAULT_SIZE for S in $(seq 1 21); do for reps in $(seq 1 3); do - ./$BENCHMARK -N $SIZE -I $ITERS -M 0 | tee -a $FILENAME + CUDA_VISIBLE_DEVICES=1 ./$BENCHMARK -N $SIZE -I $ITERS -M 0 | tee -a $FILENAME done let SIZE=$SIZE*2 done @@ -28,7 +30,7 @@ done let SIZE=$DEFAULT_SIZE for S in $(seq 1 21); do for reps in $(seq 1 3); do - ./$BENCHMARK -N $SIZE -I $ITERS -M 1 | tee -a $FILENAME + CUDA_VISIBLE_DEVICES=1 ./$BENCHMARK -N $SIZE -I $ITERS -M 1 | tee -a $FILENAME done let SIZE=$SIZE*2 done @@ -37,7 +39,7 @@ done let SIZE=$DEFAULT_SIZE for S in $(seq 1 21); do for reps in $(seq 1 3); do - ./$BENCHMARK -N $SIZE -I $ITERS -M 2 | tee -a $FILENAME + CUDA_VISIBLE_DEVICES=1 ./$BENCHMARK -N $SIZE -I $ITERS -M 2 | tee -a $FILENAME done let SIZE=$SIZE*2 done diff --git a/benchmarks/access_overhead/scripts/run_over_size_p2p.sh b/benchmarks/access_overhead/scripts/run_over_size_p2p.sh index 34eb93b..512af76 100644 --- a/benchmarks/access_overhead/scripts/run_over_size_p2p.sh +++ b/benchmarks/access_overhead/scripts/run_over_size_p2p.sh @@ -2,7 +2,7 @@ BENCHMARK=$1 HOST1=$2 HOST2=$3 -DEFAULT_SIZE=33554432 #128 +DEFAULT_SIZE=128 #exports export OMP_PROC_BIND=spread @@ -13,7 +13,7 @@ ITERS=30 #NVLInk (=||=) DEVICE_ID_1=0 -DEVICE_ID_2=1 +DEVICE_ID_2=0 #XBus (Summit-like systems) #DEVICE_ID_1=0 @@ -28,7 +28,8 @@ FILENAME="${BENCHMARK}_${HASH}_p2p.res" echo $FILENAME echo "name,type,N,size,iters,time,gups,bw" | tee $FILENAME VARS0="--bind-to core --map-by socket" -VARS1="-x LD_LIBRARY_PATH=/projects/ppc64le-pwr9-rhel8/tpls/cuda/11.8.0/gcc/9.3.0/base/c3ajoqf/lib64/:$LD_LIBRARY_PATH -x NVSHMEM_SYMMETRIC_SIZE=12884901888" +#VARS1="-x LD_LIBRARY_PATH=/projects/ppc64le-pwr9-rhel8/tpls/cuda/11.8.0/gcc/9.3.0/base/c3ajoqf/lib64/:$LD_LIBRARY_PATH -x NVSHMEM_SYMMETRIC_SIZE=12884901888" +VARS1="--oversubscribe -x LD_LIBRARY_PATH=/home/jciesko/software/nvshmem_src_3.0.6-4_blake/install/lib:$LD_LIBRARY_PATH -x NVSHMEM_SYMMETRIC_SIZE=12884901888" # Some more potential optimizations #VARS1="" #-x UCX_WARN_UNUSED_ENV_VARS=n -x HCOLL_RCACHE=^ucs -x \ diff --git a/src/core/Kokkos_RemoteSpaces_LocalDeepCopy.hpp b/src/core/Kokkos_RemoteSpaces_LocalDeepCopy.hpp index 45c5d0f..4744410 100644 --- a/src/core/Kokkos_RemoteSpaces_LocalDeepCopy.hpp +++ b/src/core/Kokkos_RemoteSpaces_LocalDeepCopy.hpp @@ -119,6 +119,8 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( auto team_range = Kokkos::pair(size_type(start_offset), size_type(start_offset + team_block)); + if(team_range.first == team_range.second) return; //nothing to be done + // Construct per-team subviews auto src_subview = Kokkos::Impl::get_local_subview(src, team_range); auto dst_subview = Kokkos::Impl::get_local_subview(dst, team_range); diff --git a/src/core/Kokkos_RemoteSpaces_ViewMapping.hpp b/src/core/Kokkos_RemoteSpaces_ViewMapping.hpp index 71f4631..1d3fef2 100644 --- a/src/core/Kokkos_RemoteSpaces_ViewMapping.hpp +++ b/src/core/Kokkos_RemoteSpaces_ViewMapping.hpp @@ -343,8 +343,8 @@ class ViewMapping< typename view_type::size_type offset; offset = switch_to_local_indexing ? total_offset : local_offset; - dst.remote_view_props.total_offset = offset; - + dst.remote_view_props.total_offset = total_offset; + #ifdef KRS_ENABLE_MPISPACE // Subviews propagate MPI_Window of the original view dst.m_handle = ViewDataHandle::assign( @@ -397,10 +397,7 @@ class ViewMapping { KOKKOS_INLINE_FUNCTION auto get_ptr() const { - if (remote_view_props.using_local_indexing) return handle().ptr + remote_view_props.total_offset; - else - return handle().ptr; } template diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt index 4341181..041cbf5 100644 --- a/unit_tests/CMakeLists.txt +++ b/unit_tests/CMakeLists.txt @@ -12,13 +12,7 @@ if(NOT googletest_POPULATED) endif() SET(NAME KokkosRemoteSpaces_TestAll) - -#if (KRS_ENABLE_MPISPACE) -# FILE(GLOB TEST_SRCS *.cpp) -# list(FILTER TEST_SRCS EXCLUDE REGEX ".*Test_Atomic\\.cpp$") -#else() - FILE(GLOB TEST_SRCS *.cpp) -#endif() +FILE(GLOB TEST_SRCS *.cpp) add_executable(${NAME} ${TEST_SRCS}) diff --git a/unit_tests/Test_Empty.cpp b/unit_tests/Test_Empty.cpp index 37fb9e1..06c768e 100644 --- a/unit_tests/Test_Empty.cpp +++ b/unit_tests/Test_Empty.cpp @@ -21,6 +21,9 @@ using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; -void test_empty() { RemoteSpace_t::fence(); } +void test_empty() {} -TEST(TEST_CATEGORY, test_empty) { test_empty(); } +TEST(TEST_CATEGORY, test_empty) { + test_empty(); + RemoteSpace_t::fence(); +} diff --git a/unit_tests/Test_LocalDeepCopy.cpp b/unit_tests/Test_LocalDeepCopy.cpp index af110c2..19cfc65 100644 --- a/unit_tests/Test_LocalDeepCopy.cpp +++ b/unit_tests/Test_LocalDeepCopy.cpp @@ -295,7 +295,7 @@ void test_localdeepcopy_withSubview( prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1; next_rank = (my_rank + 1) % num_ranks; - if (num_ranks % 2 && num_ranks > 1) return; // skip + if (num_ranks % 2 || num_ranks < 2) return; // skip using ViewRemote_t = Kokkos::View; using ViewHost_t = typename ViewRemote_t::HostMirror; @@ -383,7 +383,7 @@ void test_localdeepcopy_withSubview( prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1; next_rank = (my_rank + 1) % num_ranks; - if (num_ranks % 2 && num_ranks > 1) return; // skip + if (num_ranks % 2 || num_ranks < 2) return; // skip using ViewRemote_t = Kokkos::View; using ViewHost_t = typename ViewRemote_t::HostMirror; @@ -412,7 +412,7 @@ void test_localdeepcopy_withSubview( // Copy from next if (my_rank % 2 == 0) { Kokkos::parallel_for( - "Team", TeamPolicy_t(team_sizes::big, 1), + "Team", TeamPolicy_t(team_sizes::small, 1), KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) { Kokkos::Experimental::RemoteSpaces::local_deep_copy( team, v_R_subview_local, v_R_subview_next); @@ -464,7 +464,7 @@ void test_localdeepcopy_withSubview( prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1; next_rank = (my_rank + 1) % num_ranks; - if (num_ranks % 2 && num_ranks > 1) return; // skip + if (num_ranks % 2 || num_ranks < 2) return; // skip using ViewRemote_t = Kokkos::View; using ViewHost_t = typename ViewRemote_t::HostMirror; @@ -554,7 +554,7 @@ void test_localdeepcopy_withSubview( prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1; next_rank = (my_rank + 1) % num_ranks; - if (num_ranks % 2 && num_ranks > 1) return; // skip + if (num_ranks % 2 || num_ranks < 2) return; // skip using ViewRemote_t = Kokkos::View; using ViewHost_t = typename ViewRemote_t::HostMirror; @@ -634,7 +634,7 @@ void test_localdeepcopy_withSubview( prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1; next_rank = (my_rank + 1) % num_ranks; - if (num_ranks % 2 && num_ranks > 1) return; // skip + if (num_ranks % 2 || num_ranks < 2) return; // skip using ViewRemote_t = Kokkos::View; using ViewHost_t = typename ViewRemote_t::HostMirror; @@ -722,7 +722,7 @@ void test_localdeepcopy_withSubview( prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1; next_rank = (my_rank + 1) % num_ranks; - if (num_ranks % 2 && num_ranks > 1) return; // skip + if (num_ranks % 2 || num_ranks < 2) return; // skip using ViewRemote_t = Kokkos::View; using ViewHost_t = typename ViewRemote_t::HostMirror; @@ -806,7 +806,7 @@ void test_localdeepcopy_withSubview( prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1; next_rank = (my_rank + 1) % num_ranks; - if (num_ranks % 2 && num_ranks > 1) return; // skip + if (num_ranks % 2 || num_ranks < 2) return; // skip using ViewRemote_t = Kokkos::View; using ViewHost_t = typename ViewRemote_t::HostMirror; @@ -896,7 +896,7 @@ void test_localdeepcopy_withSubview( prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1; next_rank = (my_rank + 1) % num_ranks; - if (num_ranks % 2 && num_ranks > 1) return; // skip + if (num_ranks % 2 || num_ranks < 2) return; // skip using ViewRemote_t = Kokkos::View; using ViewHost_t = typename ViewRemote_t::HostMirror; @@ -988,10 +988,10 @@ void test_localdeepcopy_withSubview( test_localdeepcopy_withSubview(12, 15); \ /* 2D with Subviews (put block transfer) */ \ - test_localdeepcopy_withSubview(5, 16); \ /* 2D with Subviews (get block transfer)*/ \ - test_localdeepcopy_withSubview(12, 15); \ /* 2D with Subviews (put block transfer)*/ \ test_localdeepcopy_withSubview +#include + +using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; + +#define is_Layout_PL(Layout) \ + std::enable_if_t::value> +#define is_Layout_GL(Layout) \ + std::enable_if_t::value> + +#define value(j, k, l, rank, range) \ + (l) + (k) * (size_l) + (j) * (size_l * size_k) + \ + rank *((range.second - range.first) * size_k * size_l) + +#define SUM(start, end, red) \ + for (int i = start; i <= end; i++) red += i; + +template +is_Layout_GL(Layout) test_mdrangepolicy(int x, int y, int z) { + int my_rank; + int num_ranks; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + int res = 0, res_ref = 0; + int next_rank = (my_rank + 1) % num_ranks; + + int size_j, size_k, size_l; + + size_j = x; + size_k = y; + size_l = z; + + using MyRangePolicy = Kokkos::MDRangePolicy>; + using ViewRemote_3D_t = Kokkos::View; + using Local3D_t = typename ViewRemote_3D_t::HostMirror; + + ViewRemote_3D_t view = ViewRemote_3D_t("RemoteView", size_j, size_k, size_l); + auto local_range = Kokkos::Experimental::get_local_range(size_j); + Local3D_t v_H("HostView", view.extent(0), size_k, size_l); + + MyRangePolicy local_md_range( + {0, 0, 0}, {local_range.second - local_range.first, size_k, size_l}); + + Kokkos::parallel_for( + "Init", local_md_range, + KOKKOS_LAMBDA(const int j, const int k, const int l) { + v_H(j, k, l) = value(j, k, l, my_rank, local_range); + }); + Kokkos::fence(); + Kokkos::deep_copy(view, v_H); + RemoteSpace_t::fence(); + + auto remote_range = + Kokkos::Experimental::get_range(size_j, (my_rank + 1) % num_ranks); + MyRangePolicy remote_md_range({remote_range.first, 0, 0}, + {remote_range.second, size_k, size_l}); + + Kokkos::parallel_reduce( + "Remote Access via View", remote_md_range, + KOKKOS_LAMBDA(const int j, const int k, const int l, int &tmp) { + tmp += view(j, k, l); + int val = view(j, k, l); + }, + res); + Kokkos::fence(); + RemoteSpace_t::fence(); + + int start = value(0, 0, 0, next_rank, remote_range); + int end = value((remote_range.second - remote_range.first) - 1, (size_k - 1), + (size_l - 1), next_rank, remote_range); + + SUM(start, end, res_ref); + ASSERT_EQ(res, res_ref); + Kokkos::fence(); +} + +template +is_Layout_PL(Layout) test_mdrangepolicy(int x, int y, int z) { + // tbd +} + +#define GEN_BLOCK(Type, Layout, Space) \ + test_mdrangepolicy(4, 5, 6); \ + test_mdrangepolicy(1, 2, 10); \ + test_mdrangepolicy(5, 2, 2); + +TEST(TEST_CATEGORY, test_mdrangepolicy) { + using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; + using PLL_t = Kokkos::PartitionedLayoutLeft; + using PLR_t = Kokkos::PartitionedLayoutRight; + using LL_t = Kokkos::LayoutLeft; + using LR_t = Kokkos::LayoutRight; + + GEN_BLOCK(int, PLL_t, RemoteSpace_t) + GEN_BLOCK(int, PLR_t, RemoteSpace_t) + GEN_BLOCK(double, PLL_t, RemoteSpace_t) + GEN_BLOCK(double, PLR_t, RemoteSpace_t) + + GEN_BLOCK(int, LL_t, RemoteSpace_t) + GEN_BLOCK(int, LR_t, RemoteSpace_t) + GEN_BLOCK(double, LL_t, RemoteSpace_t) + GEN_BLOCK(double, LL_t, RemoteSpace_t) + + RemoteSpace_t::fence(); +} diff --git a/unit_tests/Test_ViewInit.cpp b/unit_tests/Test_ViewInit.cpp new file mode 100644 index 0000000..0db3a2d --- /dev/null +++ b/unit_tests/Test_ViewInit.cpp @@ -0,0 +1,92 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +#include +#include + +using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; + +#define is_Layout_PL(Layout) \ + std::enable_if_t::value> +#define is_Layout_GL(Layout) \ + std::enable_if_t::value> +#define ZERO DataType(0) +#define ONE DataType(1) + +template +is_Layout_PL(Layout) test_viewinit(Args... args) { + int numRanks; + MPI_Comm_size(MPI_COMM_WORLD, &numRanks); + int err_out; + + using RemoteView_t = Kokkos::View; + RemoteView_t view("MyRemoteView", numRanks, args...); + RemoteSpace_t().fence(); + + Kokkos::parallel_reduce( + "Check zero", view.span(), + KOKKOS_LAMBDA(int i, int& err) { + err += view.data()[i] == ZERO ? ZERO : ONE; + }, + err_out); + + ASSERT_EQ(err_out, ZERO); +} + +template +is_Layout_GL(Layout) test_viewinit(Args... args) { + int err_out; + using RemoteView_t = Kokkos::View; + RemoteView_t view("MyRemoteView", args...); + RemoteSpace_t().fence(); + + Kokkos::parallel_reduce( + "Check zero", view.span(), + KOKKOS_LAMBDA(int i, int& err) { + err += view.data()[i] == ZERO ? ZERO : ONE; + }, + err_out); + + ASSERT_EQ(err_out, ZERO); +} + +#define GENBLOCK(TYPE, LAYOUT, SPACE) \ + test_viewinit(1); \ + test_viewinit(4567); \ + test_viewinit(45617); \ + test_viewinit(1, 3); \ + test_viewinit(23, 12); \ + test_viewinit(1, 5617); + +TEST(TEST_CATEGORY, test_viewinit) { + using PLL_t = Kokkos::PartitionedLayoutLeft; + using PLR_t = Kokkos::PartitionedLayoutRight; + using LL_t = Kokkos::LayoutLeft; + using LR_t = Kokkos::LayoutRight; + + GENBLOCK(int, PLL_t, RemoteSpace_t) + GENBLOCK(int, PLR_t, RemoteSpace_t) + GENBLOCK(double, PLL_t, RemoteSpace_t) + GENBLOCK(double, PLR_t, RemoteSpace_t) + GENBLOCK(int, LL_t, RemoteSpace_t) + GENBLOCK(int, LR_t, RemoteSpace_t) + GENBLOCK(double, LL_t, RemoteSpace_t) + GENBLOCK(double, LR_t, RemoteSpace_t) + + RemoteSpace_t::fence(); +}