Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/build-with-kokkos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
env:
Kokkos_ROOT: /opt/kokkos
VARIORUM_ROOT: /opt/variorum
ROCM_PATH: /opt/rocm
steps:
- name: Checkout Kokkos Tools
uses: actions/checkout@v4
Expand Down
37 changes: 37 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,43 @@ option(KokkosTools_ENABLE_SYSTEMTAP "Enable SystemTap support" OFF)
# Optional example programs; off by default.
option(KokkosTools_ENABLE_EXAMPLES "Build examples" OFF)
# Optional test suite; off by default.
option(KokkosTools_ENABLE_TESTS "Build tests" OFF)

# ROCm/HIP discovery support.
#
# When Kokkos was built with HIP support, find_package(Kokkos) needs to be able
# to locate the ROCm CMake packages. The ROCm root is taken from the ROCM_PATH
# environment variable (read at configure time); if that is unset or empty and
# /opt/rocm exists, /opt/rocm is used instead. When neither is available,
# ROCM_PATH_TO_USE stays empty and nothing below is changed.
set(ROCM_PATH_TO_USE "")
if(NOT "$ENV{ROCM_PATH}" STREQUAL "")
  # An unset environment variable expands to the empty string, so this single
  # comparison covers both "not defined" and "defined but empty".
  set(ROCM_PATH_TO_USE "$ENV{ROCM_PATH}")
elseif(EXISTS "/opt/rocm")
  # Fallback to default ROCm installation path
  set(ROCM_PATH_TO_USE "/opt/rocm")
  message(STATUS "ROCM_PATH not set, using default: /opt/rocm")
endif()

if(NOT "${ROCM_PATH_TO_USE}" STREQUAL "")
  message(STATUS "Configuring ROCm paths from: ${ROCM_PATH_TO_USE}")

  # Search prefixes covering the locations where ROCm CMake configs may live.
  list(APPEND CMAKE_PREFIX_PATH
    "${ROCM_PATH_TO_USE}"
    "${ROCM_PATH_TO_USE}/lib/cmake"
    "${ROCM_PATH_TO_USE}/hip/lib/cmake"
    "${ROCM_PATH_TO_USE}/lib/cmake/hip"
  )

  # Provide hip_DIR as a hint for find_package(hip), unless it is already
  # defined. The first candidate directory that exists wins.
  if(NOT DEFINED hip_DIR)
    set(kokkostools_hip_config_found FALSE)
    foreach(kokkostools_hip_candidate IN ITEMS
        "${ROCM_PATH_TO_USE}/lib/cmake/hip"
        "${ROCM_PATH_TO_USE}/hip/lib/cmake/hip")
      if(EXISTS "${kokkostools_hip_candidate}")
        set(hip_DIR "${kokkostools_hip_candidate}" CACHE PATH "Path to hip CMake config")
        message(STATUS "Setting hip_DIR hint: ${hip_DIR}")
        set(kokkostools_hip_config_found TRUE)
        break()
      endif()
    endforeach()
    if(NOT kokkostools_hip_config_found)
      message(STATUS "hip CMake config not found in standard locations under ${ROCM_PATH_TO_USE}")
    endif()
    # Do not leak the bookkeeping variables into the rest of the file.
    unset(kokkostools_hip_config_found)
    unset(kokkostools_hip_candidate)
  endif()

  message(STATUS "CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}")
endif()

# Fetch Kokkos options:
acquire_kokkos_config()
if(DEFINED Kokkos_FOUND_MSG)
Expand Down
40 changes: 23 additions & 17 deletions profiling/nvtx-connector/kp_nvtx_connector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,10 @@ void kokkosp_stop_profile_section(const uint32_t sID) {
nvtxRangeEnd(section.id);
}

// Profile-section destroy callback for the NVTX connector.
// Deliberately does nothing; the section ID is ignored, so the parameter is
// left unnamed to avoid an unused-parameter warning.
void kokkosp_destroy_profile_section(const uint32_t /*sID*/) {
  // NVTX ranges are automatically managed, no explicit destroy needed
}

// Profile-event callback: forwards the event name unchanged to nvtxMarkA.
void kokkosp_profile_event(const char* name) {
  nvtxMarkA(name);
}

void kokkosp_begin_fence(const char* name, const uint32_t deviceId,
Expand Down Expand Up @@ -147,23 +151,24 @@ Kokkos::Tools::Experimental::EventSet get_event_set() {
Kokkos::Tools::Experimental::EventSet my_event_set;
memset(&my_event_set, 0,
sizeof(my_event_set)); // zero any pointers not set here
my_event_set.request_tool_settings = kokkosp_request_tool_settings;
my_event_set.init = kokkosp_init_library;
my_event_set.finalize = kokkosp_finalize_library;
my_event_set.push_region = kokkosp_push_profile_region;
my_event_set.pop_region = kokkosp_pop_profile_region;
my_event_set.begin_parallel_for = kokkosp_begin_parallel_for;
my_event_set.begin_parallel_reduce = kokkosp_begin_parallel_reduce;
my_event_set.begin_parallel_scan = kokkosp_begin_parallel_scan;
my_event_set.end_parallel_for = kokkosp_end_parallel_for;
my_event_set.end_parallel_reduce = kokkosp_end_parallel_reduce;
my_event_set.end_parallel_scan = kokkosp_end_parallel_scan;
my_event_set.create_profile_section = kokkosp_create_profile_section;
my_event_set.start_profile_section = kokkosp_start_profile_section;
my_event_set.stop_profile_section = kokkosp_stop_profile_section;
my_event_set.profile_event = kokkosp_profile_event;
my_event_set.begin_fence = kokkosp_begin_fence;
my_event_set.end_fence = kokkosp_end_fence;
my_event_set.request_tool_settings = kokkosp_request_tool_settings;
my_event_set.init = kokkosp_init_library;
my_event_set.finalize = kokkosp_finalize_library;
my_event_set.push_region = kokkosp_push_profile_region;
my_event_set.pop_region = kokkosp_pop_profile_region;
my_event_set.begin_parallel_for = kokkosp_begin_parallel_for;
my_event_set.begin_parallel_reduce = kokkosp_begin_parallel_reduce;
my_event_set.begin_parallel_scan = kokkosp_begin_parallel_scan;
my_event_set.end_parallel_for = kokkosp_end_parallel_for;
my_event_set.end_parallel_reduce = kokkosp_end_parallel_reduce;
my_event_set.end_parallel_scan = kokkosp_end_parallel_scan;
my_event_set.create_profile_section = kokkosp_create_profile_section;
my_event_set.start_profile_section = kokkosp_start_profile_section;
my_event_set.stop_profile_section = kokkosp_stop_profile_section;
my_event_set.destroy_profile_section = kokkosp_destroy_profile_section;
my_event_set.profile_event = kokkosp_profile_event;
my_event_set.begin_fence = kokkosp_begin_fence;
my_event_set.end_fence = kokkosp_end_fence;
return my_event_set;
}

Expand All @@ -188,6 +193,7 @@ EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce)
// Registration of the profile-section, profile-event and fence callbacks:
// each EXPOSE_* macro is handed the matching impl:: function.
// NOTE(review): the EXPOSE_* macros are defined outside this view — confirm
// which symbol each one exports before renaming any callback.
EXPOSE_CREATE_PROFILE_SECTION(impl::kokkosp_create_profile_section)
EXPOSE_START_PROFILE_SECTION(impl::kokkosp_start_profile_section)
EXPOSE_STOP_PROFILE_SECTION(impl::kokkosp_stop_profile_section)
EXPOSE_DESTROY_PROFILE_SECTION(impl::kokkosp_destroy_profile_section)
EXPOSE_PROFILE_EVENT(impl::kokkosp_profile_event);
EXPOSE_BEGIN_FENCE(impl::kokkosp_begin_fence);
EXPOSE_END_FENCE(impl::kokkosp_end_fence);
Expand Down
116 changes: 108 additions & 8 deletions profiling/vtune-connector/kp_vtune_connector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,42 @@
#include <cstring>
#include <unordered_map>
#include <string>
#include <vector>
#include <stack>

#include "kp_core.hpp"
#include "kp_vtune_connector_domain.h"

// File-local bookkeeping for profile sections and profile regions.
namespace {
// One user-created profile section: the name it was created with and the ITT
// domain obtained for that name.
struct Section {
// Section name as passed to kokkosp_create_profile_section.
std::string label;
// Domain returned by __itt_domain_create for this section's name.
__itt_domain* domain;
};
// Every section created so far. A section's ID is its index in this vector
// (kokkosp_create_profile_section hands out size() before appending).
std::vector<Section> kokkosp_sections;
// Domains of the profile regions that have been pushed and not yet popped;
// the most recently pushed region is on top.
std::stack<__itt_domain*> kokkosp_region_stack;
} // namespace

namespace KokkosTools {
namespace VTuneConnector {

// Kernel currently being tracked; reset to NULL when a parallel_reduce ends.
// NOTE(review): where it is assigned is outside this view — confirm.
static KernelVTuneConnectorInfo* currentKernel;
// Lookup from a string key to its KernelVTuneConnectorInfo.
// NOTE(review): presumably keyed by kernel name — populated outside this
// view, confirm against the begin_parallel_* callbacks.
static std::unordered_map<std::string, KernelVTuneConnectorInfo*> domain_map;
// NOTE(review): presumably the next kernel ID to hand out — usage is not
// visible here, confirm.
static uint64_t nextKernelID;
// Whether the tool asks Kokkos for global fencing. Set in kokkosp_init_library
// from the KOKKOS_TOOLS_GLOBALFENCES environment variable and reported by
// kokkosp_request_tool_settings.
static bool tool_globfences = false;

// Tool-settings callback: tells Kokkos whether this tool requires global
// fencing. The first argument is unused.
void kokkosp_request_tool_settings(const uint32_t,
                                   Kokkos_Tools_ToolSettings* settings) {
  // tool_globfences is filled in by kokkosp_init_library from the
  // KOKKOS_TOOLS_GLOBALFENCES environment variable.
  (*settings).requires_global_fencing = tool_globfences;
}

void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer,
const uint32_t devInfoCount,
Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) {
const char* tool_global_fences = getenv("KOKKOS_TOOLS_GLOBALFENCES");
if (NULL != tool_global_fences) {
tool_globfences = (atoi(tool_global_fences) != 0);
}

printf("-----------------------------------------------------------\n");
printf("KokkosP: VTune Analyzer Connector (sequence is %d, version: %llu)\n",
loadSeq, interfaceVer);
Expand Down Expand Up @@ -129,18 +151,86 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) {
currentKernel = NULL;
}

// Push-region callback: creates (or looks up) an ITT domain named after the
// region, begins a frame in it and remembers the domain so the matching pop
// can end that frame.
//
// __itt_domain_create may yield NULL (for example when the ITT API is
// disabled or no collector is attached). In that case no frame is started,
// but a NULL entry is still pushed so that pushes and pops stay paired.
void kokkosp_push_profile_region(const char* name) {
  __itt_domain* domain = __itt_domain_create(name);
  if (domain != NULL) {
    domain->flags = 1;
    __itt_frame_begin_v3(domain, NULL);
  }
  kokkosp_region_stack.push(domain);
}

// Pop-region callback: ends the frame of the most recently pushed region.
// A pop with no open region is ignored, as is an entry whose domain could not
// be created at push time.
void kokkosp_pop_profile_region() {
  if (kokkosp_region_stack.empty()) return;

  __itt_domain* domain = kokkosp_region_stack.top();
  kokkosp_region_stack.pop();
  if (domain != NULL) {
    __itt_frame_end_v3(domain, NULL);
  }
}

// Create-section callback: registers a new profile section and returns its ID
// through sID. The ID is the section's index in kokkosp_sections.
//
// __itt_domain_create may yield NULL (for example when the ITT API is
// disabled or no collector is attached). The section is still recorded so
// that IDs remain dense; start/stop then skip it.
void kokkosp_create_profile_section(const char* name, uint32_t* sID) {
  // size() is size_t; the callback interface uses 32-bit IDs.
  *sID = static_cast<uint32_t>(kokkosp_sections.size());
  __itt_domain* domain = __itt_domain_create(name);
  if (domain != NULL) {
    domain->flags = 1;
  }
  kokkosp_sections.push_back({std::string(name), domain});
}

// Start-section callback: begins a frame in the section's domain.
// Unknown IDs and sections without a domain are ignored.
void kokkosp_start_profile_section(const uint32_t sID) {
  if (sID >= kokkosp_sections.size()) return;
  __itt_domain* domain = kokkosp_sections[sID].domain;
  if (domain != NULL) {
    __itt_frame_begin_v3(domain, NULL);
  }
}

// Stop-section callback: ends the frame in the section's domain.
// Unknown IDs and sections without a domain are ignored.
void kokkosp_stop_profile_section(const uint32_t sID) {
  if (sID >= kokkosp_sections.size()) return;
  __itt_domain* domain = kokkosp_sections[sID].domain;
  if (domain != NULL) {
    __itt_frame_end_v3(domain, NULL);
  }
}

// Destroy-section callback for the VTune connector.
// Deliberately does nothing; the section ID is ignored, so the parameter is
// left unnamed (as kokkosp_begin_fence does for its unused deviceId) to avoid
// an unused-parameter warning.
void kokkosp_destroy_profile_section(const uint32_t /*sID*/) {
  // VTune domains are not explicitly destroyed
}

void kokkosp_profile_event(const char* name) {
__itt_event event = __itt_event_create(name, strlen(name));
__itt_event_start(event);
}

// Begin-fence callback: creates (or looks up) an ITT domain named after the
// fence, begins a frame in it and returns the domain pointer through *handle
// so that kokkosp_end_fence can end the same frame.
//
// __itt_domain_create may yield NULL (for example when the ITT API is
// disabled or no collector is attached). In that case no frame is started and
// the handle is set to 0, which kokkosp_end_fence treats as "nothing to end".
void kokkosp_begin_fence(const char* name, const uint32_t /*deviceId*/,
                         uint64_t* handle) {
  __itt_domain* domain = __itt_domain_create(name);
  if (domain == NULL) {
    *handle = 0;
    return;
  }
  domain->flags = 1;
  __itt_frame_begin_v3(domain, NULL);
  // Store domain in handle for use in end_fence
  // This is not ideal but VTune API doesn't provide a better way
  *handle = reinterpret_cast<uint64_t>(domain);
}

// End-fence callback: ends the frame started by kokkosp_begin_fence.
// handle is the value that kokkosp_begin_fence stored; 0 means no frame was
// started and is ignored.
void kokkosp_end_fence(uint64_t handle) {
  if (handle == 0) return;
  __itt_domain* domain = reinterpret_cast<__itt_domain*>(handle);
  __itt_frame_end_v3(domain, NULL);
}

// Builds the table of callbacks this connector implements.
// Every member is zeroed first, so callbacks that are not assigned below stay
// null. Each callback is assigned exactly once.
Kokkos::Tools::Experimental::EventSet get_event_set() {
  Kokkos::Tools::Experimental::EventSet my_event_set;
  memset(&my_event_set, 0,
         sizeof(my_event_set));  // zero any pointers not set here
  // Tool settings and library lifetime.
  my_event_set.request_tool_settings   = kokkosp_request_tool_settings;
  my_event_set.init                    = kokkosp_init_library;
  my_event_set.finalize                = kokkosp_finalize_library;
  // Nested profile regions.
  my_event_set.push_region             = kokkosp_push_profile_region;
  my_event_set.pop_region              = kokkosp_pop_profile_region;
  // Parallel kernels.
  my_event_set.begin_parallel_for      = kokkosp_begin_parallel_for;
  my_event_set.begin_parallel_reduce   = kokkosp_begin_parallel_reduce;
  my_event_set.begin_parallel_scan     = kokkosp_begin_parallel_scan;
  my_event_set.end_parallel_for        = kokkosp_end_parallel_for;
  my_event_set.end_parallel_reduce     = kokkosp_end_parallel_reduce;
  my_event_set.end_parallel_scan       = kokkosp_end_parallel_scan;
  // Profile sections.
  my_event_set.create_profile_section  = kokkosp_create_profile_section;
  my_event_set.start_profile_section   = kokkosp_start_profile_section;
  my_event_set.stop_profile_section    = kokkosp_stop_profile_section;
  my_event_set.destroy_profile_section = kokkosp_destroy_profile_section;
  // Events and fences.
  my_event_set.profile_event           = kokkosp_profile_event;
  my_event_set.begin_fence             = kokkosp_begin_fence;
  my_event_set.end_fence               = kokkosp_end_fence;
  return my_event_set;
}

Expand All @@ -151,13 +241,23 @@ extern "C" {

// Short alias for the namespace that holds this connector's callbacks.
namespace impl = KokkosTools::VTuneConnector;

// Registration of every callback implemented by the VTune connector: each
// EXPOSE_* macro is handed the matching impl:: function.
// NOTE(review): the EXPOSE_* macros are defined outside this view — confirm
// which symbol each one exports before renaming any callback.
EXPOSE_TOOL_SETTINGS(impl::kokkosp_request_tool_settings)
EXPOSE_INIT(impl::kokkosp_init_library)
EXPOSE_FINALIZE(impl::kokkosp_finalize_library)
EXPOSE_PUSH_REGION(impl::kokkosp_push_profile_region)
EXPOSE_POP_REGION(impl::kokkosp_pop_profile_region)
EXPOSE_BEGIN_PARALLEL_FOR(impl::kokkosp_begin_parallel_for)
EXPOSE_END_PARALLEL_FOR(impl::kokkosp_end_parallel_for)
EXPOSE_BEGIN_PARALLEL_SCAN(impl::kokkosp_begin_parallel_scan)
EXPOSE_END_PARALLEL_SCAN(impl::kokkosp_end_parallel_scan)
EXPOSE_BEGIN_PARALLEL_REDUCE(impl::kokkosp_begin_parallel_reduce)
EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce)
EXPOSE_CREATE_PROFILE_SECTION(impl::kokkosp_create_profile_section)
EXPOSE_START_PROFILE_SECTION(impl::kokkosp_start_profile_section)
EXPOSE_STOP_PROFILE_SECTION(impl::kokkosp_stop_profile_section)
EXPOSE_DESTROY_PROFILE_SECTION(impl::kokkosp_destroy_profile_section)
EXPOSE_PROFILE_EVENT(impl::kokkosp_profile_event);
EXPOSE_BEGIN_FENCE(impl::kokkosp_begin_fence);
EXPOSE_END_FENCE(impl::kokkosp_end_fence);

} // extern "C"
Loading