diff --git a/hipamd/hip-config-amd.cmake b/hipamd/hip-config-amd.cmake
index b27029be8c..00f05dc9c9 100755
--- a/hipamd/hip-config-amd.cmake
+++ b/hipamd/hip-config-amd.cmake
@@ -178,6 +178,12 @@ if( CLANGRT_Error )
 else()
   # Add support for __fp16 and _Float16, explicitly link with compiler-rt
   if( "${CLANGRT_BUILTINS_FETCH_EXIT_CODE}" STREQUAL "0" )
+    # The HIP_CXX_COMPILER by default prefers backward slashes for path separators on Windows.
+    # Prefer forward slashes here to avoid escaping issues on certain build systems.
+    if(WIN32)
+      string(REPLACE "\\" "/" CLANGRT_BUILTINS ${CLANGRT_BUILTINS})
+    endif()
+
     # CLANG_RT Builtins found Successfully Set interface link libraries property
     set_property(TARGET hip::host APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}")
     set_property(TARGET hip::device APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}")
diff --git a/hipamd/include/hip/amd_detail/amd_hip_atomic.h b/hipamd/include/hip/amd_detail/amd_hip_atomic.h
index c02a57b079..e35a79abdc 100644
--- a/hipamd/include/hip/amd_detail/amd_hip_atomic.h
+++ b/hipamd/include/hip/amd_detail/amd_hip_atomic.h
@@ -612,11 +612,17 @@ float atomicMin(float* addr, float val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
   return unsafeAtomicMin(addr, val);
 #else
+  typedef union u_hold {
+    float a;
+    unsigned int b;
+  } u_hold_t;
+  u_hold_t u{val};
+  bool neg_zero = 0x80000000U == u.b;
 #if __has_builtin(__hip_atomic_load) && \
     __has_builtin(__hip_atomic_compare_exchange_strong)
   float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   bool done = false;
-  while (!done && value > val) {
+  while (!done && (value > val || (neg_zero && value == 0.0f))) {
     done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
                                                 __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   }
@@ -625,7 +631,7 @@ float atomicMin(float* addr, float val) {
   unsigned int *uaddr = (unsigned int *)addr;
   unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
   bool done = false;
-  while (!done && __uint_as_float(value) > val) {
+  while (!done && (__uint_as_float(value) > val || (neg_zero && __uint_as_float(value) == 0.0f))) {
     done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
                                        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
   }
@@ -658,11 +664,17 @@ double atomicMin(double* addr, double val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
   return unsafeAtomicMin(addr, val);
 #else
+  typedef union u_hold {
+    double a;
+    unsigned long long b;
+  } u_hold_t;
+  u_hold_t u{val};
+  bool neg_zero = 0x8000000000000000ULL == u.b;
 #if __has_builtin(__hip_atomic_load) && \
     __has_builtin(__hip_atomic_compare_exchange_strong)
   double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   bool done = false;
-  while (!done && value > val) {
+  while (!done && (value > val || (neg_zero && value == 0.0))) {
     done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
                                                 __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   }
@@ -671,7 +683,8 @@ double atomicMin(double* addr, double val) {
   unsigned long long *uaddr = (unsigned long long *)addr;
   unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
   bool done = false;
-  while (!done && __longlong_as_double(value) > val) {
+  while (!done &&
+         (__longlong_as_double(value) > val || (neg_zero && __longlong_as_double(value) == 0.0))) {
     done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
                                        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
   }
@@ -856,11 +869,17 @@ float atomicMax(float* addr, float
val) { #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) return unsafeAtomicMax(addr, val); #else + typedef union u_hold { + float a; + unsigned int b; + } u_hold_t; + u_hold_t u{val}; + bool neg_zero = 0x80000000U == u.b; #if __has_builtin(__hip_atomic_load) && \ __has_builtin(__hip_atomic_compare_exchange_strong) float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); bool done = false; - while (!done && value < val) { + while (!done && (value < val || (neg_zero && value == 0.0f))) { done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } @@ -869,7 +888,7 @@ float atomicMax(float* addr, float val) { unsigned int *uaddr = (unsigned int *)addr; unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); bool done = false; - while (!done && __uint_as_float(value) < val) { + while (!done && (__uint_as_float(value) < val || (neg_zero && __uint_as_float(value) == 0.0f))) { done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } @@ -902,11 +921,17 @@ double atomicMax(double* addr, double val) { #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) return unsafeAtomicMax(addr, val); #else + typedef union u_hold { + double a; + unsigned long long b; + } u_hold_t; + u_hold_t u{val}; + bool neg_zero = 0x8000000000000000ULL == u.b; #if __has_builtin(__hip_atomic_load) && \ __has_builtin(__hip_atomic_compare_exchange_strong) double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); bool done = false; - while (!done && value < val) { + while (!done && (value < val || (neg_zero && value == 0.0))) { done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } @@ -915,7 +940,8 @@ double atomicMax(double* addr, double val) { unsigned long long *uaddr = (unsigned long long *)addr; unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); bool done = false; - while (!done && __longlong_as_double(value) < val) { + while (!done && + (__longlong_as_double(value) < val || (neg_zero && __longlong_as_double(value) == 0.0))) { done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } diff --git a/hipamd/include/hip/amd_detail/amd_warp_functions.h b/hipamd/include/hip/amd_detail/amd_warp_functions.h index 64c2740132..98f8896cd9 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_functions.h @@ -103,14 +103,16 @@ unsigned long long int __ballot64(int predicate) { return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE); } +// See amd_warp_sync_functions.h for an explanation of this preprocessor flag. +#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS // Since threads in a wave do not make independent progress, __activemask() // always returns the exact active mask, i.e, all active threads in the wave. - __device__ inline unsigned long long __activemask() { return __ballot(true); } +#endif // HIP_ENABLE_WARP_SYNC_BUILTINS __device__ static inline unsigned int __lane_id() { return __builtin_amdgcn_mbcnt_hi( diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index 5ce2581a8d..b8c67a8972 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -22,6 +22,15 @@ THE SOFTWARE. 
#pragma once +// Warp sync builtins (with explicit mask argument) introduced in ROCm 6.1 as a +// preview to allow end-users to adapt to the new interface involving 64-bit +// masks. These are disabled by default, and can be enabled by setting the macro +// below. The builtins will be enabled unconditionally in ROCm 6.2. +// +// This arrangement also applies to the __activemask() builtin defined in +// amd_warp_functions.h. +#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS + #if !defined(__HIPCC_RTC__) #include "amd_warp_functions.h" #include "hip_assert.h" @@ -259,3 +268,5 @@ T __shfl_xor_sync(MaskT mask, T var, int laneMask, #undef __hip_do_sync #undef __hip_check_mask + +#endif // HIP_ENABLE_WARP_SYNC_BUILTINS diff --git a/hipamd/include/hip/amd_detail/hip_api_trace.hpp b/hipamd/include/hip/amd_detail/hip_api_trace.hpp index 7bae717bf5..21f37258da 100644 --- a/hipamd/include/hip/amd_detail/hip_api_trace.hpp +++ b/hipamd/include/hip/amd_detail/hip_api_trace.hpp @@ -936,6 +936,7 @@ typedef hipError_t (*t_hipGraphExecExternalSemaphoresWaitNodeSetParams)(hipGraph typedef hipError_t (*t_hipGraphAddNode)(hipGraphNode_t *pGraphNode, hipGraph_t graph, const hipGraphNode_t *pDependencies, size_t numDependencies, hipGraphNodeParams *nodeParams); +typedef hipError_t (*t_hipExtGetLastError)(); // HIP Compiler dispatch table struct HipCompilerDispatchTable { @@ -1396,4 +1397,5 @@ struct HipDispatchTable { t_hipGraphExecExternalSemaphoresSignalNodeSetParams hipGraphExecExternalSemaphoresSignalNodeSetParams_fn; t_hipGraphExecExternalSemaphoresWaitNodeSetParams hipGraphExecExternalSemaphoresWaitNodeSetParams_fn; t_hipGraphAddNode hipGraphAddNode_fn; + t_hipExtGetLastError hipExtGetLastError_fn; }; diff --git a/hipamd/src/CMakeLists.txt b/hipamd/src/CMakeLists.txt index b038fd72dc..b2bf28bbaa 100644 --- a/hipamd/src/CMakeLists.txt +++ b/hipamd/src/CMakeLists.txt @@ -59,6 +59,9 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") if(BUILD_SHARED_LIBS) add_library(amdhip64 SHARED) + if(WIN32) + set_target_properties(amdhip64 PROPERTIES RUNTIME_OUTPUT_NAME "amdhip64_${HIP_VERSION_MAJOR}") + endif() # Windows doesn't have a strip utility, so CMAKE_STRIP won't be set. 
if((CMAKE_BUILD_TYPE STREQUAL "Release") AND NOT ("${CMAKE_STRIP}" STREQUAL "")) add_custom_command(TARGET amdhip64 POST_BUILD COMMAND ${CMAKE_STRIP} $) diff --git a/hipamd/src/amdhip.def b/hipamd/src/amdhip.def index 2133815d86..f7ac5df298 100644 --- a/hipamd/src/amdhip.def +++ b/hipamd/src/amdhip.def @@ -427,7 +427,6 @@ hipMemcpy2DAsync_spt hipMemcpyFromSymbolAsync_spt hipMemcpyToSymbolAsync_spt hipMemcpyFromArray_spt -hipMemcpy2DToArray_spt hipMemcpy2DFromArrayAsync_spt hipMemcpy2DToArrayAsync_spt hipDrvGetErrorName diff --git a/hipamd/src/hip_api_trace.cpp b/hipamd/src/hip_api_trace.cpp index 2ec7ab86d7..c06682fbdd 100644 --- a/hipamd/src/hip_api_trace.cpp +++ b/hipamd/src/hip_api_trace.cpp @@ -756,6 +756,7 @@ hipError_t hipGraphExternalSemaphoresWaitNodeSetParams( hipGraphNode_t hNode, const hipExternalSemaphoreWaitNodeParams* nodeParams); hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* launchParamsList, unsigned int numDevices, unsigned int flags); +hipError_t hipExtGetLastError(); } // namespace hip namespace hip { @@ -808,6 +809,7 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) { ptrDispatchTable->hipCtxSetSharedMemConfig_fn = hip::hipCtxSetSharedMemConfig; ptrDispatchTable->hipCtxSynchronize_fn = hip::hipCtxSynchronize; ptrDispatchTable->hipDestroyExternalMemory_fn = hip::hipDestroyExternalMemory; + ptrDispatchTable->hipDestroyExternalSemaphore_fn = hip::hipDestroyExternalSemaphore; ptrDispatchTable->hipDestroySurfaceObject_fn = hip::hipDestroySurfaceObject; ptrDispatchTable->hipDestroyTextureObject_fn = hip::hipDestroyTextureObject; ptrDispatchTable->hipDeviceCanAccessPeer_fn = hip::hipDeviceCanAccessPeer; @@ -1224,6 +1226,7 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) { ptrDispatchTable->hipGetStreamDeviceId_fn = hip::hipGetStreamDeviceId; ptrDispatchTable->hipDrvGraphAddMemsetNode_fn = hip::hipDrvGraphAddMemsetNode; ptrDispatchTable->hipGetDevicePropertiesR0000_fn = hip::hipGetDevicePropertiesR0000; + ptrDispatchTable->hipExtGetLastError_fn = hip::hipExtGetLastError; } #if HIP_ROCPROFILER_REGISTER > 0 diff --git a/hipamd/src/hip_context.cpp b/hipamd/src/hip_context.cpp index 47569f0669..4b7f5c8098 100644 --- a/hipamd/src/hip_context.cpp +++ b/hipamd/src/hip_context.cpp @@ -90,7 +90,7 @@ void setCurrentDevice(unsigned int index) { hip::Stream* getStream(hipStream_t stream, bool wait) { if (stream == nullptr) { - return getNullStream(); + return getNullStream(wait); } else { hip::Stream* hip_stream = reinterpret_cast(stream); if (wait && !(hip_stream->Flags() & hipStreamNonBlocking)) { @@ -128,9 +128,9 @@ int getDeviceID(amd::Context& ctx) { } // ================================================================================================ -hip::Stream* getNullStream() { +hip::Stream* getNullStream(bool wait ) { Device* device = getCurrentDevice(); - return device ? device->NullStream() : nullptr; + return device ? 
device->NullStream(wait) : nullptr; } hipError_t hipInit(unsigned int flags) { diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp index 5657fd6371..bcf6830f12 100644 --- a/hipamd/src/hip_device.cpp +++ b/hipamd/src/hip_device.cpp @@ -30,7 +30,7 @@ namespace hip { // ================================================================================================ -hip::Stream* Device::NullStream() { +hip::Stream* Device::NullStream(bool wait) { if (null_stream_ == nullptr) { null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true); } @@ -38,8 +38,10 @@ hip::Stream* Device::NullStream() { if (null_stream_ == nullptr) { return nullptr; } - // Wait for all active streams before executing commands on the default - iHipWaitActiveStreams(null_stream_); + if (wait == true) { + // Wait for all active streams before executing commands on the default + iHipWaitActiveStreams(null_stream_); + } return null_stream_; } @@ -463,6 +465,8 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_tR0600* props, int device) { deviceProps.timelineSemaphoreInteropSupported = 0; deviceProps.unifiedFunctionPointers = 0; + deviceProps.integrated = info.accelerator_; + *props = deviceProps; return hipSuccess; } diff --git a/hipamd/src/hip_embed_pch.sh b/hipamd/src/hip_embed_pch.sh index 2d998407ae..9b01da8726 100755 --- a/hipamd/src/hip_embed_pch.sh +++ b/hipamd/src/hip_embed_pch.sh @@ -203,7 +203,7 @@ __hipRTC_header_size: EOF set -x - $LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib --hip-version=4.4 -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -D__HIPCC_RTC__ -x hip $tmp/hipRTC_header.h -E -o $tmp/hiprtc && + $LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib --hip-version=4.4 -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -D__HIPCC_RTC__ -x hip $tmp/hipRTC_header.h -E -P -o $tmp/hiprtc && cat $macroFile >> $tmp/hiprtc && $LLVM_DIR/bin/llvm-mc -o $tmp/hiprtc_header.o $tmp/hipRTC_header.mcin --filetype=obj && $LLVM_DIR/bin/clang $tmp/hiprtc_header.o -o $rtc_shared_lib_out -shared && diff --git a/hipamd/src/hip_event.cpp b/hipamd/src/hip_event.cpp index d233e42438..f527766e4b 100644 --- a/hipamd/src/hip_event.cpp +++ b/hipamd/src/hip_event.cpp @@ -85,8 +85,6 @@ hipError_t Event::synchronize() { event_->awaitCompletion(); } } - // Release freed memory for all memory pools on the device - hip_device->ReleaseFreedMemory(); return hipSuccess; } @@ -443,7 +441,12 @@ hipError_t hipEventSynchronize(hipEvent_t event) { if (hip::Stream::StreamCaptureOngoing(e->GetCaptureStream()) == true) { HIP_RETURN(hipErrorStreamCaptureUnsupported); } - HIP_RETURN(e->synchronize()); + + hipError_t status = e->synchronize(); + // Release freed memory for all memory pools on the device + g_devices[e->deviceId()]->ReleaseFreedMemory(); + + HIP_RETURN(status); } hipError_t ihipEventQuery(hipEvent_t event) { diff --git a/hipamd/src/hip_fatbin.cpp b/hipamd/src/hip_fatbin.cpp index 9d5591d053..ac8775597d 100644 --- a/hipamd/src/hip_fatbin.cpp +++ b/hipamd/src/hip_fatbin.cpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -115,10 +115,9 @@ void ListAllDeviceWithNoCOFromBundle(const std::unordered_map& devices) { - amd_comgr_data_t data_object; + amd_comgr_data_t data_object {0}; amd_comgr_status_t comgr_status = AMD_COMGR_STATUS_SUCCESS; hipError_t hip_status = hipSuccess; - amd_comgr_code_object_info_t* query_list_array = nullptr; // If image was passed as a pointer to our hipMod* api, we can try to extract the file name // if it was mapped by the app. Otherwise use the COMGR data API. @@ -266,12 +265,6 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vectordeviceId()]->program_ = new amd::Program(*(device->asContext())); } - - if ((comgr_status = amd_comgr_release_data(data_object)) != AMD_COMGR_STATUS_SUCCESS) { - LogPrintfError("Releasing COMGR data failed with status %d ", comgr_status); - return hipErrorInvalidValue; - } - } while(0); // Clean up file and memory resouces if hip_status failed for some reason. @@ -292,7 +285,9 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector g_captureStreams; amd::Monitor g_captureStreamsLock{"StreamCaptureGlobalList"}; amd::Monitor g_streamSetLock{"StreamCaptureset"}; std::unordered_set g_allCapturingStreams; +hipError_t ihipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned int flags); inline hipError_t ihipGraphAddNode(hip::GraphNode* graphNode, hip::Graph* graph, hip::GraphNode* const* pDependencies, size_t numDependencies, @@ -1223,6 +1224,16 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph, flags); if (*pGraphExec != nullptr) { graph->SetGraphInstantiated(true); + if (DEBUG_HIP_GRAPH_DOT_PRINT) { + static int i = 1; + std::string filename = + "graph_" + std::to_string(amd::Os::getProcessId()) + "_dot_print_" + std::to_string(i++); + hipError_t status = + ihipGraphDebugDotPrint(reinterpret_cast(graph), filename.c_str(), 0); + if (status == hipSuccess) { + LogPrintfInfo("[hipGraph] graph dump:%s", filename.c_str()); + } + } return (*pGraphExec)->Init(); } else { return hipErrorOutOfMemory; @@ -1541,7 +1552,15 @@ hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo if (clonedNode == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN(reinterpret_cast(clonedNode)->SetParams(pNodeParams)); + hipError_t status = reinterpret_cast(clonedNode)->SetParams(pNodeParams); + if(status != hipSuccess) { + HIP_RETURN(status); + } + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { + status = reinterpret_cast(hGraphExec) + ->UpdateAQLPacket(reinterpret_cast(clonedNode)); + } + HIP_RETURN(status); } hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph) { @@ -2550,13 +2569,10 @@ hipError_t hipGraphKernelNodeCopyAttributes(hipGraphNode_t hSrc, hipGraphNode_t } hipError_t ihipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned int flags) { - if (graph == nullptr || path == nullptr) { - return hipErrorInvalidValue; - } std::ofstream fout; fout.open(path, std::ios::out); if (fout.fail()) { - ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Error during opening of file : %s", path); + LogPrintfError("[hipGraph] Error during opening of file : %s", path); return hipErrorOperatingSystem; } fout << "digraph dot {" << std::endl; @@ -2568,6 +2584,9 @@ hipError_t ihipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned i hipError_t hipGraphDebugDotPrint(hipGraph_t graph, const 
char* path, unsigned int flags) { HIP_INIT_API(hipGraphDebugDotPrint, graph, path, flags); + if (graph == nullptr || path == nullptr) { + return hipErrorInvalidValue; + } HIP_RETURN(ihipGraphDebugDotPrint(graph, path, flags)); } diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp index ee70f8b1b9..f3a5be0880 100644 --- a/hipamd/src/hip_graph_internal.cpp +++ b/hipamd/src/hip_graph_internal.cpp @@ -332,6 +332,9 @@ hipError_t GraphExec::CreateStreams(uint32_t num_streams) { } parallel_streams_.push_back(stream); } + // Don't wait for other streams to finish. + // Capture stream is to capture AQL packet. + capture_stream_ = hip::getNullStream(false); return hipSuccess; } @@ -353,13 +356,10 @@ hipError_t GraphExec::CaptureAQLPackets() { hipError_t status = hipSuccess; if (parallelLists_.size() == 1) { size_t kernArgSizeForGraph = 0; - hip::Stream* stream = nullptr; // GPU packet capture is enabled for kernel nodes. Calculate the kernel // arg size required for all graph kernel nodes to allocate for (const auto& list : parallelLists_) { - stream = GetAvailableStreams(); for (auto& node : list) { - node->SetStream(stream, this); if (node->GetType() == hipGraphNodeTypeKernel) { kernArgSizeForGraph += reinterpret_cast(node)->GetKerArgSize(); } @@ -386,7 +386,6 @@ hipError_t GraphExec::CaptureAQLPackets() { for (auto& node : topoOrder_) { if (node->GetType() == hipGraphNodeTypeKernel) { auto kernelNode = reinterpret_cast(node); - status = node->CreateCommand(node->GetQueue()); // From the kernel pool allocate the kern arg size required for the current kernel node. address kernArgOffset = allocKernArg(kernelNode->GetKernargSegmentByteSize(), kernelNode->GetKernargSegmentAlignment()); @@ -394,7 +393,7 @@ hipError_t GraphExec::CaptureAQLPackets() { return hipErrorMemoryAllocation; } // Form GPU packet capture for the kernel node. - kernelNode->CaptureAndFormPacket(kernArgOffset); + kernelNode->CaptureAndFormPacket(capture_stream_, kernArgOffset) ; } } @@ -408,7 +407,7 @@ hipError_t GraphExec::CaptureAQLPackets() { address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_ - sizeof(int); *dev_ptr = host_val; if (device->info().hdpMemFlushCntl == nullptr) { - amd::Command* command = new amd::Marker(*stream, true); + amd::Command* command = new amd::Marker(*capture_stream_, true); if (command != nullptr) { command->enqueue(); command->release(); @@ -426,6 +425,50 @@ hipError_t GraphExec::CaptureAQLPackets() { return status; } +hipError_t GraphExec::UpdateAQLPacket(hip::GraphKernelNode* node) { + if (parallelLists_.size() == 1) { + size_t pool_new_usage = 0; + address result = nullptr; + if (!kernarg_graph_.empty()) { + // 1. Allocate memory for the kernel args + size_t kernArgSizeForNode = 0; + kernArgSizeForNode = node->GetKerArgSize(); + + result = amd::alignUp(kernarg_graph_.back() + kernarg_graph_cur_offset_, + node->GetKernargSegmentAlignment()); + pool_new_usage = (result + kernArgSizeForNode) - kernarg_graph_.back(); + } + if (pool_new_usage != 0 && pool_new_usage <= kernarg_graph_size_) { + kernarg_graph_cur_offset_ = pool_new_usage; + } else { + address kernarg_graph; + auto device = g_devices[ihipGetDevice()]->devices()[0]; + if (device->info().largeBar_) { + kernarg_graph = reinterpret_cast
(device->deviceLocalAlloc(kernarg_graph_size_)); + } else { + kernarg_graph = reinterpret_cast
( + device->hostAlloc(kernarg_graph_size_, 0, amd::Device::MemorySegment::kKernArg)); + } + kernarg_graph_.push_back(kernarg_graph); + kernarg_graph_cur_offset_ = 0; + + // 1. Allocate memory for the kernel args + size_t kernArgSizeForNode = 0; + kernArgSizeForNode = node->GetKerArgSize(); + result = amd::alignUp(kernarg_graph_.back() + kernarg_graph_cur_offset_, + node->GetKernargSegmentAlignment()); + const size_t pool_new_usage = (result + kernArgSizeForNode) - kernarg_graph_.back(); + if (pool_new_usage <= kernarg_graph_size_) { + kernarg_graph_cur_offset_ = pool_new_usage; + } + } + + // 2. copy kernel args / create new AQL packet + node->CaptureAndFormPacket(capture_stream_, result); + } + return hipSuccess; +} + hipError_t FillCommands(std::vector>& parallelLists, std::unordered_map>& nodeWaitLists, std::vector& topoOrder, Graph* clonedGraph, diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp index 7f6543cb91..2229b75d8c 100644 --- a/hipamd/src/hip_graph_internal.hpp +++ b/hipamd/src/hip_graph_internal.hpp @@ -543,7 +543,7 @@ struct Graph { graphInstantiated_ = graphInstantiate; } }; - +struct GraphKernelNode; struct GraphExec { std::vector> parallelLists_; // Topological order of the graph doesn't include nodes embedded as part of the child graph @@ -551,6 +551,7 @@ struct GraphExec { std::unordered_map> nodeWaitLists_; struct Graph* clonedGraph_; std::vector parallel_streams_; + hip::Stream* capture_stream_; uint currentQueueIndex_; std::unordered_map clonedNodes_; amd::Command* lastEnqueuedCommand_; @@ -563,6 +564,10 @@ struct GraphExec { address kernarg_pool_graph_ = nullptr; uint32_t kernarg_pool_size_graph_ = 0; uint32_t kernarg_pool_cur_graph_offset_ = 0; + std::vector
kernarg_graph_; + uint32_t kernarg_graph_cur_offset_ = 0; + uint32_t kernarg_graph_size_ = 128 * Ki; + public: GraphExec(std::vector& topoOrder, std::vector>& lists, std::unordered_map>& nodeWaitLists, struct Graph*& clonedGraph, @@ -591,6 +596,9 @@ struct GraphExec { auto device = g_devices[ihipGetDevice()]->devices()[0]; if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { device->hostFree(kernarg_pool_graph_, kernarg_pool_size_graph_); + for (auto& element : kernarg_graph_) { + device->hostFree(element, kernarg_graph_size_); + } } amd::ScopedLock lock(graphExecSetLock_); graphExecSet_.erase(this); @@ -636,6 +644,7 @@ struct GraphExec { hipError_t Run(hipStream_t stream); // Capture GPU Packets from graph commands hipError_t CaptureAQLPackets(); + hipError_t UpdateAQLPacket(hip::GraphKernelNode* node); }; struct ChildGraphNode : public GraphNode { @@ -793,19 +802,20 @@ class GraphKernelNode : public GraphNode { out << "];"; } - void CaptureAndFormPacket(address kernArgOffset) { - for (auto& command : commands_) { - reinterpret_cast(command)->setCapturingState( - true, GetAqlPacket(), kernArgOffset); - - // Enqueue command to capture GPU Packet. The packet is not submitted to the device. - // The packet is stored in gpuPacket_ and submitted during graph launch. - command->submit(*(command->queue())->vdev()); - // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes - SetKernelName(reinterpret_cast(command)->kernel().name()); - command->release(); - } + void CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) { + hipError_t status = CreateCommand(capture_stream); + for (auto& command : commands_) { + reinterpret_cast(command)->setCapturingState( + true, GetAqlPacket(), kernArgOffset); + + // Enqueue command to capture GPU Packet. The packet is not submitted to the device. + // The packet is stored in gpuPacket_ and submitted during graph launch. 
+ command->submit(*(command->queue())->vdev()); + // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes + SetKernelName(reinterpret_cast(command)->kernel().name()); + command->release(); } + } std::string GetLabel(hipGraphDebugDotFlags flag) { hipFunction_t func = getFunc(kernelParams_, ihipGetDevice()); diff --git a/hipamd/src/hip_hcc.map.in b/hipamd/src/hip_hcc.map.in index b8f0b3da64..fc7d91e8bc 100644 --- a/hipamd/src/hip_hcc.map.in +++ b/hipamd/src/hip_hcc.map.in @@ -486,7 +486,6 @@ global: hipMemcpyFromSymbolAsync_spt; hipMemcpyToSymbolAsync_spt; hipMemcpyFromArray_spt; - hipMemcpy2DToArray_spt; hipMemcpy2DFromArrayAsync_spt; hipMemcpy2DToArrayAsync_spt; hipDrvGetErrorName; diff --git a/hipamd/src/hip_internal.hpp b/hipamd/src/hip_internal.hpp index 88b12d8451..d3d38178fa 100644 --- a/hipamd/src/hip_internal.hpp +++ b/hipamd/src/hip_internal.hpp @@ -487,7 +487,7 @@ class stream_per_thread { void setFlags(unsigned int flags) { flags_ = flags; } void Reset(); - hip::Stream* NullStream(); + hip::Stream* NullStream(bool wait = true); Stream* GetNullStream() const {return null_stream_;}; void SetActiveStatus() { @@ -572,7 +572,7 @@ class stream_per_thread { /// Get default stream associated with the ROCclr context extern hip::Stream* getNullStream(amd::Context&); /// Get default stream of the thread - extern hip::Stream* getNullStream(); + extern hip::Stream* getNullStream(bool wait = true); /// Get device ID associated with the ROCclr context int getDeviceID(amd::Context& ctx); /// Check if stream is valid diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 64ed22985c..8dc0e176bd 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -1311,8 +1311,8 @@ hipError_t hipMemcpyToSymbol_common(const void* symbol, const void* src, size_t size_t offset, hipMemcpyKind kind, hipStream_t stream=nullptr) { CHECK_STREAM_CAPTURING(); - if (kind != hipMemcpyHostToDevice && (kind != hipMemcpyDeviceToDevice || - kind != hipMemcpyDeviceToDeviceNoCU)) { + if (kind != hipMemcpyHostToDevice && kind != hipMemcpyDeviceToDevice && + kind != hipMemcpyDeviceToDeviceNoCU) { HIP_RETURN(hipErrorInvalidMemcpyDirection); } @@ -1345,8 +1345,8 @@ hipError_t hipMemcpyFromSymbol_common(void* dst, const void* symbol, size_t size size_t offset, hipMemcpyKind kind, hipStream_t stream=nullptr) { CHECK_STREAM_CAPTURING(); - if (kind != hipMemcpyDeviceToHost && (kind != hipMemcpyDeviceToDevice || - kind != hipMemcpyDeviceToDeviceNoCU)) { + if (kind != hipMemcpyDeviceToHost && kind != hipMemcpyDeviceToDevice && + kind != hipMemcpyDeviceToDeviceNoCU) { HIP_RETURN(hipErrorInvalidMemcpyDirection); } @@ -1379,8 +1379,8 @@ hipError_t hipMemcpyToSymbolAsync_common(const void* symbol, const void* src, si size_t offset, hipMemcpyKind kind, hipStream_t stream) { STREAM_CAPTURE(hipMemcpyToSymbolAsync, stream, symbol, src, sizeBytes, offset, kind); - if (kind != hipMemcpyHostToDevice && (kind != hipMemcpyDeviceToDevice || - kind != hipMemcpyDeviceToDeviceNoCU)) { + if (kind != hipMemcpyHostToDevice && kind != hipMemcpyDeviceToDevice && + kind != hipMemcpyDeviceToDeviceNoCU) { return hipErrorInvalidMemcpyDirection; } @@ -1412,8 +1412,8 @@ hipError_t hipMemcpyFromSymbolAsync_common(void* dst, const void* symbol, size_t size_t offset, hipMemcpyKind kind, hipStream_t stream) { STREAM_CAPTURE(hipMemcpyFromSymbolAsync, stream, dst, symbol, sizeBytes, offset, kind); - if (kind != hipMemcpyDeviceToHost && (kind != hipMemcpyDeviceToDevice || - kind != 
hipMemcpyDeviceToDeviceNoCU)) { + if (kind != hipMemcpyDeviceToHost && kind != hipMemcpyDeviceToDevice && + kind != hipMemcpyDeviceToDeviceNoCU) { return hipErrorInvalidMemcpyDirection; } @@ -1820,28 +1820,43 @@ hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* srcDevice, void* amd::Memory* srcMemory; amd::BufferRect srcRect; amd::BufferRect dstRect; + size_t dOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dstHost, dOffset); + hipError_t status = ihipMemcpyDtoHValidate(srcDevice, dstHost, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, srcMemory, srcRect, dstRect); if (status != hipSuccess) { return status; } + amd::Coord3D srcStart(srcRect.start_, 0, 0); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); - amd::ReadMemoryCommand* readCommand = + if (dstMemory) { + amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( + *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, + *srcMemory, *dstMemory, srcOrigin, dstOrigin, + copyRegion, srcRect, dstRect, copyMetadata); + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + command = copyCommand; + } else { + amd::ReadMemoryCommand* readCommand = new amd::ReadMemoryCommand(*stream, CL_COMMAND_READ_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, srcStart, copyRegion, dstHost, srcRect, dstRect, copyMetadata); + if (readCommand == nullptr) { + return hipErrorOutOfMemory; + } - if (readCommand == nullptr) { - return hipErrorOutOfMemory; + if (!readCommand->validatePeerMemory()) { + delete readCommand; + return hipErrorInvalidValue; + } + command = readCommand; } - if (!readCommand->validatePeerMemory()) { - delete readCommand; - return hipErrorInvalidValue; - } - command = readCommand; return hipSuccess; } @@ -1884,6 +1899,8 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, vo amd::Memory* dstMemory; amd::BufferRect srcRect; amd::BufferRect dstRect; + size_t sOffset = 0; + amd::Memory* srcMemory = getMemoryObject(srcHost, sOffset); hipError_t status = ihipMemcpyHtoDValidate(srcHost, dstDevice, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, @@ -1891,21 +1908,33 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, vo if (status != hipSuccess) { return status; } + amd::Coord3D dstStart(dstRect.start_, 0, 0); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); - amd::WriteMemoryCommand* writeCommand = new amd::WriteMemoryCommand( + if (srcMemory) { + amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( + *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, + *srcMemory, *dstMemory, srcOrigin, dstOrigin, + copyRegion, srcRect, dstRect, copyMetadata); + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + command = copyCommand; + } else { + amd::WriteMemoryCommand *writeCommand = new amd::WriteMemoryCommand( *stream, CL_COMMAND_WRITE_BUFFER_RECT, amd::Command::EventWaitList{}, *dstMemory, dstStart, copyRegion, srcHost, dstRect, srcRect, copyMetadata); + if (writeCommand == nullptr) { + return hipErrorOutOfMemory; + } - if (writeCommand == nullptr) { - return hipErrorOutOfMemory; + if (!writeCommand->validatePeerMemory()) { + delete writeCommand; + return hipErrorInvalidValue; + } + command = writeCommand; } - if (!writeCommand->validatePeerMemory()) { - delete writeCommand; - return hipErrorInvalidValue; - } - command = 
writeCommand; return hipSuccess; } @@ -2068,6 +2097,8 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hi hip::Stream* stream, bool isAsync = false) { amd::Image* dstImage; size_t start = 0; //!< Start offset for the copy region + size_t sOffset = 0; + amd::Memory* srcMemory = getMemoryObject(srcHost, sOffset); hipError_t status = ihipMemcpyHtoAValidate(srcHost, dstArray, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstImage, start); @@ -2076,20 +2107,31 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hi } amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); - amd::WriteMemoryCommand* writeMemCmd = new amd::WriteMemoryCommand( + if (srcMemory) { + amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( + *stream, CL_COMMAND_COPY_BUFFER_TO_IMAGE, amd::Command::EventWaitList{}, + *srcMemory, *dstImage, srcOrigin, dstOrigin, + copyRegion, copyMetadata); + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + command = copyCommand; + } else { + amd::WriteMemoryCommand* writeMemCmd = new amd::WriteMemoryCommand( *stream, CL_COMMAND_WRITE_IMAGE, amd::Command::EventWaitList{}, *dstImage, dstOrigin, copyRegion, static_cast(srcHost) + start, srcRowPitch, srcSlicePitch, copyMetadata); + if (writeMemCmd == nullptr) { + return hipErrorOutOfMemory; + } - if (writeMemCmd == nullptr) { - return hipErrorOutOfMemory; + if (!writeMemCmd->validatePeerMemory()) { + delete writeMemCmd; + return hipErrorInvalidValue; + } + command = writeMemCmd; } - if (!writeMemCmd->validatePeerMemory()) { - delete writeMemCmd; - return hipErrorInvalidValue; - } - command = writeMemCmd; return hipSuccess; } @@ -2127,8 +2169,9 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray_t srcArray, vo hip::Stream* stream, bool isAsync = false) { amd::Image* srcImage; amd::BufferRect dstRect; - amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); size_t start = 0; //!< Start offset for the copy region + size_t dOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dstHost, dOffset); hipError_t status = ihipMemcpyAtoHValidate(srcArray, dstHost, srcOrigin, dstOrigin, copyRegion, dstRowPitch, dstSlicePitch, srcImage, start); @@ -2136,20 +2179,32 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray_t srcArray, vo return status; } - amd::ReadMemoryCommand* readMemCmd = new amd::ReadMemoryCommand( + amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); + if (dstMemory) { + amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( + *stream, CL_COMMAND_COPY_IMAGE_TO_BUFFER, amd::Command::EventWaitList{}, + *srcImage, *dstMemory, srcOrigin, dstOrigin, copyRegion, copyMetadata); + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + command = copyCommand; + } else { + amd::ReadMemoryCommand* readMemCmd = new amd::ReadMemoryCommand( *stream, CL_COMMAND_READ_IMAGE, amd::Command::EventWaitList{}, *srcImage, srcOrigin, copyRegion, static_cast(dstHost) + start, dstRowPitch, dstSlicePitch, copyMetadata); - if (readMemCmd == nullptr) { - return hipErrorOutOfMemory; - } + if (readMemCmd == nullptr) { + return hipErrorOutOfMemory; + } - if (!readMemCmd->validatePeerMemory()) { - delete readMemCmd; - return hipErrorInvalidValue; + if (!readMemCmd->validatePeerMemory()) { + delete readMemCmd; + return hipErrorInvalidValue; + } + command = readMemCmd; } - command = readMemCmd; + return hipSuccess; } 
diff --git a/hipamd/src/hip_mempool_impl.cpp b/hipamd/src/hip_mempool_impl.cpp
index 6971c0fa9c..3afe065bb0 100644
--- a/hipamd/src/hip_mempool_impl.cpp
+++ b/hipamd/src/hip_mempool_impl.cpp
@@ -44,18 +44,21 @@ void Heap::AddMemory(amd::Memory* memory, const MemoryTimestamp& ts) {
 amd::Memory* Heap::FindMemory(size_t size, hip::Stream* stream, bool opportunistic, void* dptr) {
   amd::Memory* memory = nullptr;
   auto start = allocations_.lower_bound({size, nullptr});
-  // Runtime can accept an allocation with 12.5% on the size threshold
-  uint32_t i = 0;
-  for (auto it = start; (it != allocations_.end()) && (it->first.first <= (size / 8.0) * 9);) {
-    i++;
+  for (auto it = start; it != allocations_.end();) {
     bool check_address = (dptr == nullptr);
     if (it->first.second->getSvmPtr() == dptr) {
       // If the search is done for the specified address then runtime must wait
       it->second.Wait();
       check_address = true;
     }
+    // Runtime can accept an allocation within 12.5% of the size threshold
+    bool opp_mode = opportunistic;
+    if (it->first.first > (size / 8.0) * 9) {
+      // Disable opportunistic mode for more aggressive search
+      opp_mode = false;
+    }
     // Check if size can match and it's safe to use this resource.
-    if (check_address && (it->second.IsSafeFind(stream, opportunistic))) {
+    if (check_address && (it->second.IsSafeFind(stream, opp_mode))) {
       memory = it->first.second;
       total_size_ -= memory->getSize();
       // Remove found allocation from the map
@@ -219,6 +222,22 @@ void* MemoryPool::AllocateMemory(size_t size, hip::Stream* stream, void* dptr) {
 bool MemoryPool::FreeMemory(amd::Memory* memory, hip::Stream* stream) {
   amd::ScopedLock lock(lock_pool_ops_);
 
+  // If the free heap grows over the busy heap, then force release
+  if (free_heap_.GetTotalSize() > busy_heap_.GetTotalSize()) {
+    // Use event-based release to reduce memory pressure
+    constexpr size_t kBytesToHold = 0;
+    free_heap_.ReleaseAllMemory(kBytesToHold);
+
+    // If free memory is less than 12.5% of total, then force wait release
+    size_t free = 0;
+    size_t total = 0;
+    hipError_t err = hipMemGetInfo(&free, &total);
+    if ((err == hipSuccess) && (free < (total >> 3))) {
+      constexpr bool kSafeRelease = true;
+      free_heap_.ReleaseAllMemory(free_heap_.GetTotalSize() >> 1, kSafeRelease);
+    }
+  }
+
   MemoryTimestamp ts;
   // Remove memory object from the busy pool
   if (!busy_heap_.RemoveMemory(memory, &ts)) {
@@ -327,6 +346,7 @@ hipError_t MemoryPool::SetAttribute(hipMemPoolAttr attr, void* value) {
         return hipErrorInvalidValue;
       }
       free_heap_.SetMaxTotalSize(reset);
+      busy_heap_.SetMaxTotalSize(reset);
       break;
     case hipMemPoolAttrUsedMemCurrent:
       // Should be GetAttribute only
@@ -372,7 +392,8 @@ hipError_t MemoryPool::GetAttribute(hipMemPoolAttr attr, void* value) {
       break;
     case hipMemPoolAttrReservedMemHigh:
      // High watermark of all allocated memory in OS, since the last reset
-      *reinterpret_cast(value) = busy_heap_.GetTotalSize() + free_heap_.GetMaxTotalSize();
+      *reinterpret_cast(value) = busy_heap_.GetMaxTotalSize() +
+          free_heap_.GetMaxTotalSize();
       break;
     case hipMemPoolAttrUsedMemCurrent:
      // Total currently used memory by the pool
diff --git a/hipamd/src/hip_table_interface.cpp b/hipamd/src/hip_table_interface.cpp
index 8324e42371..81d681949e 100644
--- a/hipamd/src/hip_table_interface.cpp
+++ b/hipamd/src/hip_table_interface.cpp
@@ -1711,3 +1711,6 @@ hipError_t hipLaunchHostFunc_spt(hipStream_t stream, hipHostFn_t fn, void* userD
 extern "C" int hipGetStreamDeviceId(hipStream_t stream) {
   return hip::GetHipDispatchTable()->hipGetStreamDeviceId_fn(stream);
 }
+hipError_t hipExtGetLastError() { + return hip::GetHipDispatchTable()->hipExtGetLastError_fn(); +} diff --git a/hipamd/src/hip_vm.cpp b/hipamd/src/hip_vm.cpp index 2335c426bc..031c001834 100644 --- a/hipamd/src/hip_vm.cpp +++ b/hipamd/src/hip_vm.cpp @@ -288,6 +288,10 @@ hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc, HIP_RETURN(hipErrorInvalidValue) } + if (desc[desc_idx].flags == hipMemAccessFlagsProtRead) { + HIP_RETURN(hipErrorInvalidValue) + } + auto& dev = g_devices[desc[desc_idx].location.id]; amd::Device::VmmAccess access_flags = static_cast(desc[desc_idx].flags); diff --git a/hipamd/src/hiprtc/CMakeLists.txt b/hipamd/src/hiprtc/CMakeLists.txt index b1745a02f5..6565e25ecd 100644 --- a/hipamd/src/hiprtc/CMakeLists.txt +++ b/hipamd/src/hiprtc/CMakeLists.txt @@ -168,7 +168,7 @@ generate_hiprtc_mcin("${HIPRTC_GEN_MCIN}" "${HIPRTC_GEN_PREPROCESSED}") # Note: second command appends define macros at build time. add_custom_command( OUTPUT ${HIPRTC_GEN_PREPROCESSED} - COMMAND $ -O3 --rocm-path=${PROJECT_SOURCE_DIR}/include/.. -std=c++17 -nogpulib --hip-version=${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR} -isystem ${HIP_COMMON_INCLUDE_DIR} -isystem ${PROJECT_SOURCE_DIR}/include -isystem ${PROJECT_BINARY_DIR}/include -isystem ${CMAKE_CURRENT_SOURCE_DIR}/include --cuda-device-only -D__HIPCC_RTC__ -DHIP_VERSION_MAJOR=${HIP_LIB_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_LIB_VERSION_MINOR} -x hip ${HIPRTC_GEN_HEADER} -E -o ${HIPRTC_GEN_PREPROCESSED} + COMMAND $ -O3 --rocm-path=${PROJECT_SOURCE_DIR}/include/.. -std=c++17 -nogpulib --hip-version=${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR} -isystem ${HIP_COMMON_INCLUDE_DIR} -isystem ${PROJECT_SOURCE_DIR}/include -isystem ${PROJECT_BINARY_DIR}/include -isystem ${CMAKE_CURRENT_SOURCE_DIR}/include --cuda-device-only -D__HIPCC_RTC__ -DHIP_VERSION_MAJOR=${HIP_LIB_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_LIB_VERSION_MINOR} -x hip ${HIPRTC_GEN_HEADER} -E -P -o ${HIPRTC_GEN_PREPROCESSED} COMMAND ${CMAKE_COMMAND} -DHIPRTC_ADD_MACROS=1 -DHIPRTC_HEADERS="${HIPRTC_HEADERS}" -DHIPRTC_PREPROCESSED_FILE=${HIPRTC_GEN_PREPROCESSED} -P ${HIPRTC_CMAKE} DEPENDS clang ${HIPRTC_GEN_HEADER}) add_custom_command( diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp index debecbe910..d213afe0f5 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp @@ -585,6 +585,87 @@ bool createAction(amd_comgr_action_info_t& action, std::vector& opt return AMD_COMGR_STATUS_SUCCESS; } +bool compileToExecutable(const amd_comgr_data_set_t compileInputs, const std::string& isa, + std::vector& compileOptions, + std::vector& linkOptions, std::string& buildLog, + std::vector& exe) { + amd_comgr_language_t lang = AMD_COMGR_LANGUAGE_HIP; + amd_comgr_action_info_t action; + amd_comgr_data_set_t reloc; + amd_comgr_data_set_t output; + amd_comgr_data_set_t input = compileInputs; + + if (auto res = createAction(action, compileOptions, isa, lang); res != AMD_COMGR_STATUS_SUCCESS) { + return false; + } + + if (auto res = amd::Comgr::create_data_set(&reloc); res != AMD_COMGR_STATUS_SUCCESS) { + amd::Comgr::destroy_action_info(action); + return false; + } + + if (auto res = amd::Comgr::create_data_set(&output); res != AMD_COMGR_STATUS_SUCCESS) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE, action, + input, reloc); + res != 
AMD_COMGR_STATUS_SUCCESS) { + extractBuildLog(reloc, buildLog); + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + amd::Comgr::destroy_data_set(output); + return false; + } + + if (!extractBuildLog(reloc, buildLog)) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + amd::Comgr::destroy_data_set(output); + return false; + } + + amd::Comgr::destroy_action_info(action); + if (auto res = createAction(action, linkOptions, isa, lang); res != AMD_COMGR_STATUS_SUCCESS) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + amd::Comgr::destroy_data_set(output); + return false; + } + + if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, action, + reloc, output); + res != AMD_COMGR_STATUS_SUCCESS) { + extractBuildLog(output, buildLog); + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + if (!extractBuildLog(output, buildLog)) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + if (!extractByteCodeBinary(output, AMD_COMGR_DATA_KIND_EXECUTABLE, exe)) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + // Clean up + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return true; +} + bool compileToBitCode(const amd_comgr_data_set_t compileInputs, const std::string& isa, std::vector& compileOptions, std::string& buildLog, std::vector& LLVMBitcode) { @@ -646,8 +727,7 @@ bool linkLLVMBitcode(const amd_comgr_data_set_t linkInputs, const std::string& i return false; } - if (auto res = - amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, linkInputs, output); + if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, linkInputs, output); res != AMD_COMGR_STATUS_SUCCESS) { amd::Comgr::destroy_action_info(action); amd::Comgr::destroy_data_set(output); @@ -915,9 +995,9 @@ bool fillMangledNames(std::vector& dataVec, std::map(it.first.data()); + char* data = const_cast(it.first.data()); if (auto res = amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, NULL)) { amd::Comgr::release_data(dataObject); @@ -925,7 +1005,8 @@ bool fillMangledNames(std::vector& dataVec, std::map mName(new char[Size]()); - if (auto res = amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, mName.get())) { + if (auto res = + amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, mName.get())) { amd::Comgr::release_data(dataObject); return false; } diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.hpp b/hipamd/src/hiprtc/hiprtcComgrHelper.hpp index d34c9264c6..05e1c013d0 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.hpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.hpp @@ -41,6 +41,10 @@ bool extractByteCodeBinary(const amd_comgr_data_set_t inDataSet, bool createAction(amd_comgr_action_info_t& action, std::vector& options, const std::string& isa, const amd_comgr_language_t lang = AMD_COMGR_LANGUAGE_NONE); +bool compileToExecutable(const amd_comgr_data_set_t compileInputs, const std::string& isa, + std::vector& compileOptions, + std::vector& linkOptions, std::string& buildLog, + std::vector& exe); bool compileToBitCode(const amd_comgr_data_set_t compileInputs, const std::string& 
isa, std::vector& compileOptions, std::string& buildLog, std::vector& LLVMBitcode); @@ -54,8 +58,8 @@ bool dumpIsaFromBC(const amd_comgr_data_set_t isaInputs, const std::string& isa, std::vector& exeOptions, std::string name, std::string& buildLog); bool demangleName(const std::string& mangledName, std::string& demangledName); std::string handleMangledName(std::string loweredName); -bool fillMangledNames(std::vector& executable, std::map& mangledNames, - bool isBitcode); +bool fillMangledNames(std::vector& executable, + std::map& mangledNames, bool isBitcode); void GenerateUniqueFileName(std::string& name); } // namespace helpers } // namespace hiprtc diff --git a/hipamd/src/hiprtc/hiprtcInternal.cpp b/hipamd/src/hiprtc/hiprtcInternal.cpp index 46a034c828..ac02aa1070 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.cpp +++ b/hipamd/src/hiprtc/hiprtcInternal.cpp @@ -59,7 +59,8 @@ RTCProgram::RTCProgram(std::string name) : name_(name) { bool RTCProgram::findIsa() { const char* libName; #ifdef _WIN32 - libName = "amdhip64.dll"; + std::string dll_name = std::string("amdhip64_" + std::to_string(HIP_VERSION_MAJOR) + ".dll"); + libName = dll_name.c_str(); #else libName = "libamdhip64.so"; #endif @@ -116,7 +117,6 @@ bool RTCProgram::findIsa() { // RTC Compile Program Member Functions void RTCProgram::AppendOptions(const std::string app_env_var, std::vector* options) { - if (options == nullptr) { LogError("Append options passed is nullptr."); return; @@ -171,8 +171,6 @@ RTCCompileProgram::RTCCompileProgram(std::string name_) : RTCProgram(name_), fgp compile_options_.push_back("-fms-compatibility"); #endif AppendCompileOptions(); - - exe_options_.push_back("-O3"); } bool RTCCompileProgram::addSource(const std::string& source, const std::string& name) { @@ -261,10 +259,6 @@ bool RTCCompileProgram::transformOptions(std::vector& compile_optio i = "--offload-arch=" + val; continue; } - if (i == "--save-temps") { - settings_.dumpISA = true; - continue; - } } // Removed consumed options @@ -300,78 +294,28 @@ bool RTCCompileProgram::compile(const std::vector& options, bool fg compileOpts.reserve(compile_options_.size() + options.size() + 2); compileOpts.insert(compileOpts.end(), options.begin(), options.end()); - if (!fgpu_rdc_) { - compileOpts.push_back("-Xclang"); - compileOpts.push_back("-disable-llvm-passes"); - } - if (!transformOptions(compileOpts)) { LogError("Error in hiprtc: unable to transform options"); return false; } - if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { - LogError("Error in hiprtc: unable to compile source to bitcode"); - return false; - } - - if (fgpu_rdc_ && !mangled_names_.empty()) { - if (!fillMangledNames(LLVMBitcode_, mangled_names_, true)) { - LogError("Error in hiprtc: unable to fill mangled names"); + if (fgpu_rdc_) { + if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { + LogError("Error in hiprtc: unable to compile source to bitcode"); return false; } - - return true; - } - - std::string linkFileName = "linked"; - if (!addCodeObjData(link_input_, LLVMBitcode_, linkFileName, AMD_COMGR_DATA_KIND_BC)) { - LogError("Error in hiprtc: unable to add linked code object"); - return false; - } - - std::vector LinkedLLVMBitcode; - if (!linkLLVMBitcode(link_input_, isa_, link_options_, build_log_, LinkedLLVMBitcode)) { - LogError("Error in hiprtc: unable to add device libs to linked bitcode"); - return false; - } - - std::string linkedFileName = "LLVMBitcode.bc"; - if (!addCodeObjData(exec_input_, 
LinkedLLVMBitcode, linkedFileName, AMD_COMGR_DATA_KIND_BC)) {
-    LogError("Error in hiprtc: unable to add device libs linked code object");
-    return false;
-  }
-
-  std::vector exe_options;
-  // Find the options passed by the app which can be used during BC to Relocatable phase.
-  if (!findExeOptions(options, exe_options)) {
-    LogError("Error in hiprtc: unable to find executable options");
-    return false;
-  }
-
-  std::vector exeOpts(exe_options_);
-  exeOpts.reserve(exeOpts.size() + exe_options.size() + 2);
-  // Add these below options by default for optimizations during BC to Relocatable phase.
-  exeOpts.push_back("-mllvm");
-  exeOpts.push_back("-amdgpu-internalize-symbols");
-  // User provided options are appended at the end since they can override the above
-  // default options if necessary
-  exeOpts.insert(exeOpts.end(), exe_options.begin(), exe_options.end());
-
-  if (settings_.dumpISA) {
-    if (!dumpIsaFromBC(exec_input_, isa_, exeOpts, name_, build_log_)) {
-      LogError("Error in hiprtc: unable to dump isa code");
+  } else {
+    LogInfo("Using the direct comgr compile-to-executable path");
+    if (!compileToExecutable(compile_input_, isa_, compileOpts, link_options_, build_log_,
+                             executable_)) {
+      LogError("Error in hiprtc: unable to compile source to executable");
       return false;
     }
   }
-  if (!createExecutable(exec_input_, isa_, exeOpts, build_log_, executable_)) {
-    LogError("Error in hiprtc: unable to create executable");
-    return false;
-  }
-
   if (!mangled_names_.empty()) {
-    if (!fillMangledNames(executable_, mangled_names_, false)) {
+    auto& compile_step_output = fgpu_rdc_ ? LLVMBitcode_ : executable_;
+    if (!fillMangledNames(compile_step_output, mangled_names_, fgpu_rdc_)) {
       LogError("Error in hiprtc: unable to fill mangled names");
       return false;
     }
@@ -380,6 +324,7 @@ bool RTCCompileProgram::compile(const std::vector& options, bool fg
   return true;
 }
 
+
 void RTCCompileProgram::stripNamedExpression(std::string& strippedName) {
   if (strippedName.back() == ')') {
     strippedName.pop_back();
@@ -453,7 +398,6 @@ RTCLinkProgram::RTCLinkProgram(std::string name) : RTCProgram(name) {
 bool RTCLinkProgram::AddLinkerOptions(unsigned int num_options, hiprtcJIT_option* options_ptr,
                                       void** options_vals_ptr) {
   for (size_t opt_idx = 0; opt_idx < num_options; ++opt_idx) {
-
     switch (options_ptr[opt_idx]) {
       case HIPRTC_JIT_MAX_REGISTERS:
         link_args_.max_registers_ = *(reinterpret_cast(&options_vals_ptr[opt_idx]));
diff --git a/hipamd/src/hiprtc/hiprtcInternal.hpp b/hipamd/src/hiprtc/hiprtcInternal.hpp
index a1965d1b19..aae8b64f47 100644
--- a/hipamd/src/hiprtc/hiprtcInternal.hpp
+++ b/hipamd/src/hiprtc/hiprtcInternal.hpp
@@ -76,8 +76,9 @@ static amd::Monitor g_hiprtcInitlock{"hiprtcInit lock"};
 #define HIPRTC_INIT_API_INTERNAL(...) \
   amd::Thread* thread = amd::Thread::current(); \
   if (!VDI_CHECK_THREAD(thread)) { \
-    ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, "An internal error has occurred." \
-            " This may be due to insufficient memory."); \
+    ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, \
+            "An internal error has occurred."
\ + " This may be due to insufficient memory."); \ HIPRTC_RETURN(HIPRTC_ERROR_INTERNAL_ERROR); \ } \ amd::ScopedLock lock(g_hiprtcInitlock); \ @@ -107,7 +108,6 @@ static void crashWithMessage(std::string message) { } struct Settings { - bool dumpISA{false}; bool offloadArchProvided{false}; }; @@ -131,7 +131,6 @@ class RTCProgram { std::vector executable_; amd_comgr_data_set_t exec_input_; - std::vector exe_options_; }; class RTCCompileProgram : public RTCProgram { @@ -156,10 +155,8 @@ class RTCCompileProgram : public RTCProgram { bool addBuiltinHeader(); bool transformOptions(std::vector& compile_options); bool findExeOptions(const std::vector& options, - std::vector& exe_options); - void AppendCompileOptions() { - AppendOptions(HIPRTC_COMPILE_OPTIONS_APPEND, &compile_options_); - } + std::vector& exe_options); + void AppendCompileOptions() { AppendOptions(HIPRTC_COMPILE_OPTIONS_APPEND, &compile_options_); } RTCCompileProgram() = delete; RTCCompileProgram(RTCCompileProgram&) = delete; @@ -288,9 +285,7 @@ class RTCLinkProgram : public RTCProgram { bool AddLinkerData(void* image_ptr, size_t image_size, std::string link_file_name, hiprtcJITInputType input_type); bool LinkComplete(void** bin_out, size_t* size_out); - void AppendLinkerOptions() { - AppendOptions(HIPRTC_LINK_OPTIONS_APPEND, &link_options_); - } + void AppendLinkerOptions() { AppendOptions(HIPRTC_LINK_OPTIONS_APPEND, &link_options_); } }; // Thread Local Storage Variables Aggregator Class diff --git a/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp b/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp index 0e7de54e0d..84ac050c77 100644 --- a/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp +++ b/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp @@ -760,8 +760,10 @@ void OCLPerfCounters::run(void) { _wrapper->clSetDeviceClockModeAMD(global_device, setClockModeInput, &setClockModeOutput); - _wrapper->clGetPerfCounterInfoAMD(perfCounter, CL_PERFCOUNTER_DATA, + error_ = _wrapper->clGetPerfCounterInfoAMD(perfCounter, CL_PERFCOUNTER_DATA, sizeof(cl_ulong), &result, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, + "clGetPerfCounterInfoAMD failed (Hint (Linux): install hsa-amd-aqlprofile)\n"); err = _wrapper->clReleasePerfCounterAMD(perfCounter); CHECK_RESULT(err != CL_SUCCESS, "Release PerfCounter failed\n"); diff --git a/rocclr/cmake/ROCclrPAL.cmake b/rocclr/cmake/ROCclrPAL.cmake index 3cc497e0fa..da04463651 100644 --- a/rocclr/cmake/ROCclrPAL.cmake +++ b/rocclr/cmake/ROCclrPAL.cmake @@ -43,6 +43,8 @@ set(PAL_BUILD_NAVI31 ON) set(PAL_BUILD_NAVI32 ON) set(PAL_BUILD_NAVI33 ON) set(PAL_BUILD_PHOENIX1 ON) +set(PAL_BUILD_STRIX1 ON) +set(PAL_BUILD_STRIX_HALO ON) find_package(AMD_PAL) find_package(AMD_HSA_LOADER) diff --git a/rocclr/device/comgrctx.cpp b/rocclr/device/comgrctx.cpp index 67184ca944..1a8afe60a9 100644 --- a/rocclr/device/comgrctx.cpp +++ b/rocclr/device/comgrctx.cpp @@ -57,9 +57,11 @@ bool Comgr::LoadLib(bool is_versioned) { cep_.handle = Os::loadLibrary(comgr_lib_name); #endif } else { - static constexpr const char* comgr_lib_name = + std::string comgr_major_dll = "amd_comgr_" + + std::to_string(AMD_COMGR_INTERFACE_VERSION_MAJOR) + ".dll"; + static const char* comgr_lib_name = LP64_SWITCH(WINDOWS_SWITCH("amd_comgr32.dll", "libamd_comgr32.so.2"), - WINDOWS_SWITCH("amd_comgr.dll", "libamd_comgr.so.2")); + WINDOWS_SWITCH(comgr_major_dll.c_str(), "libamd_comgr.so.2")); cep_.handle = Os::loadLibrary(comgr_lib_name); } if (nullptr == cep_.handle) { diff --git a/rocclr/device/device.cpp b/rocclr/device/device.cpp 
index ac91003104..c52c2e0425 100644
--- a/rocclr/device/device.cpp
+++ b/rocclr/device/device.cpp
@@ -219,8 +219,8 @@ std::pair<const Isa*, const Isa*> Isa::supportedIsas() {
       {"gfx1101", "gfx1101", true, true, 11, 0, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32},
       {"gfx1102", "gfx1102", true, true, 11, 0, 2, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32},
       {"gfx1103", "gfx1103", true, true, 11, 0, 3, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32},
-      {"gfx1150", "gfx1150", true, true, 11, 5, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32},
-      {"gfx1151", "gfx1151", true, true, 11, 5, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32},
+      {"gfx1150", "gfx1150", true, true, 11, 5, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, //Strix
+      {"gfx1151", "gfx1151", true, true, 11, 5, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, //Strix Halo
   };
   return std::make_pair(std::begin(supportedIsas_), std::end(supportedIsas_));
 }
@@ -855,7 +855,7 @@ bool Device::IpcCreate(void* dev_ptr, size_t* mem_size, void* handle, size_t* me
   auto dev_mem = static_cast(amd_mem_obj->getDeviceMemory(*this));
   auto result = dev_mem->ExportHandle(handle);
 
-  return true;
+  return result;
 }
 
 // ================================================================================================
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index d00dfb4f6c..2dc7b09ee6 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -624,6 +624,8 @@ struct Info : public amd::EmbeddedObject {
   //! global CU mask which will be applied to all queues created on this device
   std::vector<uint32_t> globalCUMask_;
 
+  bool accelerator_;  //!< Accelerator or discrete graphics card.
+
   //! AQL Barrier Value Packet support
   bool aqlBarrierValue_;
 
diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp
index 6f65ad422b..4d9df9d111 100644
--- a/rocclr/device/pal/paldevice.cpp
+++ b/rocclr/device/pal/paldevice.cpp
@@ -113,6 +113,8 @@ static constexpr PalDevice supportedPalDevices[] = {
     {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix2},
     {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::HawkPoint1},
     {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::HawkPoint2},
+    {11, 5, 0, Pal::GfxIpLevel::GfxIp11_5, "gfx1150", Pal::AsicRevision::Strix1},
+    {11, 5, 1, Pal::GfxIpLevel::GfxIp11_5, "gfx1151", Pal::AsicRevision::StrixHalo},
 };
 
 static std::tuple findIsa(Pal::AsicRevision asicRevision,
@@ -2601,15 +2603,6 @@ bool Device::createBlitProgram() {
     }
   }
 
-  blitProgram_ = new BlitProgram(context_);
-  // Create blit programs
-  if (blitProgram_ == nullptr || !blitProgram_->create(this, extraBlits, ocl20)) {
-    delete blitProgram_;
-    blitProgram_ = nullptr;
-    LogError("Couldn't create blit kernels!");
-    result = false;
-  }
-
   if (settings().useLightning_) {
     const std::string TrapHandlerAsm = TrapHandlerCode;
     // Create a program for trap handler
@@ -2635,6 +2628,15 @@ bool Device::createBlitProgram() {
       DevLogPrintfError("Trap handler creation failed\n");
     }
   }
+
+  blitProgram_ = new BlitProgram(context_);
+  // Create blit programs
+  if (blitProgram_ == nullptr || !blitProgram_->create(this, extraBlits, ocl20)) {
+    delete blitProgram_;
+    blitProgram_ = nullptr;
+    LogError("Couldn't create blit kernels!");
+    result = false;
+  }
   return result;
 }
 
diff --git a/rocclr/device/pal/palsettings.cpp b/rocclr/device/pal/palsettings.cpp
index a5d08c50be..ee85a1cd87 100644
--- a/rocclr/device/pal/palsettings.cpp
+++ b/rocclr/device/pal/palsettings.cpp
@@ -170,6 +170,8 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
   amd::Os::getAppPathAndFileName(appName, appPathAndName);
 
   switch (palProp.revision) {
+    case Pal::AsicRevision::StrixHalo:
+    case Pal::AsicRevision::Strix1:
     // Fall through for Navi3x ...
     case Pal::AsicRevision::Navi33:
     case Pal::AsicRevision::Navi32:
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 3b6b4d4d54..8758c62913 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -1836,6 +1836,19 @@ bool Device::populateOCLDeviceConstants() {
         std::numeric_limits::max();  // gfx10+ does not share SGPRs between waves
   }
 
+  uint8_t memory_properties[8];
+  // Get the memory property from ROCr.
+  if (HSA_STATUS_SUCCESS != hsa_agent_get_info(bkendDevice_,
+                            (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES,
+                            memory_properties)) {
+    LogError("HSA_AGENT_INFO_AMD_MEMORY_PROPERTIES query failed");
+  }
+
+  // Check if the device is APU
+  if (hsa_flag_isset64(memory_properties, HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU)) {
+    info_.accelerator_ = 1;
+  }
+
   return true;
 }
 
diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp
index 2ef82fec0c..9887d23334 100644
--- a/rocclr/device/rocm/rocmemory.cpp
+++ b/rocclr/device/rocm/rocmemory.cpp
@@ -682,15 +682,12 @@ void Buffer::destroy() {
   }
 
   if (deviceMemory_ != nullptr) {
+    bool needUnlockHostMem = false;
     if (deviceMemory_ != owner()->getHostMem()) {
       // if they are identical, the host pointer will be
       // deallocated later on => avoid double deallocation
       if (isHostMemDirectAccess()) {
-        if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
-          if (dev().agent_profile() != HSA_PROFILE_FULL) {
-            hsa_amd_memory_unlock(owner()->getHostMem());
-          }
-        }
+        needUnlockHostMem = true;
       } else {
         dev().memFree(deviceMemory_, size());
         const_cast<Device&>(dev()).updateFreeMemory(size(), true);
@@ -705,6 +702,15 @@
     } else if ((memFlags & CL_MEM_ALLOC_HOST_PTR) &&
                (owner()->getContext().devices().size() == 1)) {
       dev().hostFree(deviceMemory_, size());
+    } else if (isHostMemDirectAccess()) {
+      needUnlockHostMem = true;
+    }
+  }
+
+  if (needUnlockHostMem) {
+    if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
+      if (dev().agent_profile() != HSA_PROFILE_FULL)
+        hsa_amd_memory_unlock(owner()->getHostMem());
     }
   }
 }
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index 0046f130b3..42eb546eab 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -2857,50 +2857,43 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
   }
 
   uint64_t vqVA = reinterpret_cast<uint64_t>(vqMem->getDeviceMemory());
-  uint64_t pattern = 0;
-  amd::Coord3D origin(0, 0, 0);
-  amd::Coord3D region(virtualQueue_->getSize(), 1, 1);
-  if (!dev().xferMgr().fillBuffer(*vqMem, &pattern, sizeof(pattern), region, origin, region)) {
-    return false;
-  }
+  // Use shadow to prepare the data structure in host.
+  auto shadow = std::make_unique<uint8_t[]>(allocSize);
 
-  AmdVQueueHeader header = {};
+  std::memset(&shadow[0], 0, allocSize);
+
+  AmdVQueueHeader* header = reinterpret_cast<AmdVQueueHeader*>(&shadow[0]);
   // Initialize the virtual queue header
-  header.aql_slot_num = numSlots;
-  header.event_slot_num = dev().settings().numDeviceEvents_;
-  header.event_slot_mask = vqVA + eventMaskOffs;
-  header.event_slots = vqVA + eventsOffs;
-  header.aql_slot_mask = vqVA + slotMaskOffs;
-  header.wait_size = dev().settings().numWaitEvents_;
-  header.arg_size = dev().info().maxParameterSize_ + 64;
-  header.mask_groups = maskGroups_;
-
-  amd::Coord3D origin_header(0);
-  amd::Coord3D region_header(sizeof(AmdVQueueHeader));
-
-  if (!dev().xferMgr().writeBuffer(&header, *vqMem, origin_header, region_header)) {
-    return false;
-  }
+  header->aql_slot_num = numSlots;
+  header->event_slot_num = dev().settings().numDeviceEvents_;
+  header->event_slot_mask = vqVA + eventMaskOffs;
+  header->event_slots = vqVA + eventsOffs;
+  header->aql_slot_mask = vqVA + slotMaskOffs;
+  header->wait_size = dev().settings().numWaitEvents_;
+  header->arg_size = dev().info().maxParameterSize_ + 64;
+  header->mask_groups = maskGroups_;
 
   // Go over all slots and perform initialization
-  AmdAqlWrap slot = {};
   size_t offset = sizeof(AmdVQueueHeader);
   for (uint i = 0; i < numSlots; ++i) {
+    AmdAqlWrap * slot = reinterpret_cast<AmdAqlWrap*>(&shadow[0] + offset);
     uint64_t argStart = vqVA + argOffs + i * singleArgSize;
-    amd::Coord3D origin_slot(offset);
-    amd::Coord3D region_slot(sizeof(AmdAqlWrap));
-
-    slot.aql.kernarg_address = reinterpret_cast<void*>(argStart);
-    slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
-    if (!dev().xferMgr().writeBuffer(&slot, *vqMem, origin_slot, region_slot)) {
-      return false;
-    }
+    slot->aql.kernarg_address = reinterpret_cast<void*>(argStart);
+    slot->wait_list = argStart + dev().info().maxParameterSize_ + 64;
     offset += sizeof(AmdAqlWrap);
   }
 
+  amd::Coord3D origin (0, 0, 0);
+  amd::Coord3D region (allocSize, 1, 1);
+
+  // copy the data structure from host to GPU
+  if (!dev().xferMgr().writeBuffer(&shadow[0], *vqMem, origin, region)) {
+    return false;
+  }
+
   deviceQueueSize_ = deviceQueueSize;
   schedulerThreads_ = numSlots / (DeviceQueueMaskSize * maskGroups_);
 
@@ -3643,6 +3636,11 @@ void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) {
   // one to get the profile object
   amd::PerfCounter* amdCounter = static_cast<amd::PerfCounter*>(counters[0]);
   PerfCounter* counter = static_cast<PerfCounter*>(amdCounter->getDeviceCounter());
+  if (counter == nullptr) {
+    LogError("Invalid Performance Counter");
+    vcmd.setStatus(CL_INVALID_OPERATION);
+    return;
+  }
   PerfCounterProfile* profileRef = counter->profileRef();
 
   // create the AQL packet for stop profiling
diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp
index 51ebdd0d39..cd1f40c42d 100644
--- a/rocclr/utils/flags.hpp
+++ b/rocclr/utils/flags.hpp
@@ -245,6 +245,8 @@ release(cstring, HIPRTC_LINK_OPTIONS_APPEND, "", \
         "Set link options needed for hiprtc compilation")                                         \
 release(bool, HIP_VMEM_MANAGE_SUPPORT, true,                                                       \
         "Virtual Memory Management Support")                                                       \
+release(bool, DEBUG_HIP_GRAPH_DOT_PRINT, false,                                                    \
+        "Enable/Disable graph debug dot print dump")                                               \
 
 namespace amd {