Skip to content

Commit 3f5698c

Browse files
authored
Fix data race in VMM APIs (#3889)
## Motivation Data race fixed by adding missing memory_lock_ acquisition ## Technical Details Four virtual memory management functions in runtime.cpp were accessing shared data structures (memory_handle_map_, mapped_handle_map_) without holding the memory_lock_, causing data races under concurrent access. Add std::lock_guard<std::shared_mutex> to: - VMemoryExportShareableHandle: reads memory_handle_map_ - VMemoryImportShareableHandle: reads/writes memory_handle_map_ - VMemoryRetainAllocHandle: reads mapped_handle_map_, modifies ref_count - VMemoryGetAllocPropertiesFromHandle: reads memory_handle_map_ This is consistent with all other VMM functions in the file which already acquire memory_lock_ before accessing these shared maps. ## JIRA ID Resolves ROCM-3902 ## Test Plan Tested with rccl-tests workloads ## Test Result No data race seen. ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
1 parent 7ee0ce2 commit 3f5698c

File tree

1 file changed

+4
-0
lines changed
  • projects/rocr-runtime/runtime/hsa-runtime/core/runtime

1 file changed

+4
-0
lines changed

projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4038,6 +4038,7 @@ hsa_status_t Runtime::VMemoryGetAccess(const void* va, hsa_access_permission_t*
40384038
hsa_status_t Runtime::VMemoryExportShareableHandle(int* dmabuf_fd,
40394039
hsa_amd_vmem_alloc_handle_t handle,
40404040
uint64_t flags) {
4041+
std::lock_guard<std::shared_mutex> lock(memory_lock_);
40414042
*dmabuf_fd = -1;
40424043
auto memoryHandle = memory_handle_map_.find(MemoryHandle::Convert(handle));
40434044
if (memoryHandle == memory_handle_map_.end()) {
@@ -4056,6 +4057,7 @@ hsa_status_t Runtime::VMemoryExportShareableHandle(int* dmabuf_fd,
40564057

40574058
hsa_status_t Runtime::VMemoryImportShareableHandle(int dmabuf_fd,
40584059
hsa_amd_vmem_alloc_handle_t* memoryOnlyHandle) {
4060+
std::lock_guard<std::shared_mutex> lock(memory_lock_);
40594061
auto lookupRegion = [this](int nodeid, const AMD::MemoryRegion** ret) {
40604062
auto nodeAgent = agents_by_node_.find(nodeid);
40614063
if (nodeAgent == agents_by_node_.end()) {
@@ -4122,6 +4124,7 @@ hsa_status_t Runtime::VMemoryImportShareableHandle(int dmabuf_fd,
41224124

41234125
hsa_status_t Runtime::VMemoryRetainAllocHandle(hsa_amd_vmem_alloc_handle_t* mapped_handle,
41244126
void* va) {
4127+
std::lock_guard<std::shared_mutex> lock(memory_lock_);
41254128
auto mappedHandleIt = mapped_handle_map_.find(va);
41264129
if (mappedHandleIt == mapped_handle_map_.end()) return HSA_STATUS_ERROR_INVALID_ALLOCATION;
41274130

@@ -4135,6 +4138,7 @@ hsa_status_t Runtime::VMemoryRetainAllocHandle(hsa_amd_vmem_alloc_handle_t* mapp
41354138
hsa_status_t Runtime::VMemoryGetAllocPropertiesFromHandle(hsa_amd_vmem_alloc_handle_t allocHandle,
41364139
const core::MemoryRegion** mem_region,
41374140
hsa_amd_memory_type_t* type) {
4141+
std::lock_guard<std::shared_mutex> lock(memory_lock_);
41384142
auto memoryHandleIt = memory_handle_map_.find(MemoryHandle::Convert(allocHandle));
41394143
if (memoryHandleIt == memory_handle_map_.end()) return HSA_STATUS_ERROR_INVALID_ALLOCATION;
41404144

0 commit comments

Comments
 (0)