Skip to content

Commit f5ca620

Browse files
committed
SWDEV-423835 - Fixing kernel launch issues on Virtual Memory Management path.
Change-Id: I9f5e8a3d83af3809b2c50b21a10697e26113dd23
1 parent dd43dc9 commit f5ca620

File tree

10 files changed

+87
-83
lines changed

10 files changed

+87
-83
lines changed

hipamd/src/hip_graph_internal.hpp

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2104,13 +2104,8 @@ class GraphMemAllocNode final : public GraphNode {
21042104
// Retain memory object because command release will release it
21052105
memory_->retain();
21062106
size_ = aligned_size;
2107-
// Save geenric allocation info to match VM interfaces
2108-
memory_->getUserData().data = new hip::MemMapAllocUserData(dptr, aligned_size, va_);
21092107
// Execute the original mapping command
21102108
VirtualMapCommand::submit(device);
2111-
// Update the internal svm address to ptr
2112-
memory()->setSvmPtr(va_->getSvmPtr());
2113-
// Can't destroy VA, because it's used in mapping even if the node will be destroyed
21142109
va_->retain();
21152110
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc execute: %p, %p",
21162111
va_->getSvmPtr(), memory());
@@ -2234,24 +2229,21 @@ class GraphMemFreeNode : public GraphNode {
22342229

22352230
virtual void submit(device::VirtualDevice& device) final {
22362231
// Find memory object before unmap logic
2237-
auto alloc = amd::MemObjMap::FindMemObj(ptr());
2232+
auto vaddr_mem_obj = amd::MemObjMap::FindMemObj(ptr());
2233+
amd::Memory* phys_mem_obj = vaddr_mem_obj->getUserData().phys_mem_obj;
2234+
assert(phys_mem_obj != nullptr);
22382235
VirtualMapCommand::submit(device);
2239-
// Restore the original address of the generic allocation
2240-
auto ga = reinterpret_cast<hip::MemMapAllocUserData*>(alloc->getUserData().data);
2241-
alloc->setSvmPtr(ga->ptr_);
22422236
if (!AMD_DIRECT_DISPATCH) {
22432237
// Update the current device, since hip event, used in mem pools, requires device
22442238
hip::setCurrentDevice(device_id_);
22452239
}
22462240
// Free virtual address
2247-
ga->va_->release();
2248-
alloc->getUserData().data = nullptr;
2241+
vaddr_mem_obj->release();
22492242
// Release the allocation back to graph's pool
2250-
graph_->FreeMemory(ga->ptr_, static_cast<hip::Stream*>(queue()));
2251-
amd::MemObjMap::AddMemObj(ptr(), ga->va_);
2252-
delete ga;
2243+
graph_->FreeMemory(phys_mem_obj->getSvmPtr(), static_cast<hip::Stream*>(queue()));
2244+
amd::MemObjMap::AddMemObj(ptr(), vaddr_mem_obj);
22532245
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p",
2254-
ptr(), alloc);
2246+
ptr(), vaddr_mem_obj);
22552247
}
22562248

22572249
private:

hipamd/src/hip_mempool_impl.cpp

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,10 @@ bool MemoryPool::FreeMemory(amd::Memory* memory, Stream* stream, Event* event) {
225225
{
226226
amd::ScopedLock lock(lock_pool_ops_);
227227

228+
if (memory->getUserData().phys_mem_obj != nullptr) {
229+
memory = memory->getUserData().phys_mem_obj;
230+
}
231+
228232
// If the free heap grows over the busy heap, then force release
229233
if (AMD_DIRECT_DISPATCH && (free_heap_.GetTotalSize() > busy_heap_.GetTotalSize())) {
230234
// Use event base release to reduce memory pressure
@@ -249,22 +253,14 @@ bool MemoryPool::FreeMemory(amd::Memory* memory, Stream* stream, Event* event) {
249253
}
250254
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Pool FreeMem: %p, %p", memory->getSvmPtr(), memory);
251255

252-
auto ga = reinterpret_cast<hip::MemMapAllocUserData*>(memory->getUserData().data);
253-
if (ga != nullptr) {
254-
if (stream == nullptr) {
256+
if (stream == nullptr) {
255257
stream = g_devices[memory->getUserData().deviceId]->NullStream();
256-
}
257-
// Unmap virtual address from memory
258-
auto cmd = new amd::VirtualMapCommand(*stream, amd::Command::EventWaitList{},
259-
memory->getSvmPtr(), ga->size_, nullptr);
260-
cmd->enqueue();
261-
cmd->release();
262-
memory->setSvmPtr(ga->ptr_);
263-
// Free virtual address and destroy generic allocation object
264-
ga->va_->release();
265-
delete ga;
266-
memory->getUserData().data = nullptr;
267258
}
259+
// Unmap virtual address from memory
260+
auto cmd = new amd::VirtualMapCommand(*stream, amd::Command::EventWaitList{},
261+
memory->getSvmPtr(), memory->getSize(), nullptr);
262+
cmd->enqueue();
263+
cmd->release();
268264

269265
if (stream != nullptr) {
270266
// The stream of destruction is a safe stream, because the app must handle sync

hipamd/src/hip_vm.cpp

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -120,11 +120,15 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size,
120120

121121
// Add this to amd::Memory object, so this ptr is accesible for other hipmemory operations.
122122
size_t offset = 0; //this is ignored
123-
amd::Memory* memObj = getMemoryObject(ptr, offset);
123+
amd::Memory* phys_mem_obj = getMemoryObject(ptr, offset);
124124
//saves the current device id so that it can be accessed later
125-
memObj->getUserData().deviceId = prop->location.id;
126-
memObj->getUserData().data = new hip::GenericAllocation(ptr, size, *prop);
127-
*handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(memObj->getUserData().data);
125+
phys_mem_obj->getUserData().deviceId = prop->location.id;
126+
phys_mem_obj->getUserData().data = new hip::GenericAllocation(*phys_mem_obj, size, *prop);
127+
*handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(phys_mem_obj->getUserData().data);
128+
129+
// Remove because the entry of 0x1 is not needed in MemObjMap.
130+
// We save the copy of Phy mem obj in virtual mem obj during mapping.
131+
amd::MemObjMap::RemoveMemObj(ptr);
128132

129133
HIP_RETURN(hipSuccess);
130134
}
@@ -225,9 +229,6 @@ hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocat
225229
cmd->awaitCompletion();
226230
cmd->release();
227231

228-
// update the internal svm address to ptr
229-
ga->asAmdMemory().setSvmPtr(ptr);
230-
231232
HIP_RETURN(hipSuccess);
232233
}
233234

@@ -268,7 +269,8 @@ hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle,
268269
HIP_RETURN(hipErrorInvalidValue);
269270
}
270271

271-
*handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(mem->getUserData().data);
272+
*handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(
273+
mem->getUserData().phys_mem_obj->getUserData().data);
272274

273275
if (*handle == nullptr) {
274276
HIP_RETURN(hipErrorInvalidValue);
@@ -312,17 +314,17 @@ hipError_t hipMemUnmap(void* ptr, size_t size) {
312314
HIP_RETURN(hipErrorInvalidValue);
313315
}
314316

315-
amd::Memory* pa = amd::MemObjMap::FindMemObj(ptr);
316-
if (pa == nullptr) {
317+
amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(ptr);
318+
if (vaddr_mem_obj == nullptr && vaddr_mem_obj->getSize() != size) {
317319
HIP_RETURN(hipErrorInvalidValue);
318320
}
319321

320-
amd::Memory* va = amd::MemObjMap::FindVirtualMemObj(ptr);
321-
if (va == nullptr && va->getSize() != size) {
322+
amd::Memory* phys_mem_obj = vaddr_mem_obj->getUserData().phys_mem_obj;
323+
if (phys_mem_obj == nullptr) {
322324
HIP_RETURN(hipErrorInvalidValue);
323325
}
324326

325-
auto& queue = *g_devices[pa->getUserData().deviceId]->NullStream();
327+
auto& queue = *g_devices[phys_mem_obj->getUserData().deviceId]->NullStream();
326328

327329
amd::Command* cmd = new amd::VirtualMapCommand(queue, amd::Command::EventWaitList{}, ptr, size,
328330
nullptr);
@@ -331,9 +333,8 @@ hipError_t hipMemUnmap(void* ptr, size_t size) {
331333
cmd->release();
332334

333335
// restore the original pa of the generic allocation
334-
hip::GenericAllocation* ga = reinterpret_cast<hip::GenericAllocation*>(pa->getUserData().data);
335-
pa->setSvmPtr(ga->genericAddress());
336-
336+
hip::GenericAllocation* ga
337+
= reinterpret_cast<hip::GenericAllocation*>(phys_mem_obj->getUserData().data);
337338
ga->release();
338339

339340
HIP_RETURN(hipSuccess);

hipamd/src/hip_vm.hpp

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -30,35 +30,23 @@ namespace hip {
3030

3131
hipError_t ihipFree(void* ptr);
3232

33-
struct MemMapAllocUserData {
34-
void* ptr_; // Original pointer of the allocation
35-
size_t size_; // Aligned size of the allocation
36-
amd::Memory* va_; // Memory object for the virtual address
37-
38-
MemMapAllocUserData(void* ptr, size_t size, amd::Memory* va) : ptr_(ptr), size_(size), va_(va) {}
39-
};
40-
4133
class GenericAllocation : public amd::RuntimeObject {
42-
void* ptr_; //<! Device ptr
34+
amd::Memory& phys_mem_ref_; //<! Physical memory object
4335
size_t size_; //<! Allocated size
4436
hipMemAllocationProp properties_; //<! Allocation Properties
4537

4638
public:
47-
GenericAllocation(void* ptr, size_t size, const hipMemAllocationProp& prop)
48-
: ptr_(ptr), size_(size), properties_(prop) {}
49-
~GenericAllocation() {
50-
hipError_t err = ihipFree(ptr_);
51-
}
39+
GenericAllocation(amd::Memory& phys_mem_ref, size_t size, const hipMemAllocationProp& prop)
40+
: phys_mem_ref_(phys_mem_ref), size_(size), properties_(prop) {}
41+
~GenericAllocation() {}
5242

5343
const hipMemAllocationProp& GetProperties() const { return properties_; }
5444
hipMemGenericAllocationHandle_t asMemGenericAllocationHandle() {
5545
return reinterpret_cast<hipMemGenericAllocationHandle_t>(this);
5646
}
5747
amd::Memory& asAmdMemory() {
58-
size_t discardOffset;
59-
return *getMemoryObject(genericAddress(), discardOffset);
48+
return phys_mem_ref_;
6049
}
61-
void* genericAddress() const { return ptr_; }
6250

6351
virtual ObjectType objectType() const { return ObjectTypeVMMAlloc; }
6452
};

rocclr/device/pal/palvirtual.cpp

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2192,18 +2192,18 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
21922192
amd::ScopedLock lock(execution());
21932193

21942194
profilingBegin(vcmd);
2195-
amd::Memory* va = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
2196-
if (va == nullptr || !(va->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
2195+
amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
2196+
if (vaddr_mem_obj == nullptr || !(vaddr_mem_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
21972197
profilingEnd(vcmd);
21982198
return;
21992199
}
2200-
pal::Memory* vaRange = dev().getGpuMemory(va);
2201-
Pal::IGpuMemory* memory = (vcmd.memory() == nullptr) ?
2200+
pal::Memory* vaddr_pal_mem = dev().getGpuMemory(vaddr_mem_obj);
2201+
Pal::IGpuMemory* phymem_igpu_mem = (vcmd.memory() == nullptr) ?
22022202
nullptr : dev().getGpuMemory(vcmd.memory())->iMem();
22032203
Pal::VirtualMemoryRemapRange range{
2204-
vaRange->iMem(),
2204+
vaddr_pal_mem->iMem(),
22052205
0,
2206-
memory,
2206+
phymem_igpu_mem,
22072207
0,
22082208
vcmd.size(),
22092209
Pal::VirtualGpuMemAccessMode::NoAccess
@@ -2224,13 +2224,15 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
22242224
setGpuEvent(event);
22252225
if (result == Pal::Result::Success) {
22262226
if (vcmd.memory() != nullptr) {
2227-
// assert the va wasn't mapped already
2227+
// assert the vaddr_mem_obj wasn't mapped already
22282228
assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr);
2229-
amd::MemObjMap::AddMemObj(vcmd.ptr(), vcmd.memory());
2229+
amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_mem_obj);
2230+
vaddr_mem_obj->getUserData().phys_mem_obj = vcmd.memory();
22302231
} else {
2231-
// assert the va is mapped and needs to be removed
2232+
// assert the vaddr_mem_obj is mapped and needs to be removed
22322233
assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) != nullptr);
22332234
amd::MemObjMap::RemoveMemObj(vcmd.ptr());
2235+
vaddr_mem_obj->getUserData().phys_mem_obj = nullptr;
22342236
}
22352237
}
22362238
profilingEnd(vcmd);

rocclr/device/rocm/rocdevice.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2301,6 +2301,16 @@ uint64_t Device::deviceVmemAlloc(size_t size, uint64_t flags) const {
23012301
return hsa_vmem_handle.handle;
23022302
}
23032303

2304+
void Device::deviceVmemRelease(uint64_t mem_handle) const {
2305+
hsa_amd_vmem_alloc_handle_t hsa_vmem_handle {};
2306+
hsa_vmem_handle.handle = mem_handle;
2307+
2308+
hsa_status_t hsa_status = hsa_amd_vmem_handle_release(hsa_vmem_handle);
2309+
if (hsa_status != HSA_STATUS_SUCCESS) {
2310+
LogPrintfError("Failed hsa_amd_vmem_handle_release! Failed with hsa status: %d \n", hsa_status);
2311+
}
2312+
}
2313+
23042314
void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain) const {
23052315
const hsa_amd_memory_pool_t& pool = (pseudo_fine_grain) ? gpu_ext_fine_grained_segment_
23062316
: (atomics) ? gpu_fine_grained_segment_ : gpuvm_segment_;
@@ -2381,7 +2391,7 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_
23812391
return nullptr;
23822392
}
23832393

2384-
if (mem->getSvmPtr() != nullptr) {
2394+
if (mem->getSvmPtr() != nullptr || mem->getMemFlags() & ROCCLR_MEM_PHYMEM) {
23852395
// add the information to context so that we can use it later.
23862396
amd::MemObjMap::AddMemObj(mem->getSvmPtr(), mem);
23872397
}

rocclr/device/rocm/rocdevice.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,7 @@ class Device : public NullDevice {
450450
bool deviceAllowAccess(void* dst) const;
451451

452452
bool allowPeerAccess(device::Memory* memory) const;
453+
void deviceVmemRelease(uint64_t mem_handle) const;
453454
uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const;
454455
void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain=false) const;
455456

rocclr/device/rocm/rocmemory.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,12 @@ void Buffer::destroy() {
648648
}
649649
const bool isFineGrain = memFlags & CL_MEM_SVM_FINE_GRAIN_BUFFER;
650650

651+
if (memFlags & ROCCLR_MEM_PHYMEM) {
652+
// If this is physical memory, dont call hsa free function, since device mem was never created
653+
dev().deviceVmemRelease(owner()->getUserData().hsa_handle);
654+
return;
655+
}
656+
651657
if (kind_ != MEMORY_KIND_PTRGIVEN) {
652658
if (isFineGrain) {
653659
if (memFlags & CL_MEM_ALLOC_HOST_PTR) {
@@ -767,7 +773,10 @@ bool Buffer::create(bool alloc_local) {
767773
owner()->getUserData().hsa_handle = dev().deviceVmemAlloc(owner()->getSize(), 0);
768774
if (owner()->getUserData().hsa_handle == 0) {
769775
LogError("HSA Opaque Handle returned was null");
776+
return false;
770777
}
778+
deviceMemory_ = reinterpret_cast<void*>(amd::Memory::MemoryType::kPhyMemHandlePtr);
779+
return true;
771780
}
772781

773782
if ((owner()->parent() == nullptr) &&

rocclr/device/rocm/rocvirtual.cpp

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2589,36 +2589,39 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
25892589

25902590
profilingBegin(vcmd);
25912591

2592-
// Find the amd::Memory object for virtual ptr.
2593-
amd::Memory* va = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
2594-
if (va == nullptr || !(va->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
2592+
// Find the amd::Memory object for virtual ptr. vcmd.ptr() is vaddr.
2593+
amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
2594+
if (vaddr_mem_obj == nullptr || !(vaddr_mem_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
25952595
profilingEnd(vcmd);
25962596
return;
25972597
}
25982598

25992599
// Get the amd::Memory object for the physical address
2600-
amd::Memory* pa = vcmd.memory();
2600+
amd::Memory* phys_mem_obj = vcmd.memory();
26012601
hsa_status_t hsa_status = HSA_STATUS_SUCCESS;
26022602

26032603
// If Physical address is not set, then it is map command. If set, it is unmap command.
2604-
if (pa != nullptr) {
2604+
if (phys_mem_obj != nullptr) {
26052605
// Map the physical to virtual address the hsa api
26062606
hsa_amd_vmem_alloc_handle_t opaque_hsa_handle;
2607-
opaque_hsa_handle.handle = pa->getUserData().hsa_handle;
2608-
if ((hsa_status = hsa_amd_vmem_map(va->getSvmPtr(), va->getSize(), va->getOffset(),
2609-
opaque_hsa_handle, 0)) == HSA_STATUS_SUCCESS) {
2607+
opaque_hsa_handle.handle = phys_mem_obj->getUserData().hsa_handle;
2608+
if ((hsa_status = hsa_amd_vmem_map(vaddr_mem_obj->getSvmPtr(), vcmd.size(),
2609+
vaddr_mem_obj->getOffset(), opaque_hsa_handle, 0)) == HSA_STATUS_SUCCESS) {
26102610
assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr);
26112611
// Now that we have mapped physical addr to virtual addr, make an entry in the MemObjMap.
2612-
amd::MemObjMap::AddMemObj(vcmd.ptr(), vcmd.memory());
2612+
amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_mem_obj);
2613+
vaddr_mem_obj->getUserData().phys_mem_obj = phys_mem_obj;
26132614
} else {
26142615
LogError("HSA Command: hsa_amd_vmem_map failed!");
26152616
}
26162617
} else {
26172618
// Unmap the object, since the physical addr is set.
2618-
if ((hsa_status = hsa_amd_vmem_unmap(va->getSvmPtr(), va->getSize())) == HSA_STATUS_SUCCESS) {
2619+
if ((hsa_status = hsa_amd_vmem_unmap(vaddr_mem_obj->getSvmPtr(), vcmd.size()))
2620+
== HSA_STATUS_SUCCESS) {
26192621
// assert the va is mapped and needs to be removed
26202622
assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) != nullptr);
26212623
amd::MemObjMap::RemoveMemObj(vcmd.ptr());
2624+
vaddr_mem_obj->getUserData().phys_mem_obj = nullptr;
26222625
} else {
26232626
LogError("HSA Command: hsa_amd_vmem_unmap failed");
26242627
}

rocclr/platform/memory.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,13 +142,15 @@ class Memory : public amd::RuntimeObject {
142142
public:
143143
enum MemoryType {
144144
kSvmMemoryPtr = 0x1,
145-
kArenaMemoryPtr = 0x100
145+
kArenaMemoryPtr = 0x100,
146+
kPhyMemHandlePtr = 0x101
146147
};
147148

148149
struct UserData
149150
{
150151
int deviceId = 0; //!< Device ID memory is allocated on
151152
void* data = nullptr; //!< Opaque user data from CL or HIP or etc.
153+
amd::Memory* phys_mem_obj = nullptr; //<! Physical mem obj, only set on virtual mem
152154
uint64_t hsa_handle = 0; //!<Opaque hsa handle saved for Virtual memories
153155
unsigned int flags = 0; //!< HIP memory flags
154156
//! hipMallocPitch allocates buffer using width & height and returns pitch & device pointer.

0 commit comments

Comments
 (0)