Skip to content
This repository was archived by the owner on Jan 26, 2024. It is now read-only.

Commit 1fbba04

Browse files
committed
[perf]hipMalloc performance optimization
Change-Id: I6e8a918cc1c4cafad197b09e10755cd180e11ead
1 parent 0b6971e commit 1fbba04

File tree

5 files changed

+125
-9
lines changed

5 files changed

+125
-9
lines changed

device/device.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,25 @@ amd::Memory* MemObjMap::FindMemObj(const void* k) {
130130
}
131131
}
132132

133+
/**
 * Grants peer access to allocations made on @p peerDev for which
 * hsa_amd_agents_allow_access was not called because no peer existed yet.
 *
 * Walks every entry of MemObjMap_ while holding AllocatedLock_. For each memory
 * object whose context holds exactly one device, and that device is @p peerDev,
 * the allocation address (the map key) is passed to deviceAllowAccess() unless
 * the device memory is already flagged as peer-accessible.
 *
 * @param peerDev Device whose existing allocations should be opened to peers.
 *                A null pointer makes the call a no-op.
 */
void MemObjMap::UpdateAccess(amd::Device *peerDev) {
  if (peerDev == nullptr) {
    return;
  }

  amd::ScopedLock lock(AllocatedLock_);
  // Bind by reference: entries are only inspected, never copied or erased.
  for (const auto& entry : MemObjMap_) {
    const std::vector<Device*>& devices = entry.second->getContext().devices();
    if (devices.size() != 1 || devices[0] != peerDev) {
      continue;
    }

    device::Memory* devMem = entry.second->getDeviceMemory(*devices[0]);
    // NOTE(review): assumes getDeviceMemory() may return nullptr — confirm.
    if (devMem == nullptr || devMem->getAllowedPeerAccess()) {
      continue;
    }

    // Flag the memory only when the grant succeeded, so that code which checks
    // the flag later can still retry a failed grant.
    if (peerDev->deviceAllowAccess(reinterpret_cast<void*>(entry.first))) {
      devMem->setAllowedPeerAccess(true);
    }
  }
}
133152

134153
Device::BlitProgram::~BlitProgram() {
135154
if (program_ != nullptr) {

device/device.hpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -792,14 +792,25 @@ class Memory : public amd::HeapObject {
792792
//! Returns CPU pointer to HW state
793793
virtual const address cpuSrd() const {
  return nullptr;  // this implementation always returns nullptr
}
794794

795+
//! Returns true when the AllowedPeerAccess bit is set in flags_
bool getAllowedPeerAccess() const { return (flags_ & AllowedPeerAccess) != 0; }
796+
void setAllowedPeerAccess(bool flag) {
797+
if (flag == true) {
798+
flags_ |= AllowedPeerAccess;
799+
}
800+
else {
801+
flags_ &= ~AllowedPeerAccess;
802+
}
803+
}
804+
795805
protected:
796806
//! Single-bit values combined in flags_
enum Flags {
  HostMemoryDirectAccess = 1 << 0,  //!< GPU has direct access to the host memory
  MapResourceAlloced = 1 << 1,      //!< Map resource was allocated
  PinnedMemoryAlloced = 1 << 2,     //!< An extra pinned resource was allocated
  SubMemoryObject = 1 << 3,         //!< Memory is sub-memory
  HostMemoryRegistered = 1 << 4,    //!< Host memory was registered
  MemoryCpuUncached = 1 << 5,       //!< Memory is uncached on CPU access(slow read)
  AllowedPeerAccess = 1 << 6        //!< Memory can be accessed from peer
};
804815
uint flags_; //!< Memory object flags
805816

@@ -1203,6 +1214,7 @@ class MemObjMap : public AllStatic {
12031214
static void RemoveMemObj(const void* k); //!< Remove an entry of mem object from the container
12041215
static amd::Memory* FindMemObj(
12051216
const void* k); //!< find the mem object based on the input pointer
1217+
static void UpdateAccess(amd::Device *peerDev);
12061218
private:
12071219
static std::map<uintptr_t, amd::Memory*>
12081220
MemObjMap_; //!< the mem object<->hostptr information container
@@ -1385,6 +1397,21 @@ class Device : public RuntimeObject {
13851397
return NULL;
13861398
}
13871399

1400+
//! Default implementation: invokes ShouldNotCallThis() and then returns true.
//! NOTE(review): presumably meant to be overridden by backends that support peer access.
virtual bool deviceAllowAccess(void* dst) const {
  ShouldNotCallThis();
  return true;
}
1404+
1405+
//! Default implementation: invokes ShouldNotCallThis() and then returns true.
//! NOTE(review): presumably meant to be overridden by backends that support P2P.
virtual bool enableP2P(amd::Device* ptrDev) {
  ShouldNotCallThis();
  return true;
}
1409+
1410+
//! Default implementation: invokes ShouldNotCallThis() and then returns true.
//! NOTE(review): presumably meant to be overridden by backends that support P2P.
virtual bool disableP2P(amd::Device* ptrDev) {
  ShouldNotCallThis();
  return true;
}
1414+
13881415
/**
13891416
* @copydoc amd::Context::hostFree
13901417
*/

device/rocm/rocdevice.cpp

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1666,6 +1666,9 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
16661666
return nullptr;
16671667
}
16681668

1669+
if (isP2pEnabled()) {
1670+
memory->setAllowedPeerAccess(true);
1671+
}
16691672
// Initialize if the memory is a pipe object
16701673
if (owner.getType() == CL_MEM_OBJECT_PIPE) {
16711674
// Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure.
@@ -1816,6 +1819,44 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const {
18161819

18171820
//! Releases @p ptr by forwarding both arguments to memFree()
void Device::hostFree(void* ptr, size_t size) const {
  memFree(ptr, size);
}
18181821

1822+
/**
 * Records @p ptrDev as a user-enabled P2P peer of this device.
 *
 * When the peer is not yet in enabled_p2p_devices_ it is appended and
 * amd::MemObjMap::UpdateAccess() is run for this device so that existing
 * allocations are processed as well. Enabling a peer that is already listed
 * changes nothing.
 *
 * @param ptrDev Peer device to enable; must not be null.
 * @return true when the peer is (now or already) enabled, false when
 *         @p ptrDev is null.
 */
bool Device::enableP2P(amd::Device* ptrDev) {
  assert(ptrDev != nullptr);
  if (ptrDev == nullptr) {
    // assert() is compiled out under NDEBUG; never record a null peer, which
    // would make isP2pEnabled() report true without any real peer.
    return false;
  }

  Device* peerDev = static_cast<Device*>(ptrDev);
  const auto known =
      std::find(enabled_p2p_devices_.begin(), enabled_p2p_devices_.end(), peerDev);
  if (known == enabled_p2p_devices_.end()) {
    enabled_p2p_devices_.push_back(peerDev);
    // Update access to all old allocations
    amd::MemObjMap::UpdateAccess(static_cast<amd::Device*>(this));
  }
  return true;
}
1834+
1835+
/**
 * Removes @p ptrDev from the list of user-enabled P2P peers, if it is listed.
 *
 * @param ptrDev Peer device to disable (asserted non-null).
 * @return Always true, whether or not the peer was present.
 */
bool Device::disableP2P(amd::Device* ptrDev) {
  assert(ptrDev != nullptr);

  Device* const peer = static_cast<Device*>(ptrDev);
  const auto pos = std::find(enabled_p2p_devices_.begin(), enabled_p2p_devices_.end(), peer);
  if (pos == enabled_p2p_devices_.end()) {
    // Peer was never enabled: nothing to remove
    return true;
  }
  enabled_p2p_devices_.erase(pos);
  return true;
}
1846+
1847+
bool Device::deviceAllowAccess(void* ptr) const {
1848+
std::lock_guard<std::mutex> lock(lock_allow_access_);
1849+
if (!p2pAgents().empty()) {
1850+
hsa_status_t stat = hsa_amd_agents_allow_access(p2pAgents().size(),
1851+
p2pAgents().data(), nullptr, ptr);
1852+
if (stat != HSA_STATUS_SUCCESS) {
1853+
LogError("Allow p2p access");
1854+
return false;
1855+
}
1856+
}
1857+
return true;
1858+
}
1859+
18191860
void* Device::deviceLocalAlloc(size_t size, bool atomics) const {
18201861
const hsa_amd_memory_pool_t& pool = (atomics)? gpu_fine_grained_segment_ : gpuvm_segment_;
18211862

@@ -1832,15 +1873,11 @@ void* Device::deviceLocalAlloc(size_t size, bool atomics) const {
18321873
return nullptr;
18331874
}
18341875

1835-
if (p2pAgents().size() > 0) {
1836-
stat = hsa_amd_agents_allow_access(p2pAgents().size(), p2pAgents().data(), nullptr, ptr);
1837-
if (stat != HSA_STATUS_SUCCESS) {
1838-
LogError("Allow p2p access for memory allocation");
1839-
memFree(ptr, size);
1840-
return nullptr;
1841-
}
1876+
if (isP2pEnabled() && deviceAllowAccess(ptr) == false) {
1877+
LogError("Allow p2p access for memory allocation");
1878+
memFree(ptr, size);
1879+
return nullptr;
18421880
}
1843-
18441881
return ptr;
18451882
}
18461883

device/rocm/rocdevice.hpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,16 @@ class NullDevice : public amd::Device {
219219
return false;
220220
}
221221

222+
//! NullDevice stub: invokes ShouldNotReachHere() and then returns true
virtual bool disableP2P(amd::Device* peerDev) {
  ShouldNotReachHere();
  return true;
}
226+
227+
//! NullDevice stub: invokes ShouldNotReachHere() and then returns true
virtual bool enableP2P(amd::Device* peerDev) {
  ShouldNotReachHere();
  return true;
}
231+
222232
//! Ignores both arguments and reports success
virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
                          cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
  return true;
}
223233

224234
protected:
@@ -369,6 +379,11 @@ class Device : public NullDevice {
369379

370380
virtual void hostFree(void* ptr, size_t size = 0) const;
371381

382+
virtual bool enableP2P(amd::Device* peerDev);
383+
virtual bool disableP2P(amd::Device* peerDev);
384+
385+
bool deviceAllowAccess(void* dst) const;
386+
372387
void* deviceLocalAlloc(size_t size, bool atomics = false) const;
373388

374389
void memFree(void* ptr, size_t size) const;
@@ -427,6 +442,9 @@ class Device : public NullDevice {
427442
// P2P agents available for this device
428443
const std::vector<hsa_agent_t>& p2pAgents() const { return p2p_agents_; }
429444

445+
// User enabled peer devices
446+
const bool isP2pEnabled() const { return !enabled_p2p_devices_.empty(); }
447+
430448
// Update the global free memory size
431449
void updateFreeMemory(size_t size, bool free);
432450

@@ -497,6 +515,8 @@ class Device : public NullDevice {
497515

498516
hsa_agent_t cpu_agent_;
499517
std::vector<hsa_agent_t> p2p_agents_; //!< List of P2P agents available for this device
518+
std::vector<Device*> enabled_p2p_devices_; //!< List of user enabled P2P devices for this device
519+
mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls
500520
hsa_agent_t _bkendDevice;
501521
hsa_agent_t* p2p_agents_list_;
502522
hsa_profile_t agent_profile_;

platform/command.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -595,9 +595,22 @@ bool TransferBufferFileCommand::validateMemory() {
595595

596596
bool CopyMemoryP2PCommand::validateMemory() {
597597
amd::Device* queue_device = &queue()->device();
598+
598599
// Rocr backend maps memory from different devices by default and runtime doesn't need to track
599600
// extra memory objects. Also P2P staging buffer always allocated
600601
if (queue_device->settings().rocr_backend_) {
602+
// Explicit allow access is needed for P2P access
603+
const std::vector<Device*>& srcDevices = memory1_->getContext().devices();
604+
const std::vector<Device*>& dstDevices = memory2_->getContext().devices();
605+
if (srcDevices.size() == 1 && dstDevices.size() == 1) {
606+
device::Memory* mem2 = memory2_->getDeviceMemory(*dstDevices[0]);
607+
if (!mem2->getAllowedPeerAccess()) {
608+
void* dst = mem2->owner()->getSvmPtr();
609+
bool status = dstDevices[0]->deviceAllowAccess(dst);
610+
mem2->setAllowedPeerAccess(true);
611+
return status;
612+
}
613+
}
601614
return true;
602615
}
603616

0 commit comments

Comments
 (0)