Skip to content

Commit cdd8a0a

Browse files
saleelk authored and satyanveshd committed
SWDEV-507534 SWDEV-504494 - Flush to systemscope when copying non-coherent mem
- When we use blit (compute) copies, two subsequent copies may read from the same source buffer. The host may modify that buffer in between the two copies, and if the source buffer was allocated with the non-coherent flag, the device may simply use a stale value from the previous cacheline fetch. This is a corner case. Change-Id: I2ce261c6f6fa4e5bb608f116548e5cc711ae6f3c
1 parent 0c88197 commit cdd8a0a

File tree

1 file changed

+13
-2
lines changed

1 file changed

+13
-2
lines changed

rocclr/device/rocm/rocblit.cpp

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1707,7 +1707,7 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
17071707
copySize = std::min(totalSize, maxStagedXferSize);
17081708
srcAddr += stagedCopyOffset;
17091709
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging D2H copy stg buf=%p, src=%p, "
1710-
"dstOrigin=%zu, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize);
1710+
"dstOrigin=0x%x, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize);
17111711
// Flush caches for coherency after the copy as we need to std::memcpy
17121712
// from staging buffer to unpinned dst. Also attach a signal to the dispatch packet
17131713
// itself that we can wait on without extra barrier packet.
@@ -1865,7 +1865,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
18651865
stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize);
18661866
memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize);
18671867
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, "
1868-
"dstOrigin=%zu, size=%zu", dstAddr, stagingBuffer, origin[0], copySize);
1868+
"dstOrigin=0x%x, size=%zu", dstAddr, stagingBuffer, origin[0], copySize);
1869+
// No cache flush is needed here as we use a staging buffer, and the acquire logic
1870+
// ensures that the cacheline is different and re-used only when L2 is flushed
18691871
result = shaderCopyBuffer(dstAddr, stagingBuffer,
18701872
origin, srcOrigin, copySize,
18711873
entire, dev().settings().limit_blit_wg_, copyMetadata);
@@ -2253,6 +2255,15 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
22532255
}
22542256

22552257
if (!result) {
2258+
// Flush caches for coherency as the MTYPE of the src buffer may be
2259+
// non-coherent, which means we need to read it again from memory.
2260+
// Also, if it's a device-to-device copy (intra-device), we don't need a flush.
2261+
// Check CL_MEM_SVM_ATOMICS flag to see if we used system_coarse_segment_
2262+
auto memFlags = srcMemory.owner()->getMemFlags();
2263+
bool srcSvmAtomics = (memFlags & CL_MEM_SVM_ATOMICS) != 0;
2264+
if (!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) {
2265+
gpu().addSystemScope();
2266+
}
22562267
result = shaderCopyBuffer(reinterpret_cast<address>(dstMemory.virtualAddress()),
22572268
reinterpret_cast<address>(srcMemory.virtualAddress()),
22582269
dstOrigin, srcOrigin, sizeIn,

0 commit comments

Comments
 (0)