Skip to content
This repository was archived by the owner on Jan 26, 2024. It is now read-only.

Commit 31db1a9

Browse files
committed
SWDEV-297448 - Add 64bit and 16bit write support
For the fillBuffer shader, if there are two 32bit writes to a MMIO register, it can get dropped. It has to be a single 64bit write. Add optimization to fillBuffer to write 64bit and 16bit writes. Change-Id: I3aa78e027898f8ae01e9c8f09004615673720c2b
1 parent 63893ed commit 31db1a9

File tree

7 files changed

+125
-56
lines changed

7 files changed

+125
-56
lines changed

device/blitcl.cpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ const char* BlitSourceCode = BLIT_KERNELS(
3131

3232
extern void __amd_copyBufferAligned(__global uint*, __global uint*, ulong, ulong, ulong, uint);
3333

34-
extern void __amd_fillBuffer(__global uchar*, __global uint*, __constant uchar*, uint, ulong,
35-
ulong);
34+
extern void __amd_fillBufferAligned(__global uchar*, __global ushort*, __global uint*, __global ulong*,
35+
__constant uchar*, uint, ulong, ulong);
3636

3737
__kernel void __amd_rocclr_copyBufferRect(__global uchar* src, __global uchar* dst, ulong4 srcRect,
3838
ulong4 dstRect, ulong4 size) {
@@ -54,11 +54,18 @@ const char* BlitSourceCode = BLIT_KERNELS(
5454
__amd_copyBufferAligned(src, dst, srcOrigin, dstOrigin, size, alignment);
5555
}
5656

57-
__kernel void __amd_rocclr_fillBuffer(__global uchar* bufUChar, __global uint* bufUInt,
58-
__constant uchar* pattern, uint patternSize, ulong offset,
59-
ulong size) {
60-
__amd_fillBuffer(bufUChar, bufUInt, pattern, patternSize, offset, size);
61-
} extern void __amd_copyBufferToImage(__global uint*, __write_only image2d_array_t, ulong4,
57+
__kernel void __amd_rocclr_fillBufferAligned(__global uchar* bufUChar,
58+
__global ushort* bufUShort,
59+
__global uint* bufUInt,
60+
__global ulong* bufULong,
61+
__constant uchar* pattern,
62+
uint patternSize, ulong offset,
63+
ulong size) {
64+
__amd_fillBufferAligned(bufUChar, bufUShort, bufUInt, bufULong,
65+
pattern, patternSize, offset, size);
66+
}
67+
68+
extern void __amd_copyBufferToImage(__global uint*, __write_only image2d_array_t, ulong4,
6269
int4, int4, uint4, ulong4);
6370

6471
extern void __amd_copyImageToBuffer(__read_only image2d_array_t, __global uint*,

device/gpu/gpublit.cpp

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2034,21 +2034,40 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
20342034
synchronize();
20352035
return result;
20362036
} else {
2037-
uint fillType = FillBuffer;
2037+
uint fillType = FillBufferAligned;
20382038
size_t globalWorkOffset[3] = {0, 0, 0};
20392039
uint64_t fillSize = size[0] / patternSize;
20402040
size_t globalWorkSize = amd::alignUp(fillSize, 256);
20412041
size_t localWorkSize = 256;
2042-
bool dwordAligned = ((patternSize % sizeof(uint32_t)) == 0) ? true : false;
2042+
uint32_t alignment = (patternSize & 0x7) == 0 ?
2043+
sizeof(uint64_t) :
2044+
(patternSize & 0x3) == 0 ?
2045+
sizeof(uint32_t) :
2046+
(patternSize & 0x1) == 0 ?
2047+
sizeof(uint16_t) : sizeof(uint8_t);
20432048

20442049
// Program kernels arguments for the fill operation
20452050
Memory* mem = &gpuMem(memory);
2046-
if (dwordAligned) {
2047-
setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL);
2051+
if (alignment == sizeof(uint64_t)) {
2052+
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
2053+
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
2054+
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
2055+
setArgument(kernels_[fillType], 3, sizeof(cl_mem), &mem);
2056+
} else if (alignment == sizeof(uint32_t)) {
2057+
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
2058+
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
2059+
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem);
2060+
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
2061+
} else if (alignment == sizeof(uint16_t)) {
2062+
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
20482063
setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
2064+
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
2065+
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
20492066
} else {
20502067
setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
2051-
setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL);
2068+
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
2069+
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
2070+
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
20522071
}
20532072
Memory* gpuCB = dev().getGpuMemory(constantBuffer_);
20542073
if (gpuCB == NULL) {
@@ -2057,15 +2076,15 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
20572076
void* constBuf = gpuCB->map(&gpu(), Resource::WriteOnly);
20582077
memcpy(constBuf, pattern, patternSize);
20592078
gpuCB->unmap(&gpu());
2060-
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB);
2079+
setArgument(kernels_[fillType], 4, sizeof(cl_mem), &gpuCB);
20612080
uint64_t offset = origin[0];
2062-
if (dwordAligned) {
2063-
patternSize /= sizeof(uint32_t);
2064-
offset /= sizeof(uint32_t);
2065-
}
2066-
setArgument(kernels_[fillType], 3, sizeof(uint32_t), &patternSize);
2067-
setArgument(kernels_[fillType], 4, sizeof(offset), &offset);
2068-
setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize);
2081+
2082+
patternSize/= alignment;
2083+
offset /= alignment;
2084+
2085+
setArgument(kernels_[fillType], 5, sizeof(uint32_t), &patternSize);
2086+
setArgument(kernels_[fillType], 6, sizeof(offset), &offset);
2087+
setArgument(kernels_[fillType], 7, sizeof(fillSize), &fillSize);
20692088

20702089
// Create ND range object for the kernel's execution
20712090
amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);

device/gpu/gpublit.hpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ class KernelBlitManager : public DmaBlitManager {
220220
BlitCopyBufferRectAligned,
221221
BlitCopyBuffer,
222222
BlitCopyBufferAligned,
223-
FillBuffer,
223+
FillBufferAligned,
224224
FillImage,
225225
Scheduler,
226226
BlitTotal
@@ -416,9 +416,10 @@ class KernelBlitManager : public DmaBlitManager {
416416

417417
static const char* BlitName[KernelBlitManager::BlitTotal] = {
418418
"__amd_rocclr_copyImage", "__amd_rocclr_copyImage1DA", "__amd_rocclr_copyImageToBuffer",
419-
"__amd_rocclr_copyBufferToImage", "__amd_rocclr_copyBufferRect", "__amd_rocclr_copyBufferRectAligned",
420-
"__amd_rocclr_copyBuffer", "__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBuffer",
421-
"__amd_rocclr_fillImage", "__amd_rocclr_scheduler",
419+
"__amd_rocclr_copyBufferToImage", "__amd_rocclr_copyBufferRect",
420+
"__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_copyBuffer",
421+
"__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBufferAligned",
422+
"__amd_rocclr_fillImage", "__amd_rocclr_scheduler"
422423
};
423424

424425
/*@}*/} // namespace gpu

device/pal/palblit.cpp

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2114,36 +2114,55 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
21142114
synchronize();
21152115
return result;
21162116
} else {
2117-
uint fillType = FillBuffer;
2117+
uint fillType = FillBufferAligned;
21182118
size_t globalWorkOffset[3] = {0, 0, 0};
21192119
uint64_t fillSize = size[0] / patternSize;
21202120
size_t globalWorkSize = amd::alignUp(fillSize, 256);
21212121
size_t localWorkSize = 256;
2122-
bool dwordAligned = ((patternSize % sizeof(uint32_t)) == 0) ? true : false;
2122+
uint32_t alignment = (patternSize & 0x7) == 0 ?
2123+
sizeof(uint64_t) :
2124+
(patternSize & 0x3) == 0 ?
2125+
sizeof(uint32_t) :
2126+
(patternSize & 0x1) == 0 ?
2127+
sizeof(uint16_t) : sizeof(uint8_t);
21232128

21242129
// Program kernels arguments for the fill operation
21252130
Memory* mem = &gpuMem(memory);
2126-
if (dwordAligned) {
2127-
setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL);
2131+
if (alignment == sizeof(uint64_t)) {
2132+
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
2133+
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
2134+
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
2135+
setArgument(kernels_[fillType], 3, sizeof(cl_mem), &mem);
2136+
} else if (alignment == sizeof(uint32_t)) {
2137+
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
2138+
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
2139+
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem);
2140+
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
2141+
} else if (alignment == sizeof(uint16_t)) {
2142+
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
21282143
setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
2144+
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
2145+
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
21292146
} else {
21302147
setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
2131-
setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL);
2148+
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
2149+
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
2150+
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
21322151
}
21332152
Memory& gpuCB = gpu().xferWrite().Acquire(patternSize);
21342153
void* constBuf = gpuCB.map(&gpu(), Resource::NoWait);
21352154
memcpy(constBuf, pattern, patternSize);
21362155
gpuCB.unmap(&gpu());
21372156
Memory* pGpuCB = &gpuCB;
2138-
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &pGpuCB);
2157+
setArgument(kernels_[fillType], 4, sizeof(cl_mem), &pGpuCB);
21392158
uint64_t offset = origin[0];
2140-
if (dwordAligned) {
2141-
patternSize /= sizeof(uint32_t);
2142-
offset /= sizeof(uint32_t);
2143-
}
2144-
setArgument(kernels_[fillType], 3, sizeof(uint32_t), &patternSize);
2145-
setArgument(kernels_[fillType], 4, sizeof(offset), &offset);
2146-
setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize);
2159+
2160+
patternSize/= alignment;
2161+
offset /= alignment;
2162+
2163+
setArgument(kernels_[fillType], 5, sizeof(uint32_t), &patternSize);
2164+
setArgument(kernels_[fillType], 6, sizeof(offset), &offset);
2165+
setArgument(kernels_[fillType], 7, sizeof(fillSize), &fillSize);
21472166

21482167
// Create ND range object for the kernel's execution
21492168
amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);

device/pal/palblit.hpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ class KernelBlitManager : public DmaBlitManager {
219219
BlitCopyBufferRectAligned,
220220
BlitCopyBuffer,
221221
BlitCopyBufferAligned,
222-
FillBuffer,
222+
FillBufferAligned,
223223
FillImage,
224224
Scheduler,
225225
GwsInit,
@@ -427,9 +427,11 @@ class KernelBlitManager : public DmaBlitManager {
427427

428428
static const char* BlitName[KernelBlitManager::BlitTotal] = {
429429
"__amd_rocclr_copyImage", "__amd_rocclr_copyImage1DA", "__amd_rocclr_copyImageToBuffer",
430-
"__amd_rocclr_copyBufferToImage", "__amd_rocclr_copyBufferRect", "__amd_rocclr_copyBufferRectAligned",
431-
"__amd_rocclr_copyBuffer", "__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBuffer",
432-
"__amd_rocclr_fillImage", "__amd_rocclr_scheduler", "__amd_rocclr_gwsInit"
430+
"__amd_rocclr_copyBufferToImage", "__amd_rocclr_copyBufferRect",
431+
"__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_copyBuffer",
432+
"__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBufferAligned",
433+
"__amd_rocclr_fillImage", "__amd_rocclr_scheduler",
434+
"__amd_rocclr_gwsInit"
433435
};
434436

435437
/*@}*/ // namespace pal

device/rocm/rocblit.cpp

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1938,21 +1938,40 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
19381938
synchronize();
19391939
return result;
19401940
} else {
1941-
uint fillType = FillBuffer;
1941+
uint fillType = FillBufferAligned;
19421942
size_t globalWorkOffset[3] = {0, 0, 0};
19431943
uint64_t fillSize = size[0] / patternSize;
19441944
size_t globalWorkSize = amd::alignUp(fillSize, 256);
19451945
size_t localWorkSize = 256;
1946-
bool dwordAligned = ((patternSize % sizeof(uint32_t)) == 0) ? true : false;
1946+
uint32_t alignment = (patternSize & 0x7) == 0 ?
1947+
sizeof(uint64_t) :
1948+
(patternSize & 0x3) == 0 ?
1949+
sizeof(uint32_t) :
1950+
(patternSize & 0x1) == 0 ?
1951+
sizeof(uint16_t) : sizeof(uint8_t);
19471952

19481953
// Program kernels arguments for the fill operation
19491954
cl_mem mem = as_cl<amd::Memory>(memory.owner());
1950-
if (dwordAligned) {
1955+
if (alignment == sizeof(uint64_t)) {
1956+
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
1957+
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
1958+
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
1959+
setArgument(kernels_[fillType], 3, sizeof(cl_mem), &mem);
1960+
} else if (alignment == sizeof(uint32_t)) {
1961+
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
1962+
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
1963+
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem);
1964+
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
1965+
} else if (alignment == sizeof(uint16_t)) {
19511966
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
19521967
setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
1968+
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
1969+
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
19531970
} else {
19541971
setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
19551972
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
1973+
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
1974+
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
19561975
}
19571976
Memory* gpuCB = dev().getRocMemory(constantBuffer_);
19581977
if (gpuCB == nullptr) {
@@ -1964,15 +1983,15 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
19641983
memcpy(constBuf, pattern, patternSize);
19651984

19661985
mem = as_cl<amd::Memory>(gpuCB->owner());
1967-
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem, constBufOffset);
1986+
setArgument(kernels_[fillType], 4, sizeof(cl_mem), &mem, constBufOffset);
19681987
uint64_t offset = origin[0];
1969-
if (dwordAligned) {
1970-
patternSize /= sizeof(uint32_t);
1971-
offset /= sizeof(uint32_t);
1972-
}
1973-
setArgument(kernels_[fillType], 3, sizeof(uint32_t), &patternSize);
1974-
setArgument(kernels_[fillType], 4, sizeof(offset), &offset);
1975-
setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize);
1988+
1989+
patternSize/= alignment;
1990+
offset /= alignment;
1991+
1992+
setArgument(kernels_[fillType], 5, sizeof(uint32_t), &patternSize);
1993+
setArgument(kernels_[fillType], 6, sizeof(offset), &offset);
1994+
setArgument(kernels_[fillType], 7, sizeof(fillSize), &fillSize);
19761995

19771996
// Create ND range object for the kernel's execution
19781997
amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);

device/rocm/rocblit.hpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ class KernelBlitManager : public DmaBlitManager {
238238
BlitCopyBufferRectAligned,
239239
BlitCopyBuffer,
240240
BlitCopyBufferAligned,
241-
FillBuffer,
241+
FillBufferAligned,
242242
FillImage,
243243
Scheduler,
244244
GwsInit,
@@ -462,9 +462,11 @@ class KernelBlitManager : public DmaBlitManager {
462462

463463
static const char* BlitName[KernelBlitManager::BlitTotal] = {
464464
"__amd_rocclr_copyImage", "__amd_rocclr_copyImage1DA", "__amd_rocclr_copyImageToBuffer",
465-
"__amd_rocclr_copyBufferToImage", "__amd_rocclr_copyBufferRect", "__amd_rocclr_copyBufferRectAligned",
466-
"__amd_rocclr_copyBuffer", "__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBuffer",
467-
"__amd_rocclr_fillImage", "__amd_rocclr_scheduler", "__amd_rocclr_gwsInit"
465+
"__amd_rocclr_copyBufferToImage", "__amd_rocclr_copyBufferRect",
466+
"__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_copyBuffer",
467+
"__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBufferAligned",
468+
"__amd_rocclr_fillImage", "__amd_rocclr_scheduler",
469+
"__amd_rocclr_gwsInit"
468470
};
469471

470472
inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,

0 commit comments

Comments
 (0)