@@ -2034,21 +2034,40 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
20342034 synchronize ();
20352035 return result;
20362036 } else {
2037- uint fillType = FillBuffer ;
2037+ uint fillType = FillBufferAligned ;
20382038 size_t globalWorkOffset[3 ] = {0 , 0 , 0 };
20392039 uint64_t fillSize = size[0 ] / patternSize;
20402040 size_t globalWorkSize = amd::alignUp (fillSize, 256 );
20412041 size_t localWorkSize = 256 ;
2042- bool dwordAligned = ((patternSize % sizeof (uint32_t )) == 0 ) ? true : false ;
2042+ uint32_t alignment = (patternSize & 0x7 ) == 0 ?
2043+ sizeof (uint64_t ) :
2044+ (patternSize & 0x3 ) == 0 ?
2045+ sizeof (uint32_t ) :
2046+ (patternSize & 0x1 ) == 0 ?
2047+ sizeof (uint16_t ) : sizeof (uint8_t );
20432048
20442049 // Program kernels arguments for the fill operation
20452050 Memory* mem = &gpuMem (memory);
2046- if (dwordAligned) {
2047- setArgument (kernels_[fillType], 0 , sizeof (cl_mem), NULL );
2051+ if (alignment == sizeof (uint64_t )) {
2052+ setArgument (kernels_[fillType], 0 , sizeof (cl_mem), nullptr );
2053+ setArgument (kernels_[fillType], 1 , sizeof (cl_mem), nullptr );
2054+ setArgument (kernels_[fillType], 2 , sizeof (cl_mem), nullptr );
2055+ setArgument (kernels_[fillType], 3 , sizeof (cl_mem), &mem);
2056+ } else if (alignment == sizeof (uint32_t )) {
2057+ setArgument (kernels_[fillType], 0 , sizeof (cl_mem), nullptr );
2058+ setArgument (kernels_[fillType], 1 , sizeof (cl_mem), nullptr );
2059+ setArgument (kernels_[fillType], 2 , sizeof (cl_mem), &mem);
2060+ setArgument (kernels_[fillType], 3 , sizeof (cl_mem), nullptr );
2061+ } else if (alignment == sizeof (uint16_t )) {
2062+ setArgument (kernels_[fillType], 0 , sizeof (cl_mem), nullptr );
20482063 setArgument (kernels_[fillType], 1 , sizeof (cl_mem), &mem);
2064+ setArgument (kernels_[fillType], 2 , sizeof (cl_mem), nullptr );
2065+ setArgument (kernels_[fillType], 3 , sizeof (cl_mem), nullptr );
20492066 } else {
20502067 setArgument (kernels_[fillType], 0 , sizeof (cl_mem), &mem);
2051- setArgument (kernels_[fillType], 1 , sizeof (cl_mem), NULL );
2068+ setArgument (kernels_[fillType], 1 , sizeof (cl_mem), nullptr );
2069+ setArgument (kernels_[fillType], 2 , sizeof (cl_mem), nullptr );
2070+ setArgument (kernels_[fillType], 3 , sizeof (cl_mem), nullptr );
20522071 }
20532072 Memory* gpuCB = dev ().getGpuMemory (constantBuffer_);
20542073 if (gpuCB == NULL ) {
@@ -2057,15 +2076,15 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
20572076 void * constBuf = gpuCB->map (&gpu (), Resource::WriteOnly);
20582077 memcpy (constBuf, pattern, patternSize);
20592078 gpuCB->unmap (&gpu ());
2060- setArgument (kernels_[fillType], 2 , sizeof (cl_mem), &gpuCB);
2079+ setArgument (kernels_[fillType], 4 , sizeof (cl_mem), &gpuCB);
20612080 uint64_t offset = origin[0 ];
2062- if (dwordAligned) {
2063- patternSize /= sizeof ( uint32_t ) ;
2064- offset /= sizeof ( uint32_t ) ;
2065- }
2066- setArgument (kernels_[fillType], 3 , sizeof (uint32_t ), &patternSize);
2067- setArgument (kernels_[fillType], 4 , sizeof (offset), &offset);
2068- setArgument (kernels_[fillType], 5 , sizeof (fillSize), &fillSize);
2081+
2082+ patternSize/= alignment ;
2083+ offset /= alignment ;
2084+
2085+ setArgument (kernels_[fillType], 5 , sizeof (uint32_t ), &patternSize);
2086+ setArgument (kernels_[fillType], 6 , sizeof (offset), &offset);
2087+ setArgument (kernels_[fillType], 7 , sizeof (fillSize), &fillSize);
20692088
20702089 // Create ND range object for the kernel's execution
20712090 amd::NDRangeContainer ndrange (1 , globalWorkOffset, &globalWorkSize, &localWorkSize);
0 commit comments