@@ -769,29 +769,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
769769 return Result;
770770}
771771
772- // HIP has no memset functions that allow setting values more than 4 bytes. UR
773- // API lets you pass an arbitrary "pattern" to the buffer fill, which can be
774- // more than 4 bytes. We must break up the pattern into 1 byte values, and set
775- // the buffer using multiple strided calls. The first 4 patterns are set using
776- // hipMemsetD32Async then all subsequent 1 byte patterns are set using
777- // hipMemset2DAsync which is called for each pattern.
778- ur_result_t commonMemSetLargePattern (hipStream_t Stream, uint32_t PatternSize,
779- size_t Size, const void *pPattern,
780- hipDeviceptr_t Ptr) {
781- // Calculate the number of patterns, stride, number of times the pattern
782- // needs to be applied, and the number of times the first 32 bit pattern
783- // needs to be applied.
772+ static inline void memsetRemainPattern (hipStream_t Stream, uint32_t PatternSize,
773+ size_t Size, const void *pPattern,
774+ hipDeviceptr_t Ptr) {
775+
776+ // Calculate the number of patterns, stride and the number of times the
777+ // pattern needs to be applied.
784778 auto NumberOfSteps = PatternSize / sizeof (uint8_t );
785779 auto Pitch = NumberOfSteps * sizeof (uint8_t );
786780 auto Height = Size / NumberOfSteps;
787- auto Count32 = Size / sizeof (uint32_t );
788781
789- // Get 4-byte chunk of the pattern and call hipMemsetD32Async
790- auto Value = *(static_cast <const uint32_t *>(pPattern));
791- UR_CHECK_ERROR (hipMemsetD32Async (Ptr, Value, Count32, Stream));
792782 for (auto step = 4u ; step < NumberOfSteps; ++step) {
793783 // take 1 byte of the pattern
794- Value = *(static_cast <const uint8_t *>(pPattern) + step);
784+ auto Value = *(static_cast <const uint8_t *>(pPattern) + step);
795785
796786 // offset the pointer to the part of the buffer we want to write to
797787 auto OffsetPtr = reinterpret_cast <void *>(reinterpret_cast <uint8_t *>(Ptr) +
@@ -801,6 +791,55 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
801791 UR_CHECK_ERROR (hipMemset2DAsync (OffsetPtr, Pitch, Value, sizeof (uint8_t ),
802792 Height, Stream));
803793 }
794+ }
795+
796+ // HIP has no memset functions that allow setting values more than 4 bytes. UR
797+ // API lets you pass an arbitrary "pattern" to the buffer fill, which can be
798+ // more than 4 bytes. We must break up the pattern into 1 byte values, and set
799+ // the buffer using multiple strided calls. The first 4 patterns are set using
800+ // hipMemsetD32Async then all subsequent 1 byte patterns are set using
801+ // hipMemset2DAsync which is called for each pattern.
802+ ur_result_t commonMemSetLargePattern (hipStream_t Stream, uint32_t PatternSize,
803+ size_t Size, const void *pPattern,
804+ hipDeviceptr_t Ptr) {
805+
806+ // Get 4-byte chunk of the pattern and call hipMemsetD32Async
807+ auto Count32 = Size / sizeof (uint32_t );
808+ auto Value = *(static_cast <const uint32_t *>(pPattern));
809+ UR_CHECK_ERROR (hipMemsetD32Async (Ptr, Value, Count32, Stream));
810+
811+ // There is a bug in ROCm prior to 6.0.0 version which causes hipMemset2D
812+ // to behave incorrectly when acting on host pinned memory.
813+ // In such a case, the memset operation is partially emulated with memcpy.
814+ #if HIP_VERSION_MAJOR < 6
815+ hipPointerAttribute_t ptrAttribs{};
816+ UR_CHECK_ERROR (hipPointerGetAttributes (&ptrAttribs, (const void *)Ptr));
817+
818+ // The hostPointer attribute is non-null also for shared memory allocations.
819+ // To make sure that this workaround only executes for host pinned memory, we
820+ // need to check that isManaged attribute is false.
821+ if (ptrAttribs.hostPointer && !ptrAttribs.isManaged ) {
822+ const auto NumOfCopySteps = Size / PatternSize;
823+ const auto Offset = sizeof (uint32_t );
824+ const auto LeftPatternSize = PatternSize - Offset;
825+ const auto OffsetPatternPtr = reinterpret_cast <const void *>(
826+ reinterpret_cast <const uint8_t *>(pPattern) + Offset);
827+
828+ // Loop through the memory area to memset, advancing each time by the
829+ // PatternSize and memcpy the left over pattern bits.
830+ for (uint32_t i = 0 ; i < NumOfCopySteps; ++i) {
831+ auto OffsetDstPtr = reinterpret_cast <void *>(
832+ reinterpret_cast <uint8_t *>(Ptr) + Offset + i * PatternSize);
833+ UR_CHECK_ERROR (hipMemcpyAsync (OffsetDstPtr, OffsetPatternPtr,
834+ LeftPatternSize, hipMemcpyHostToHost,
835+ Stream));
836+ }
837+ } else {
838+ memsetRemainPattern (Stream, PatternSize, Size, pPattern, Ptr);
839+ }
840+ #else
841+ memsetRemainPattern (Stream, PatternSize, Size, pPattern, Ptr);
842+ #endif
804843 return UR_RESULT_SUCCESS;
805844}
806845
0 commit comments