@@ -256,9 +256,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
256256 return UR_RESULT_SUCCESS;
257257 }
258258
259- // OpenCL only supports pattern sizes as large as the largest CL type
260- // (double16/long16 - 128 bytes), anything larger we need to do on the host
261- // side and copy it into the target allocation.
259+ // OpenCL only supports pattern sizes that are a power of 2 and no larger
260+ // than the largest CL type (double16/long16 - 128 bytes). For any pattern
261+ // that is larger or not a power of 2, we build the fill on the host side
262+ // and copy it into the target allocation.
262263 clHostMemAllocINTEL_fn HostMemAlloc = nullptr ;
263264 UR_RETURN_ON_FAILURE (cl_ext::getExtFuncFromContext<clHostMemAllocINTEL_fn>(
264265 CLContext, cl_ext::ExtFuncPtrCache->clHostMemAllocINTELCache ,
@@ -275,14 +276,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
275276 cl_ext::MemBlockingFreeName, &USMFree));
276277
277278 cl_int ClErr = CL_SUCCESS;
278- auto HostBuffer = static_cast <uint64_t *>(
279+ auto HostBuffer = static_cast <unsigned char *>(
279280 HostMemAlloc (CLContext, nullptr , size, 0 , &ClErr));
280281 CL_RETURN_ON_FAILURE (ClErr);
281282
282- auto NumValues = size / sizeof ( uint64_t ) ;
283- auto NumChunks = patternSize / sizeof ( uint64_t );
284- for ( size_t i = 0 ; i < NumValues; i++) {
285- HostBuffer[i] = static_cast < const uint64_t *>( pPattern)[i % NumChunks] ;
283+ auto NumChunks = size / patternSize ;
284+ for ( size_t i = 0 ; i < NumChunks; i++) {
285+ auto Dest = HostBuffer + i * patternSize;
286+ memcpy (Dest, pPattern, patternSize) ;
286287 }
287288
288289 cl_event CopyEvent = nullptr ;
0 commit comments