Skip to content

Commit e354592

Browse files
committed
[CUDA][Bindless] Address USM normalized type image creation failure and functionality
The specialised formats, CU_AD_FORMAT_{U|S}NORM_INT{8|16}X{1|2|4}, used to specify a format that is normalized, cannot be applied to images backed by USM memory, causing a failure. This commit addresses that issue by using the standard integer formats and making sure the CU_TRSF_READ_AS_INTEGER flag is set appropriately. Another issue fixed here is that users with CUDA toolkits pre 11.5 are unable to enable normalized types due to the specialised formats being introduced in CUDA 11.5.
1 parent d357964 commit e354592

File tree

2 files changed

+48
-124
lines changed

2 files changed

+48
-124
lines changed

source/adapters/cuda/image.cpp

Lines changed: 43 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -65,101 +65,59 @@ ur_result_t urCalculateNumChannels(ur_image_channel_order_t order,
6565
/// format if not nullptr.
6666
/// /param return_pixel_size_bytes will be set to the pixel
6767
/// byte size if not nullptr.
68+
/// /param return_normalized_dtype_flag will be set if the
69+
/// data type is normalized if not nullptr.
6870
ur_result_t
6971
urToCudaImageChannelFormat(ur_image_channel_type_t image_channel_type,
7072
ur_image_channel_order_t image_channel_order,
7173
CUarray_format *return_cuda_format,
72-
size_t *return_pixel_size_bytes) {
74+
size_t *return_pixel_size_bytes,
75+
unsigned int *return_normalized_dtype_flag) {
7376

74-
CUarray_format cuda_format;
77+
CUarray_format cuda_format = CU_AD_FORMAT_UNSIGNED_INT8;
7578
size_t pixel_size_bytes = 0;
7679
unsigned int num_channels = 0;
80+
unsigned int normalized_dtype_flag = 0;
7781
UR_CHECK_ERROR(urCalculateNumChannels(image_channel_order, &num_channels));
7882

7983
switch (image_channel_type) {
80-
#define CASE(FROM, TO, SIZE) \
84+
#define CASE(FROM, TO, SIZE, NORM) \
8185
case FROM: { \
8286
cuda_format = TO; \
8387
pixel_size_bytes = SIZE * num_channels; \
88+
normalized_dtype_flag = NORM; \
8489
break; \
8590
}
8691

87-
CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, CU_AD_FORMAT_UNSIGNED_INT8, 1)
88-
CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8, CU_AD_FORMAT_SIGNED_INT8, 1)
89-
CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, CU_AD_FORMAT_UNSIGNED_INT16, 2)
90-
CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16, CU_AD_FORMAT_SIGNED_INT16, 2)
91-
CASE(UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT, CU_AD_FORMAT_HALF, 2)
92-
CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, CU_AD_FORMAT_UNSIGNED_INT32, 4)
93-
CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32, CU_AD_FORMAT_SIGNED_INT32, 4)
94-
CASE(UR_IMAGE_CHANNEL_TYPE_FLOAT, CU_AD_FORMAT_FLOAT, 4)
92+
CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, CU_AD_FORMAT_UNSIGNED_INT8, 1, 0)
93+
CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8, CU_AD_FORMAT_SIGNED_INT8, 1, 0)
94+
CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, CU_AD_FORMAT_UNSIGNED_INT16, 2,
95+
0)
96+
CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16, CU_AD_FORMAT_SIGNED_INT16, 2, 0)
97+
CASE(UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT, CU_AD_FORMAT_HALF, 2, 0)
98+
CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, CU_AD_FORMAT_UNSIGNED_INT32, 4,
99+
0)
100+
CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32, CU_AD_FORMAT_SIGNED_INT32, 4, 0)
101+
CASE(UR_IMAGE_CHANNEL_TYPE_FLOAT, CU_AD_FORMAT_FLOAT, 4, 0)
102+
CASE(UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, CU_AD_FORMAT_UNSIGNED_INT8, 1, 1)
103+
CASE(UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, CU_AD_FORMAT_SIGNED_INT8, 1, 1)
104+
CASE(UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, CU_AD_FORMAT_UNSIGNED_INT16, 2, 1)
105+
CASE(UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, CU_AD_FORMAT_SIGNED_INT16, 2, 1)
95106

96107
#undef CASE
97108
default:
98109
break;
99110
}
100111

101-
// These new formats were brought in in CUDA 11.5
102-
#if CUDA_VERSION >= 11050
103-
104-
// If none of the above channel types were passed, check those below
105-
if (pixel_size_bytes == 0) {
106-
107-
// We can't use a switch statement here because these single
108-
// UR_IMAGE_CHANNEL_TYPEs can correspond to multiple [u/s]norm CU_AD_FORMATs
109-
// depending on the number of channels. We use a std::map instead to
110-
// retrieve the correct CUDA format
111-
112-
// map < <channel type, num channels> , <CUDA format, data type byte size> >
113-
const std::map<std::pair<ur_image_channel_type_t, uint32_t>,
114-
std::pair<CUarray_format, uint32_t>>
115-
norm_channel_type_map{
116-
{{UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 1},
117-
{CU_AD_FORMAT_UNORM_INT8X1, 1}},
118-
{{UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 2},
119-
{CU_AD_FORMAT_UNORM_INT8X2, 2}},
120-
{{UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 4},
121-
{CU_AD_FORMAT_UNORM_INT8X4, 4}},
122-
123-
{{UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, 1},
124-
{CU_AD_FORMAT_SNORM_INT8X1, 1}},
125-
{{UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, 2},
126-
{CU_AD_FORMAT_SNORM_INT8X2, 2}},
127-
{{UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, 4},
128-
{CU_AD_FORMAT_SNORM_INT8X4, 4}},
129-
130-
{{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, 1},
131-
{CU_AD_FORMAT_UNORM_INT16X1, 2}},
132-
{{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, 2},
133-
{CU_AD_FORMAT_UNORM_INT16X2, 4}},
134-
{{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, 4},
135-
{CU_AD_FORMAT_UNORM_INT16X4, 8}},
136-
137-
{{UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, 1},
138-
{CU_AD_FORMAT_SNORM_INT16X1, 2}},
139-
{{UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, 2},
140-
{CU_AD_FORMAT_SNORM_INT16X2, 4}},
141-
{{UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, 4},
142-
{CU_AD_FORMAT_SNORM_INT16X4, 8}},
143-
};
144-
145-
try {
146-
auto cuda_format_and_size = norm_channel_type_map.at(
147-
std::make_pair(image_channel_type, num_channels));
148-
cuda_format = cuda_format_and_size.first;
149-
pixel_size_bytes = cuda_format_and_size.second;
150-
} catch (const std::out_of_range &) {
151-
return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT;
152-
}
153-
}
154-
155-
#endif
156-
157112
if (return_cuda_format) {
158113
*return_cuda_format = cuda_format;
159114
}
160115
if (return_pixel_size_bytes) {
161116
*return_pixel_size_bytes = pixel_size_bytes;
162117
}
118+
if (return_normalized_dtype_flag) {
119+
*return_normalized_dtype_flag = normalized_dtype_flag;
120+
}
163121
return UR_RESULT_SUCCESS;
164122
}
165123

@@ -189,45 +147,6 @@ cudaToUrImageChannelFormat(CUarray_format cuda_format,
189147
UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT);
190148
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_FLOAT,
191149
UR_IMAGE_CHANNEL_TYPE_FLOAT);
192-
#if CUDA_VERSION >= 11050
193-
194-
// Note that the CUDA UNORM and SNORM formats also encode the number of
195-
// channels.
196-
// Since UR does not encode this, we map different CUDA formats to the same
197-
// UR channel type.
198-
// Since this function is only called from `urBindlessImagesImageGetInfoExp`
199-
// which has access to `CUDA_ARRAY3D_DESCRIPTOR`, we can determine the
200-
// number of channels in the calling function.
201-
202-
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT8X1,
203-
UR_IMAGE_CHANNEL_TYPE_UNORM_INT8);
204-
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT8X2,
205-
UR_IMAGE_CHANNEL_TYPE_UNORM_INT8);
206-
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT8X4,
207-
UR_IMAGE_CHANNEL_TYPE_UNORM_INT8);
208-
209-
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT16X1,
210-
UR_IMAGE_CHANNEL_TYPE_UNORM_INT16);
211-
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT16X2,
212-
UR_IMAGE_CHANNEL_TYPE_UNORM_INT16);
213-
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT16X4,
214-
UR_IMAGE_CHANNEL_TYPE_UNORM_INT16);
215-
216-
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT8X1,
217-
UR_IMAGE_CHANNEL_TYPE_SNORM_INT8);
218-
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT8X2,
219-
UR_IMAGE_CHANNEL_TYPE_SNORM_INT8);
220-
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT8X4,
221-
UR_IMAGE_CHANNEL_TYPE_SNORM_INT8);
222-
223-
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT16X1,
224-
UR_IMAGE_CHANNEL_TYPE_SNORM_INT16);
225-
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT16X2,
226-
UR_IMAGE_CHANNEL_TYPE_SNORM_INT16);
227-
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT16X4,
228-
UR_IMAGE_CHANNEL_TYPE_SNORM_INT16);
229-
#endif
230-
#undef MAP
231150
default:
232151
return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT;
233152
}
@@ -236,6 +155,7 @@ cudaToUrImageChannelFormat(CUarray_format cuda_format,
236155
ur_result_t urTextureCreate(ur_sampler_handle_t hSampler,
237156
const ur_image_desc_t *pImageDesc,
238157
const CUDA_RESOURCE_DESC &ResourceDesc,
158+
const unsigned int normalized_dtype_flag,
239159
ur_exp_image_native_handle_t *phRetImage) {
240160

241161
try {
@@ -306,8 +226,9 @@ ur_result_t urTextureCreate(ur_sampler_handle_t hSampler,
306226

307227
// CUDA default promotes 8-bit and 16-bit integers to float between [0,1]
308228
// This flag prevents this behaviour.
309-
ImageTexDesc.flags |= CU_TRSF_READ_AS_INTEGER;
310-
229+
if (!normalized_dtype_flag) {
230+
ImageTexDesc.flags |= CU_TRSF_READ_AS_INTEGER;
231+
}
311232
// Cubemap attributes
312233
ur_exp_sampler_cubemap_filter_mode_t CubemapFilterModeProp =
313234
hSampler->getCubemapFilterMode();
@@ -413,9 +334,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp(
413334
UR_CHECK_ERROR(urCalculateNumChannels(pImageFormat->channelOrder,
414335
&array_desc.NumChannels));
415336

416-
UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType,
417-
pImageFormat->channelOrder,
418-
&array_desc.Format, nullptr));
337+
UR_CHECK_ERROR(urToCudaImageChannelFormat(
338+
pImageFormat->channelType, pImageFormat->channelOrder, &array_desc.Format,
339+
nullptr, nullptr));
419340

420341
array_desc.Flags = 0; // No flags required
421342
array_desc.Width = pImageDesc->width;
@@ -534,7 +455,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
534455
size_t PixelSizeBytes;
535456
UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType,
536457
pImageFormat->channelOrder, &format,
537-
&PixelSizeBytes));
458+
&PixelSizeBytes, nullptr));
538459

539460
try {
540461

@@ -579,9 +500,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
579500

580501
CUarray_format format;
581502
size_t PixelSizeBytes;
582-
UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType,
583-
pImageFormat->channelOrder, &format,
584-
&PixelSizeBytes));
503+
unsigned int normalized_dtype_flag;
504+
UR_CHECK_ERROR(urToCudaImageChannelFormat(
505+
pImageFormat->channelType, pImageFormat->channelOrder, &format,
506+
&PixelSizeBytes, &normalized_dtype_flag));
585507

586508
try {
587509
CUDA_RESOURCE_DESC image_res_desc = {};
@@ -630,8 +552,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
630552
return UR_RESULT_ERROR_INVALID_VALUE;
631553
}
632554

633-
UR_CHECK_ERROR(
634-
urTextureCreate(hSampler, pImageDesc, image_res_desc, phImage));
555+
UR_CHECK_ERROR(urTextureCreate(hSampler, pImageDesc, image_res_desc,
556+
normalized_dtype_flag, phImage));
635557

636558
} catch (ur_result_t Err) {
637559
return Err;
@@ -667,7 +589,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
667589
// later.
668590
UR_CHECK_ERROR(urToCudaImageChannelFormat(pSrcImageFormat->channelType,
669591
pSrcImageFormat->channelOrder,
670-
nullptr, &PixelSizeBytes));
592+
nullptr, &PixelSizeBytes, nullptr));
671593

672594
try {
673595
ScopedContext Active(hQueue->getDevice());
@@ -1146,8 +1068,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp(
11461068
urCalculateNumChannels(pImageFormat->channelOrder, &NumChannels));
11471069

11481070
CUarray_format format;
1149-
UR_CHECK_ERROR(urToCudaImageChannelFormat(
1150-
pImageFormat->channelType, pImageFormat->channelOrder, &format, nullptr));
1071+
UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType,
1072+
pImageFormat->channelOrder, &format,
1073+
nullptr, nullptr));
11511074

11521075
try {
11531076
ScopedContext Active(hDevice);

source/adapters/cuda/image.hpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,15 @@ ur_result_t
2121
urToCudaImageChannelFormat(ur_image_channel_type_t image_channel_type,
2222
ur_image_channel_order_t image_channel_order,
2323
CUarray_format *return_cuda_format,
24-
size_t *return_pixel_types_size_bytes);
24+
size_t *return_pixel_types_size_bytes,
25+
unsigned int *return_normalized_dtype_flag);
2526

2627
ur_result_t
2728
cudaToUrImageChannelFormat(CUarray_format cuda_format,
2829
ur_image_channel_type_t *return_image_channel_type);
2930

30-
ur_result_t urTextureCreate(ur_context_handle_t hContext,
31-
ur_sampler_desc_t SamplerDesc,
31+
ur_result_t urTextureCreate(ur_sampler_handle_t hSampler,
3232
const ur_image_desc_t *pImageDesc,
33-
CUDA_RESOURCE_DESC ResourceDesc,
33+
const CUDA_RESOURCE_DESC &ResourceDesc,
34+
const unsigned int normalized_dtype_flag,
3435
ur_exp_image_native_handle_t *phRetImage);

0 commit comments

Comments
 (0)