 namespace vkcompute {
 namespace api {

+/*
+ * For PackedInt8 memory layouts, ensure that the scalar type used for the
+ * tensor is kInt8x4. Otherwise, return the original scalar type.
+ */
+vkapi::ScalarType get_effective_scalar_type(
+    const vkapi::ScalarType dtype,
+    const utils::GPUMemoryLayout memory_layout) {
+  vkapi::ScalarType effective_dtype = dtype;
+  if (utils::is_packed_int8_layout(memory_layout)) {
+    VK_CHECK_COND(dtype == vkapi::kInt8x4 || dtype == vkapi::kChar);
+    effective_dtype = vkapi::kInt8x4;
+  }
+  return effective_dtype;
+}
+
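// Illustrative sketch (not part of the patch): how the helper above behaves
// for two dtype/layout pairs. kChar, kInt8x4, and kPackedInt8_4W4C come from
// this diff; kWidthPacked appears further down in estimate_memory_layout().
//
//   // kChar combined with a packed int8 layout is promoted to kInt8x4.
//   get_effective_scalar_type(vkapi::kChar, utils::kPackedInt8_4W4C);
//   // -> vkapi::kInt8x4
//
//   // A non packed-int8 layout passes the dtype through unchanged.
//   get_effective_scalar_type(vkapi::kChar, utils::kWidthPacked);
//   // -> vkapi::kChar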
 /*
  * Used to infer the sizes of a tensor that would correspond to a given
  * VulkanImage.
@@ -187,6 +202,7 @@ std::vector<int64_t> calculate_padded_sizes(

 utils::uvec3 calculate_image_extents(
     const std::vector<int64_t>& padded_sizes,
+    const utils::GPUMemoryLayout memory_layout,
     const std::vector<int64_t>& axis_map,
     const int32_t packed_dim) {
   utils::uvec3 extents({1, 1, 1});
@@ -205,6 +221,26 @@ utils::uvec3 calculate_image_extents(
     extents[axis] = utils::safe_downcast<uint32_t>(padded_sizes.at(dim));
   }

+  // For "regular" tensor dtypes, 4 elements along the packed dim are packed
+  // into one texel (4-component vectorized type). However, for packed int8
+  // memory layouts, an additional level of packing is employed where 4 int8
+  // elements are packed into one int32, and then 4 int32 are packed into each
+  // ivec4 texel.
+  if (utils::is_packed_int8_layout(memory_layout)) {
+    // Each int in the ivec4 contains 4 channels. The overall ivec4 contains
+    // data for a 1Hx4Wx4C block of the input tensor.
+    if (memory_layout == utils::kPackedInt8_4W4C) {
+      VK_CHECK_COND(packed_dim == 2);
+      extents[axis_map.at(0)] = utils::div_up(extents[axis_map.at(0)], 4u);
+    }
+    // Each int in the ivec4 contains 4 elements along the width dim. The
+    // overall ivec4 contains data for a 4Hx4W block of the input tensor.
+    else if (memory_layout == utils::kPackedInt8_4H4W) {
+      VK_CHECK_COND(packed_dim == 0);
+      extents[axis_map.at(1)] = utils::div_up(extents[axis_map.at(1)], 4u);
+    }
+  }
+
   // axis_map[3] indicates the WHCN index of the dimension used for batch
   // concatenation. Thus a double lookup is required to determine the image axis
   // used for batch concatenation.
@@ -215,6 +251,7 @@ utils::uvec3 calculate_image_extents(

   VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0);
   extents[axis_map.at(packed_dim)] /= 4;
+
   return extents;
 }

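// Worked example (illustrative, not part of the patch): take a hypothetical
// int8 tensor with NCHW sizes {1, 8, 6, 8}, layout kPackedInt8_4W4C, the
// default axis map (image x/y/z <-> W/H/batch-folded C), and packed_dim == 2:
//
//   base extents from the padded sizes:       {8, 6, 8}   (W, H, C*N)
//   4W4C packing along the width axis:        x = div_up(8, 4) = 2  -> {2, 6, 8}
//   final division along the packed dim (C):  z = 8 / 4 = 2         -> {2, 6, 2}
//
// Each ivec4 texel then covers a 1Hx4Wx4C block, i.e. 16 int8 values, and
// 2 * 6 * 2 = 24 texels * 16 = 384 values matches 1 * 8 * 6 * 8 = 384.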
@@ -247,35 +284,72 @@ utils::uvec3 calculate_logical_limits(
  */
 utils::uvec3 calculate_logical_limits(
     const std::vector<int64_t>& sizes,
+    const utils::GPUMemoryLayout memory_layout,
     const std::vector<int64_t>& axis_map,
     const int32_t packed_dim) {
   return calculate_logical_limits(
       calculate_image_extents(
-          calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim),
+          calculate_padded_sizes(sizes, packed_dim),
+          memory_layout,
+          axis_map,
+          packed_dim),
       axis_map);
 }

 int64_t calculate_gpu_buffer_numel(
+    const std::vector<int64_t>& sizes,
+    const utils::GPUMemoryLayout memory_layout,
+    const vkapi::ScalarType dtype) {
+  size_t numel;
+
+  // Mirrors the logic in calculate_image_extents for packed int8 memory layouts
+  if (dtype == vkapi::kInt8x4) {
+    VK_CHECK_COND(utils::is_packed_int8_layout(memory_layout));
+    std::vector<int64_t> blocks_in_dim =
+        flip_and_unsqueeze<int64_t>(sizes, kTensorSizes, 0);
+    // Each ivec4 contains data for a 1Hx4Wx4C block of the input
+    if (memory_layout == utils::kPackedInt8_4W4C) {
+      blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]);
+      blocks_in_dim[2] = utils::div_up_4(blocks_in_dim[2]);
+    }
+    // Each ivec4 contains data for a 4Hx4W block of the input
+    else if (memory_layout == utils::kPackedInt8_4H4W) {
+      blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]);
+      blocks_in_dim[1] = utils::div_up_4(blocks_in_dim[1]);
+    } else {
+      VK_THROW("Unhandled packed int8 memory layout!");
+    }
+    // Each block is represented as an ivec4, and the base dtype of the buffer
+    // is int. Therefore, need to multiply the number of blocks by 4 to obtain
+    // the number of int elements in the data buffer.
+    numel = utils::multiply_integers(blocks_in_dim) * 4;
+  }
+  // Case for "regular" dtypes/memory layouts
+  else {
+    numel = utils::multiply_integers(sizes);
+
+    // For 8-bit types, align to the next multiple of 4. For devices that do not
+    // support 8-bit storage buffers, the tensor data will be interpreted as an
+    // array of int32 instead.
+    if (vkapi::element_size(dtype) == 1) {
+      numel = utils::align_up_4(numel);
+    }
+  }
+  return numel;
+}
+
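// Worked example (illustrative, not part of the patch), continuing with the
// hypothetical NCHW sizes {1, 8, 6, 8} and kPackedInt8_4W4C from above, and
// assuming flip_and_unsqueeze yields the WHCN-ordered sizes {8, 6, 8, 1}:
//
//   blocks_in_dim = {div_up_4(8), 6, div_up_4(8), 1} = {2, 6, 2, 1}
//   numel         = (2 * 6 * 2 * 1) blocks * 4 ints per ivec4 = 96
//
// The 96 int32 elements hold 96 * 4 = 384 int8 values, matching the tensor's
// 1 * 8 * 6 * 8 = 384 logical elements.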
+int64_t calculate_staging_or_gpu_buffer_numel(
     Context* const context,
     const std::vector<int64_t>& sizes,
     const utils::uvec3 image_extents,
     const utils::StorageType storage_type,
+    const utils::GPUMemoryLayout memory_layout,
     const vkapi::ScalarType dtype) {
   // For texture backed tensors, simply multiply the total number of texels by 4
   if (storage_type != utils::kBuffer) {
     return image_extents[0] * image_extents[1] * image_extents[2] * 4;
   }
-  const bool is_int8 = dtype == vkapi::kChar;
-  const bool int8_supported =
-      context->adapter_ptr()->has_full_int8_buffers_support();
-  const size_t numel = utils::multiply_integers(sizes);
-  // For int8 tensors, if the device does not support int8 buffers, then int32
-  // is used instead to represent the buffer data. Therefore the number of
-  // elements in the buffer is aligned to the next multiple of 4.
-  if (is_int8 && !int8_supported) {
-    return utils::align_up_4(numel);
-  }
-  return numel;
+  return calculate_gpu_buffer_numel(sizes, memory_layout, dtype);
 }

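// For the same hypothetical tensor backed by a texture, the branch above is
// consistent with the buffer path: the image extents computed earlier are
// {2, 6, 2}, so the staging numel is 2 * 6 * 2 * 4 = 96, matching the 96
// int32 elements returned by calculate_gpu_buffer_numel. (Illustrative only;
// assumes the default axis map.)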
 template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
@@ -332,10 +406,12 @@ vkapi::VulkanImage allocate_image(
     Context* const context_ptr,
     utils::uvec3& image_extents,
     const utils::StorageType storage_type,
-    const VkFormat image_format,
+    const vkapi::ScalarType dtype,
     const bool allocate_memory) {
   vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr();

+  const VkFormat image_format = vkcompute::vkapi::to_vkformat(dtype);
+
   vkapi::ImageSampler::Properties sampler_props{
       VK_FILTER_NEAREST,
       VK_SAMPLER_MIPMAP_MODE_NEAREST,
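// Note (illustrative, not part of the patch): resolving the VkFormat from the
// effective dtype inside allocate_image means callers now pass the scalar type
// directly. For kInt8x4, where each texel holds an ivec4, one would expect
// to_vkformat to resolve to a 32-bit signed integer RGBA format (e.g.
// VK_FORMAT_R32G32B32A32_SINT); this is an assumption, not verified here.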
@@ -420,6 +496,7 @@ vkapi::VulkanBuffer allocate_buffer(
 vTensorStorage::vTensorStorage(
     Context* const context,
     const utils::StorageType storage_type,
+    const utils::GPUMemoryLayout memory_layout,
     const std::vector<int64_t>& axis_map,
     const int32_t packed_dim,
     const std::vector<int64_t>& sizes,
@@ -429,20 +506,22 @@ vTensorStorage::vTensorStorage(
       storage_type_{storage_type},
       image_extents_(calculate_image_extents(
           calculate_padded_sizes(sizes, packed_dim),
+          memory_layout,
           axis_map,
           packed_dim)),
-      buffer_length_{calculate_gpu_buffer_numel(
+      buffer_length_{calculate_staging_or_gpu_buffer_numel(
           context_,
           sizes,
           image_extents_,
           storage_type,
+          memory_layout,
           dtype)},
       buffer_offset_{0},
       image_(allocate_image(
           context_,
           image_extents_,
           storage_type_,
-          to_vkformat(dtype),
+          dtype,
           allocate_memory)),
       buffer_(allocate_buffer(
           context_,
@@ -553,7 +632,7 @@ vTensor::vTensor(
     const utils::GPUMemoryLayout memory_layout,
     const bool allocate_memory,
     const utils::AxisMapLayout axis_map_layout)
-    : dtype_(dtype),
+    : dtype_(get_effective_scalar_type(dtype, memory_layout)),
       // Calculate tensor metadata
       sizes_(sizes.begin(), sizes.end()),
       packed_dim_(utils::to_packed_dim<int32_t>(memory_layout)),
@@ -576,6 +655,7 @@ vTensor::vTensor(
       storage_(std::make_shared<vTensorStorage>(
           context,
           storage_type,
+          memory_layout,
           axis_map_,
           packed_dim_,
           sizes,
@@ -785,6 +865,16 @@ vkapi::VulkanBuffer& vTensor::buffer(
 }

 utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
+  if (dtype_ == vkapi::kInt8x4) {
+    switch (packed_dim_) {
+      case WHCN::kChannelsDim:
+        return utils::kPackedInt8_4W4C;
+      case WHCN::kWidthDim:
+        return utils::kPackedInt8_4H4W;
+      default:
+        VK_THROW("Invalid packed dim for Tensor with kInt8x4 type");
+    }
+  }
   switch (packed_dim_) {
     case WHCN::kWidthDim:
       return utils::kWidthPacked;
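// Sketch of the round trip implied by this diff (illustrative only): a vTensor
// constructed with dtype kChar and layout kPackedInt8_4W4C stores
// dtype_ == kInt8x4 (via get_effective_scalar_type) and
// packed_dim_ == WHCN::kChannelsDim, so estimate_memory_layout() can recover
// kPackedInt8_4W4C from dtype_ and packed_dim_ alone.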
@@ -914,8 +1004,8 @@ void vTensor::update_metadata() {
       flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_);
   uniform_data_->strides_v =
       flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_);
-  uniform_data_->logical_limits.limits =
-      calculate_logical_limits(sizes_, axis_map_, packed_dim_);
+  uniform_data_->logical_limits.limits = calculate_logical_limits(
+      sizes_, estimate_memory_layout(), axis_map_, packed_dim_);

   if (sizes_uniform_offset_ != kUniformOffsetUnset) {
     uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_);
@@ -942,11 +1032,15 @@ void vTensor::update_metadata() {
 }

 void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
+  utils::GPUMemoryLayout est_memory_layout = estimate_memory_layout();
   if (storage_type() != utils::kBuffer) {
     // For texture storage check that the current texture is large enough for
     // the new sizes of the tensor.
     utils::uvec3 virtual_extents = calculate_image_extents(
-        calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_);
+        calculate_padded_sizes(sizes_, packed_dim_),
+        est_memory_layout,
+        axis_map_,
+        packed_dim_);

     bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0];
     valid_resize =
@@ -958,9 +1052,10 @@ void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
         valid_resize,
         "tensor sizes requires a larger texture than the current one.");
   } else {
-    // For buffer storage check that the current buffer is large enough for the
-    // new sizes of the tensor.
-    int64_t numel = utils::multiply_integers(sizes);
+    // For buffer storage check that the current buffer is large enough for
+    // the new sizes of the tensor.
+    int64_t numel =
+        calculate_gpu_buffer_numel(sizes_, est_memory_layout, dtype_);
     bool valid_resize =
         numel + storage_->buffer_offset_ <= storage_->buffer_length_;
     VK_CHECK_COND(