
Commit 1ca431c

Author: ssjia (committed)
[ET-VK] Add kInt8x4 dtype and GPUMemoryLayouts for packed quantized tensors
## Motivation

Lay the foundations for being able to execute statically quantized CNNs with ET-VK. Unlike with dynamic quantization, static quantization allows the output of quantized operators to stay in integer representation and be fed directly to the next quantized operator.

## Context

Typically, int8 quantized tensors can be represented by simply having the tensor use the int8 data type. While this is possible in ET-VK, in practice quantized operators expect int8 quantized tensors to be packed so that 16 8-bit values are stored in each `ivec4`, meaning quantized int8 tensors load/store with a granularity of 16 elements. The reason for this is twofold:

* Support for the shader int8 / int8 storage buffer extensions is not guaranteed, meaning some devices do not allow using int8 types in shaders.
* We have found that loads/stores from storage buffers/textures that use int8 data types sometimes result in worse memory load performance, because vectorized load/store instructions are not used.

Therefore, ET-VK needs a way to mark that a quantized tensor should:

1. Use int32 as the underlying data type for the storage buffer/texture
2. Account for the block-packing that may be used

## Changes

First, introduce the `Int8x4` dtype that can be used for packed int8 tensors. This dtype is functionally the same as `Int`, but denotes that each int32 actually contains 4 packed 8-bit values.

Second, introduce new memory layouts: `kPackedInt8_4W4C` and `kPackedInt8_4H4W`. The former will be used for convolution, while the latter will be used for matrix multiplication. See the inline comments for more details about these memory layouts.

Then, update `QuantizedConvolution.cpp` and `QuantizedLinear.cpp` to use the new data type and memory layouts for the packed int8 input tensor.

Differential Revision: [D82542336](https://our.internmc.facebook.com/intern/diff/D82542336/)

[ghstack-poisoned]
1 parent 03f436a commit 1ca431c
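To make the packing scheme described in the commit message concrete, here is a minimal standalone sketch (not part of the commit; the low-byte-first ordering within each int32 is an assumption for illustration): 4 int8 values are packed into one int32, and 4 such int32s make up the `ivec4` texel, so each texel covers 16 int8 elements.

```cpp
// Illustrative only: pack 16 int8 values into the 4 x int32 payload of one
// "ivec4" texel. The low-byte-first ordering is an assumption for the sketch.
#include <array>
#include <cstdint>
#include <cstdio>

int32_t pack_4xint8(int8_t a, int8_t b, int8_t c, int8_t d) {
  return static_cast<int32_t>(
      (static_cast<uint32_t>(static_cast<uint8_t>(a))) |
      (static_cast<uint32_t>(static_cast<uint8_t>(b)) << 8) |
      (static_cast<uint32_t>(static_cast<uint8_t>(c)) << 16) |
      (static_cast<uint32_t>(static_cast<uint8_t>(d)) << 24));
}

int main() {
  std::array<int8_t, 16> q{};
  for (int i = 0; i < 16; ++i) {
    q[i] = static_cast<int8_t>(i - 8); // arbitrary quantized values
  }
  // One ivec4 texel: 4 packed int32s covering all 16 int8 elements.
  std::array<int32_t, 4> texel{};
  for (int i = 0; i < 4; ++i) {
    texel[i] = pack_4xint8(q[4 * i], q[4 * i + 1], q[4 * i + 2], q[4 * i + 3]);
  }
  for (int32_t word : texel) {
    std::printf("%08x\n", static_cast<uint32_t>(word));
  }
  return 0;
}
```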

File tree

7 files changed: +184 -41 lines

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 117 additions & 22 deletions
@@ -14,6 +14,21 @@
 namespace vkcompute {
 namespace api {

+/*
+ * For PackedInt8 memory layouts, ensure that the scalar type used for the
+ * tensor is kInt8x4. Otherwise, return the original scalar type.
+ */
+vkapi::ScalarType get_effective_scalar_type(
+    const vkapi::ScalarType dtype,
+    const utils::GPUMemoryLayout memory_layout) {
+  vkapi::ScalarType effective_dtype = dtype;
+  if (utils::is_packed_int8_layout(memory_layout)) {
+    VK_CHECK_COND(dtype == vkapi::kInt8x4 || dtype == vkapi::kChar);
+    effective_dtype = vkapi::kInt8x4;
+  }
+  return effective_dtype;
+}
+
 /*
  * Used to infer the sizes of a tensor that would correspond to a given
  * VulkanImage.
@@ -187,6 +202,7 @@ std::vector<int64_t> calculate_padded_sizes(

 utils::uvec3 calculate_image_extents(
     const std::vector<int64_t>& padded_sizes,
+    const utils::GPUMemoryLayout memory_layout,
     const std::vector<int64_t>& axis_map,
     const int32_t packed_dim) {
   utils::uvec3 extents({1, 1, 1});
@@ -205,6 +221,26 @@ utils::uvec3 calculate_image_extents(
     extents[axis] = utils::safe_downcast<uint32_t>(padded_sizes.at(dim));
   }

+  // For "regular" tensor dtypes, 4 elements along the packed dim are packed
+  // into one texel (4-component vectorized type). However, for packed int8
+  // memory layouts, an additional level of packing is employed where 4 int8
+  // elements are packed into one int32, and then 4 int32 are packed into each
+  // ivec4 texel.
+  if (utils::is_packed_int8_layout(memory_layout)) {
+    // Each int in the ivec4 contains 4 channels. The overall ivec4 contains
+    // data for a 1Hx4Wx4C block of the input tensor.
+    if (memory_layout == utils::kPackedInt8_4W4C) {
+      VK_CHECK_COND(packed_dim == 2);
+      extents[axis_map.at(0)] = utils::div_up(extents[axis_map.at(0)], 4u);
+    }
+    // Each int in the ivec4 contains 4 elements along the width dim. The
+    // overall ivec4 contains data for a 4Hx4W block of the input tensor.
+    else if (memory_layout == utils::kPackedInt8_4H4W) {
+      VK_CHECK_COND(packed_dim == 0);
+      extents[axis_map.at(1)] = utils::div_up(extents[axis_map.at(1)], 4u);
+    }
+  }
+
   // axis_map[3] indicates the WHCN index of the dimension used for batch
   // concatenation. Thus a double lookup is required to determine the image axis
   // used for batch concatenation.
@@ -215,6 +251,7 @@ utils::uvec3 calculate_image_extents(

   VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0);
   extents[axis_map.at(packed_dim)] /= 4;
+
   return extents;
 }
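As a concrete check of the `kPackedInt8_4W4C` extents logic above, here is a standalone sketch (illustrative only; it replays the arithmetic with plain integers rather than calling the ET-VK helpers, and assumes the default axis map where x = width, y = height, z = channels): for a 1x8x6x10 NCHW tensor, each `ivec4` texel holds a 1Hx4Wx4C block, so the expected image extents are ceil(10/4) x 6 x 8/4 = 3 x 6 x 2.

```cpp
// Illustrative only: replay the kPackedInt8_4W4C extents math for a
// 1x8x6x10 (NCHW) tensor without using the ET-VK helpers.
#include <cassert>
#include <cstdint>

int main() {
  const int64_t C = 8, H = 6, W = 10; // channels already a multiple of 4
  auto div_up_4 = [](int64_t n) { return (n + 3) / 4; };

  // Width axis: extra divide-by-4 because each int32 packs 4 W-elements.
  const int64_t extent_x = div_up_4(W); // 3
  // Height axis: unchanged.
  const int64_t extent_y = H; // 6
  // Channels axis (packed dim): padded to a multiple of 4, then divided by 4
  // because the 4 int32s of a texel span 4 channels.
  const int64_t extent_z = C / 4; // 2

  assert(extent_x == 3 && extent_y == 6 && extent_z == 2);
  // Each of the 3*6*2 = 36 texels stores a 1Hx4Wx4C block (16 int8 values),
  // which is enough to cover all C*H*W = 480 elements.
  assert(extent_x * extent_y * extent_z * 16 >= C * H * W);
  return 0;
}
```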

@@ -247,35 +284,72 @@ utils::uvec3 calculate_logical_limits(
  */
 utils::uvec3 calculate_logical_limits(
     const std::vector<int64_t>& sizes,
+    const utils::GPUMemoryLayout memory_layout,
     const std::vector<int64_t>& axis_map,
     const int32_t packed_dim) {
   return calculate_logical_limits(
       calculate_image_extents(
-          calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim),
+          calculate_padded_sizes(sizes, packed_dim),
+          memory_layout,
+          axis_map,
+          packed_dim),
       axis_map);
 }

 int64_t calculate_gpu_buffer_numel(
+    const std::vector<int64_t>& sizes,
+    const utils::GPUMemoryLayout memory_layout,
+    const vkapi::ScalarType dtype) {
+  size_t numel;
+
+  // Mirrors the logic in calculate_image_extents for packed int8 memory layouts
+  if (dtype == vkapi::kInt8x4) {
+    VK_CHECK_COND(utils::is_packed_int8_layout(memory_layout));
+    std::vector<int64_t> blocks_in_dim =
+        flip_and_unsqueeze<int64_t>(sizes, kTensorSizes, 0);
+    // Each ivec4 contains data for a 1Hx4Wx4C block of the input
+    if (memory_layout == utils::kPackedInt8_4W4C) {
+      blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]);
+      blocks_in_dim[2] = utils::div_up_4(blocks_in_dim[2]);
+    }
+    // Each ivec4 contains data for a 4Hx4W block of the input
+    else if (memory_layout == utils::kPackedInt8_4H4W) {
+      blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]);
+      blocks_in_dim[1] = utils::div_up_4(blocks_in_dim[1]);
+    } else {
+      VK_THROW("Unhandled packed int8 memory layout!");
+    }
+    // Each block is represented as an ivec4, and the base dtype of the buffer
+    // is int. Therefore, need to multiply the number of blocks by 4 to obtain
+    // the number of int elements in the data buffer.
+    numel = utils::multiply_integers(blocks_in_dim) * 4;
+  }
+  // Case for "regular" dtypes/memory layouts
+  else {
+    numel = utils::multiply_integers(sizes);
+
+    // For 8-bit types, align to the next multiple of 4. For devices that do not
+    // support 8-bit storage buffers, the tensor data will be interpreted as an
+    // array of int32 instead.
+    if (vkapi::element_size(dtype) == 1) {
+      numel = utils::align_up_4(numel);
+    }
+  }
+  return numel;
+}
+
+int64_t calculate_staging_or_gpu_buffer_numel(
     Context* const context,
     const std::vector<int64_t>& sizes,
     const utils::uvec3 image_extents,
     const utils::StorageType storage_type,
+    const utils::GPUMemoryLayout memory_layout,
     const vkapi::ScalarType dtype) {
   // For texture backed tensors, simply multiply the total number of texels by 4
   if (storage_type != utils::kBuffer) {
     return image_extents[0] * image_extents[1] * image_extents[2] * 4;
   }
-  const bool is_int8 = dtype == vkapi::kChar;
-  const bool int8_supported =
-      context->adapter_ptr()->has_full_int8_buffers_support();
-  const size_t numel = utils::multiply_integers(sizes);
-  // For int8 tensors, if the device does not support int8 buffers, then int32
-  // is used instead to represent the buffer data. Therefore the number of
-  // elements in the buffer is aligned to the next multiple of 4.
-  if (is_int8 && int8_supported) {
-    return utils::align_up_4(numel);
-  }
-  return numel;
+  return calculate_gpu_buffer_numel(sizes, memory_layout, dtype);
 }

 template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
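A standalone sketch (illustrative only; plain integer arithmetic under the WHCN dim order used by `flip_and_unsqueeze`, not the ET-VK helpers) that replays both branches of `calculate_gpu_buffer_numel` above: the packed `kPackedInt8_4H4W` case for a 10x6 matrix, and the regular 8-bit case where the element count is aligned up to a multiple of 4.

```cpp
// Illustrative only: replay the buffer-length arithmetic of
// calculate_gpu_buffer_numel without the ET-VK helpers.
#include <cassert>
#include <cstdint>

int main() {
  auto div_up_4 = [](int64_t n) { return (n + 3) / 4; };
  auto align_up_4 = [](int64_t n) { return ((n + 3) / 4) * 4; };

  // Packed case: a [H=10, W=6] matrix with the kPackedInt8_4H4W layout.
  // Each ivec4 block covers a 4Hx4W tile, so there are ceil(10/4) * ceil(6/4)
  // blocks, and each block contributes 4 int32 elements to the buffer.
  const int64_t H = 10, W = 6;
  const int64_t num_blocks = div_up_4(H) * div_up_4(W); // 3 * 2 = 6
  const int64_t packed_numel = num_blocks * 4; // 24 int32 elements (96 bytes)
  assert(packed_numel == 24);
  assert(packed_numel * 4 >= H * W); // 96 bytes cover the 60 int8 elements

  // Regular 8-bit case: a 10-element kChar tensor is padded to 12 elements so
  // the buffer can also be viewed as an int32 array on devices without full
  // int8 storage buffer support.
  assert(align_up_4(10) == 12);
  return 0;
}
```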
@@ -332,10 +406,12 @@ vkapi::VulkanImage allocate_image(
     Context* const context_ptr,
     utils::uvec3& image_extents,
     const utils::StorageType storage_type,
-    const VkFormat image_format,
+    const vkapi::ScalarType dtype,
     const bool allocate_memory) {
   vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr();

+  const VkFormat image_format = vkcompute::vkapi::to_vkformat(dtype);
+
   vkapi::ImageSampler::Properties sampler_props{
       VK_FILTER_NEAREST,
       VK_SAMPLER_MIPMAP_MODE_NEAREST,
@@ -420,6 +496,7 @@ vkapi::VulkanBuffer allocate_buffer(
 vTensorStorage::vTensorStorage(
     Context* const context,
     const utils::StorageType storage_type,
+    const utils::GPUMemoryLayout memory_layout,
     const std::vector<int64_t>& axis_map,
     const int32_t packed_dim,
     const std::vector<int64_t>& sizes,
@@ -429,20 +506,22 @@ vTensorStorage::vTensorStorage(
       storage_type_{storage_type},
       image_extents_(calculate_image_extents(
           calculate_padded_sizes(sizes, packed_dim),
+          memory_layout,
           axis_map,
           packed_dim)),
-      buffer_length_{calculate_gpu_buffer_numel(
+      buffer_length_{calculate_staging_or_gpu_buffer_numel(
           context_,
           sizes,
           image_extents_,
           storage_type,
+          memory_layout,
           dtype)},
       buffer_offset_{0},
       image_(allocate_image(
           context_,
           image_extents_,
           storage_type_,
-          to_vkformat(dtype),
+          dtype,
           allocate_memory)),
       buffer_(allocate_buffer(
           context_,
@@ -553,7 +632,7 @@ vTensor::vTensor(
     const utils::GPUMemoryLayout memory_layout,
     const bool allocate_memory,
     const utils::AxisMapLayout axis_map_layout)
-    : dtype_(dtype),
+    : dtype_(get_effective_scalar_type(dtype, memory_layout)),
       // Calculate tensor metadata
       sizes_(sizes.begin(), sizes.end()),
       packed_dim_(utils::to_packed_dim<int32_t>(memory_layout)),
@@ -576,6 +655,7 @@ vTensor::vTensor(
       storage_(std::make_shared<vTensorStorage>(
           context,
           storage_type,
+          memory_layout,
           axis_map_,
           packed_dim_,
           sizes,
@@ -785,6 +865,16 @@ vkapi::VulkanBuffer& vTensor::buffer(
 }

 utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
+  if (dtype_ == vkapi::kInt8x4) {
+    switch (packed_dim_) {
+      case WHCN::kChannelsDim:
+        return utils::kPackedInt8_4W4C;
+      case WHCN::kWidthDim:
+        return utils::kPackedInt8_4H4W;
+      default:
+        VK_THROW("Invalid packed dim for Tensor with kInt8x4 type");
+    }
+  }
   switch (packed_dim_) {
     case WHCN::kWidthDim:
       return utils::kWidthPacked;
@@ -914,8 +1004,8 @@ void vTensor::update_metadata() {
       flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_);
   uniform_data_->strides_v =
       flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_);
-  uniform_data_->logical_limits.limits =
-      calculate_logical_limits(sizes_, axis_map_, packed_dim_);
+  uniform_data_->logical_limits.limits = calculate_logical_limits(
+      sizes_, estimate_memory_layout(), axis_map_, packed_dim_);

   if (sizes_uniform_offset_ != kUniformOffsetUnset) {
     uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_);
@@ -942,11 +1032,15 @@ void vTensor::update_metadata() {
 }

 void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
+  utils::GPUMemoryLayout est_memory_layout = estimate_memory_layout();
   if (storage_type() != utils::kBuffer) {
     // For texture storage check that the current texture is large enough for
     // the new sizes of the tensor.
     utils::uvec3 virtual_extents = calculate_image_extents(
-        calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_);
+        calculate_padded_sizes(sizes_, packed_dim_),
+        est_memory_layout,
+        axis_map_,
+        packed_dim_);

     bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0];
     valid_resize =
@@ -958,9 +1052,10 @@ void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
         valid_resize,
         "tensor sizes requires a larger texture than the current one.");
   } else {
-    // For buffer storage check that the current buffer is large enough for the
-    // new sizes of the tensor.
-    int64_t numel = utils::multiply_integers(sizes);
+    // For buffer storage check that the current buffer is large enough for
+    // the new sizes of the tensor.
+    int64_t numel =
+        calculate_gpu_buffer_numel(sizes_, est_memory_layout, dtype_);
     bool valid_resize =
         numel + storage_->buffer_offset_ <= storage_->buffer_length_;
     VK_CHECK_COND(

backends/vulkan/runtime/api/containers/Tensor.h

Lines changed: 1 addition & 0 deletions
@@ -99,6 +99,7 @@ class vTensorStorage final {
   vTensorStorage(
       Context* context,
       const utils::StorageType storage_type,
+      const utils::GPUMemoryLayout memory_layout,
       const std::vector<int64_t>& axis_map,
      const int32_t packed_dim,
      const std::vector<int64_t>& sizes,

backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp

Lines changed: 3 additions & 7 deletions
@@ -564,16 +564,12 @@ void quantized_conv2d_impl(
   ValueRef packed_weight_sums = prepack_standard(
       graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked);

-  // Allocate quantized + packed im2col matrix for input
-  const int64_t num_blocks_M = utils::div_up_4(input_im2col_sizes.at(0));
-  const int64_t num_blocks_K = utils::div_up_4(input_im2col_sizes.at(1));
-
   TmpTensor input_int_im2col(
       &graph,
-      {num_blocks_M, num_blocks_K * 4},
-      vkapi::kInt,
+      input_im2col_sizes,
+      vkapi::kInt8x4,
       utils::kBuffer,
-      utils::kWidthPacked);
+      utils::kPackedInt8_4H4W);

   add_quantize_and_pack_im2col_node(
       graph,

backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp

Lines changed: 3 additions & 11 deletions
@@ -802,20 +802,12 @@ void quantized_linear_impl(
       graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked);

   // Allocate temporary tensor to store quantized and packed input
-
-  int64_t num_blocks_M, num_blocks_K;
-  std::tie(num_blocks_M, num_blocks_K) =
-      get_quantized_input_num_blocks(graph, fp_input);
-
-  const int64_t int_input_height = num_blocks_M;
-  const int64_t int_input_width = num_blocks_K * 4;
-
   TmpTensor packed_int_input(
       &graph,
-      {int_input_height, int_input_width},
-      vkapi::kInt,
+      graph.sizes_of(fp_input),
+      vkapi::kInt8x4,
       utils::kBuffer,
-      utils::kWidthPacked);
+      utils::kPackedInt8_4H4W);

   // Non dynamically quantized input case
   if (!input_quant_config.is_dynamic) {
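The call-site block math removed above is now derived from the memory layout inside the tensor; a standalone sketch (illustrative only) checking that the buffer implied by `kInt8x4` + `kPackedInt8_4H4W` contains the same number of int32 elements as the old `{num_blocks_M, num_blocks_K * 4}` int tensor:

```cpp
// Illustrative only: for an [M, K] input, the old call site allocated an int
// tensor of shape {num_blocks_M, num_blocks_K * 4}; the new kInt8x4 +
// kPackedInt8_4H4W tensor implies the same number of int32 elements.
#include <cassert>
#include <cstdint>

int main() {
  auto div_up_4 = [](int64_t n) { return (n + 3) / 4; };

  const int64_t M = 37, K = 70; // arbitrary example sizes
  const int64_t num_blocks_M = div_up_4(M); // 10
  const int64_t num_blocks_K = div_up_4(K); // 18

  // Old allocation: {num_blocks_M, num_blocks_K * 4} int32 elements.
  const int64_t old_numel = num_blocks_M * (num_blocks_K * 4);
  // New allocation: one ivec4 (4 int32s) per 4Hx4W block of the [M, K] input.
  const int64_t new_numel = num_blocks_M * num_blocks_K * 4;

  assert(old_numel == new_numel);
  return 0;
}
```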
backends/vulkan/runtime/utils/StorageUtils.cpp

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/utils/StorageUtils.h>
+
+namespace vkcompute {
+namespace utils {
+
+bool is_packed_int8_layout(const GPUMemoryLayout layout) {
+  switch (layout) {
+    case kPackedInt8_4W4C:
+    case kPackedInt8_4H4W:
+      return true;
+    default:
+      return false;
+  }
+}
+
+} // namespace utils
+} // namespace vkcompute
