
Commit 4d095f9

SS-JIA and ssjia authored
[ET-VK][ez] Consolidate tensor metadata calculation + buffer binding code (#13594)
Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):

* #13597
* #13596
* #13595
* __->__ #13594
* #13593
* #13600
* #13599
* #13598

Differential Revision: [D80800085](https://our.internmc.facebook.com/intern/diff/D80800085)

Co-authored-by: ssjia <[email protected]>
1 parent 818d095 commit 4d095f9

File tree

3 files changed: +255 -201 lines


backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 75 additions & 163 deletions
@@ -14,6 +14,10 @@
 namespace vkcompute {
 namespace api {
 
+/*
+ * Used to infer the sizes of a tensor that would correspond to a given
+ * VulkanImage.
+ */
 std::vector<int64_t> calculate_sizes(
     const vkapi::VulkanImage& image,
     const utils::GPUMemoryLayout memory_layout) {
@@ -143,58 +147,19 @@ bool dim_order_is_valid(const std::vector<int64_t>& dim_order) {
   return sum == n * (n + 1) / 2;
 }
 
-/*
- * Applies the following transformations to a tensor's dim_order vector:
- * 1. Reverse the order of elements so that the fastest moving dimensions are
- *    first.
- * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the
- *    width dimension, 1 represents the height dimension, and 2 represents the
- *    channels dimension.
- * 3. Unsqueeze the dim_order vector to the next multiple of 4.
- *
- * These transformations make it easier to use the dim order in a compute shader
- */
-std::vector<int64_t> create_whcn_dim_order(
-    const std::vector<int64_t>& dim_order) {
-  size_t ndim = dim_order.size();
-  std::vector<int64_t> whcn_order(ndim);
-
-  // Convert from NCHW to WHCN index, and flip the dim order so that the fastest
-  // moving dimension is first.
-  // example: {     1,     2,        0} -> {       2,     0,      1}
-  //          {height, width, channels} -> {channels, width, height}
-  for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim;
-       ++whcn_i, --nchw_i) {
-    whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i);
-  }
-
-  // Unsqueeze to the next multiple of 4
-  size_t ndim_up4 = utils::align_up_4(ndim);
-  whcn_order.resize(ndim_up4);
-
-  // Append unsqueezed dimensions
-  for (size_t i = ndim; i < ndim_up4; ++i) {
-    whcn_order.at(i) = i;
-  }
-
-  return whcn_order;
-}
-
-std::vector<int64_t> unsqueeze_strides(
-    const std::vector<int64_t>& strides,
-    const int64_t numel) {
-  const size_t ndim = strides.size();
-  const size_t ndim_up4 = utils::align_up_4(strides.size());
-  std::vector<int64_t> unsqueezed_strides(ndim_up4);
-  for (int32_t i = 1; i <= ndim; ++i) {
-    int64_t dim_stride = strides.at(ndim - i);
-    unsqueezed_strides.at(ndim_up4 - i) = dim_stride;
-  }
-
-  for (int32_t i = ndim + 1; i <= ndim_up4; ++i) {
-    unsqueezed_strides.at(ndim_up4 - i) = numel;
-  }
-  return unsqueezed_strides;
+utils::ivec4 flip_and_unsqueeze_ivec4(
+    const std::vector<int64_t>& tensor_metadata,
+    const vTensor::Attribute metadata_type,
+    const size_t numel) {
+  VK_CHECK_COND(tensor_metadata.size() <= 4);
+  std::vector<int32_t> flipped_metadata =
+      flip_and_unsqueeze<int32_t>(tensor_metadata, metadata_type, numel);
+  return {
+      flipped_metadata.at(0),
+      flipped_metadata.at(1),
+      flipped_metadata.at(2),
+      flipped_metadata.at(3),
+  };
 }
 
 std::vector<int64_t> calculate_padded_sizes(
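
Note: the templated flip_and_unsqueeze helper called above is declared outside this file (in one of the other two changed files), so its body is not visible in this diff. For reference, here is a standalone sketch of the dim-order transformation it subsumes, following the removed create_whcn_dim_order above; align_up_4 is stubbed locally so the sketch is self-contained.

#include <cstdint>
#include <vector>

// Stand-in for utils::align_up_4, included only so the sketch compiles on its own.
static size_t align_up_4(size_t n) {
  return (n + 3) & ~size_t(3);
}

// Mirrors the removed create_whcn_dim_order: reverse so the fastest moving
// dimension comes first, convert NCHW indices to WHCN indices, then pad the
// vector out to the next multiple of 4 with identity indices.
std::vector<int64_t> whcn_dim_order_sketch(
    const std::vector<int64_t>& dim_order) {
  const size_t ndim = dim_order.size();
  std::vector<int64_t> whcn_order(ndim);
  for (size_t whcn_i = 0, nchw_i = ndim - 1; whcn_i < ndim;
       ++whcn_i, --nchw_i) {
    whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i);
  }
  const size_t ndim_up4 = align_up_4(ndim);
  whcn_order.resize(ndim_up4);
  for (size_t i = ndim; i < ndim_up4; ++i) {
    whcn_order.at(i) = i;  // padded dims keep their own index
  }
  return whcn_order;  // e.g. {1, 2, 0} -> {2, 0, 1, 3}
}
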
@@ -309,7 +274,8 @@ int64_t calculate_gpu_buffer_numel(
   return numel;
 }
 
-int32_t pack_into_int32(const std::vector<int64_t>& vec, const int32_t extra) {
+template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
+int32_t pack_into_int32(const std::vector<T>& vec, const int32_t extra) {
   int32_t packed = static_cast<int32_t>(
       vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) +
       (extra << 16));
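
The packing layout is unchanged by the templating: each of the four metadata values occupies a 4-bit nibble and the extra value starts at bit 16. A worked example with illustrative values:

// Packing a WHCN dim order {2, 0, 1, 3} with extra = 0:
//   2 + (0 << 4) + (1 << 8) + (3 << 12) + (0 << 16)
//     = 2 + 0 + 256 + 12288 + 0
//     = 12546 = 0x3102
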
@@ -322,22 +288,24 @@ int32_t create_hashed_layout(
     const int32_t packed_dim,
     const utils::StorageType storage_type) {
   if (storage_type == utils::kBuffer) {
-    return pack_into_int32(create_whcn_dim_order(dim_order), 0);
+    return pack_into_int32(
+        flip_and_unsqueeze<int64_t>(dim_order, kTensorDimOrder, 0), 0);
   }
   return pack_into_int32(axis_map, packed_dim);
 }
 
 size_t calculate_max_ubo_nbytes(
-    const size_t nbytes_per_ubo,
+    const size_t min_nbytes_per_ubo,
     const utils::StorageType storage_type) {
-  // For texture backed tensors, the metadata fields needed are:
-  // sizes, logical limits
-  size_t max_metadata_field_count = 2u;
+  size_t ivec4_ubo_nbytes = utils::align_up(size_t(16), min_nbytes_per_ubo);
+  size_t uvec3_ubo_nbytes = utils::align_up(size_t(12), min_nbytes_per_ubo);
+  size_t int32_ubo_nbytes = utils::align_up(size_t(4), min_nbytes_per_ubo);
   if (storage_type == utils::kBuffer) {
     // sizes, strides, dim order, numel
-    max_metadata_field_count = 4u;
+    return 3 * ivec4_ubo_nbytes + int32_ubo_nbytes;
   }
-  return max_metadata_field_count * nbytes_per_ubo;
+  // sizes, logical limits
+  return ivec4_ubo_nbytes + uvec3_ubo_nbytes;
 }
 
 //
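
With the new per-field sizing, the reserved UBO space depends on the device's minimum UBO alignment rather than on a flat per-field count. Worked examples (the alignment values below are illustrative, not taken from this diff):

// min_ubo_alignment = 256: every field rounds up to 256 bytes, so
//   kBuffer    -> 3 * 256 + 256 = 1024 bytes (sizes, strides, dim order, numel)
//   kTexture3D ->     256 + 256 =  512 bytes (sizes, logical limits)
// min_ubo_alignment = 16: fields round up to 16 bytes, so the totals are
//   kBuffer    -> 3 * 16 + 16 = 64 bytes
//   kTexture3D ->     16 + 16 = 32 bytes
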
@@ -595,8 +563,9 @@ vTensor::vTensor(
           packed_dim_,
           storage_type)),
       // Related to tensor metadata UBOs
-      nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
-      max_ubo_nbytes_{calculate_max_ubo_nbytes(nbytes_per_ubo_, storage_type)},
+      min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
+      max_ubo_nbytes_{
+          calculate_max_ubo_nbytes(min_nbytes_per_ubo_, storage_type)},
       uniforms_(),
       // Construct Tensor storage
       storage_(std::make_shared<vTensorStorage>(
@@ -607,23 +576,13 @@ vTensor::vTensor(
           sizes,
           dtype_,
           allocate_memory)) {
-  // Derived metadata
-  std::vector<int64_t> whcn_dim_order(4, 0);
-  std::vector<int64_t> unsqueezed_strides(4, 0);
-  // Only calculate derived metadata if needed for the desired storage type.
-  // Note that logical limits may be used by buffer storage as well in order to
-  // set global work group sizes for some compute shaders.
-  if (storage_type == utils::kBuffer) {
-    whcn_dim_order = create_whcn_dim_order(dim_order_);
-    unsqueezed_strides = unsqueeze_strides(strides_, numel_);
-  }
-
   uniform_data_ = std::make_shared<UniformData>(UniformData{
+      numel_,
       sizes_,
-      whcn_dim_order,
-      unsqueezed_strides,
-      calculate_logical_limits(storage_->image_extents_, axis_map_),
-      numel_});
+      dim_order_,
+      strides_,
+      calculate_logical_limits(storage_->image_extents_, axis_map_)});
+
   VK_CHECK_COND(
       dim_order_is_valid(dim_order_), "computed dim order is invalid");
 }
@@ -648,18 +607,18 @@ vTensor::vTensor(
           packed_dim_,
           utils::kTexture3D)),
       // Related to tensor metadata UBOs
-      nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
+      min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
       max_ubo_nbytes_{
-          calculate_max_ubo_nbytes(nbytes_per_ubo_, utils::kTexture3D)},
+          calculate_max_ubo_nbytes(min_nbytes_per_ubo_, utils::kTexture3D)},
       uniforms_(),
       // Construct Tensor storage
       storage_(std::make_shared<vTensorStorage>(context, image)) {
   uniform_data_ = std::make_shared<UniformData>(UniformData{
+      numel_,
       sizes_,
       {0, 0, 0, 0},
       {0, 0, 0, 0},
-      calculate_logical_limits(storage_->image_extents_, axis_map_),
-      numel_});
+      calculate_logical_limits(storage_->image_extents_, axis_map_)});
 }
 
 vTensor::vTensor(vTensor& other)
@@ -672,7 +631,7 @@ vTensor::vTensor(vTensor& other)
       strides_(other.strides_.begin(), other.strides_.end()),
       numel_(other.numel_),
       hashed_layout_(other.hashed_layout_),
-      nbytes_per_ubo_{other.nbytes_per_ubo_},
+      min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
       max_ubo_nbytes_{other.max_ubo_nbytes_},
       uniforms_(),
       // Copy Tensor storage
@@ -697,22 +656,35 @@ vTensor::vTensor(
           axis_map_,
           packed_dim_,
           other.storage_type())),
-      nbytes_per_ubo_{other.nbytes_per_ubo_},
+      min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
       max_ubo_nbytes_{other.max_ubo_nbytes_},
       uniforms_(),
       // Copy Tensor storage
       storage_(other.storage_) {
   uniform_data_ = std::make_shared<UniformData>(UniformData{
+      static_cast<size_t>(utils::multiply_integers(sizes_)),
       sizes_,
-      create_whcn_dim_order(dim_order_),
-      unsqueeze_strides(strides_, numel_),
-      other.logical_limits(),
-      static_cast<size_t>(utils::multiply_integers(sizes_))});
+      dim_order_,
+      strides_,
+      other.logical_limits()});
 
   VK_CHECK_COND(
       dim_order_is_valid(dim_order_), "new dim order provided is invalid");
 }
 
+vTensor::UniformData::UniformData(
+    const size_t numel_ll,
+    const std::vector<int64_t>& sizes,
+    const std::vector<int64_t>& dim_order,
+    const std::vector<int64_t>& strides,
+    const utils::uvec3& limits)
+    : numel(utils::safe_downcast<int32_t>(numel_ll)),
+      sizes_v(flip_and_unsqueeze_ivec4(sizes, kTensorSizes, numel_ll)),
+      dim_order_v(
+          flip_and_unsqueeze_ivec4(dim_order, kTensorDimOrder, numel_ll)),
+      strides_v(flip_and_unsqueeze_ivec4(strides, kTensorStrides, numel_ll)),
+      logical_limits(limits) {}
+
 uint32_t vTensor::UniformData::write_attribute(
     void* dst,
     const uint32_t dst_offset,
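
Taken together with the constructor changes above, the derivation of the shader-facing metadata now lives in one place. A short summary of the flow, as shown in this diff:

// - vTensor constructors pass raw NCHW metadata: numel_, sizes_, dim_order_,
//   strides_, plus the logical limits.
// - The UniformData constructor calls flip_and_unsqueeze_ivec4 on each vector,
//   so the WHCN flip and the padding to 4 lanes happen once, not per call site.
// - numel is safe_downcast to int32_t once and stored next to the ivec4 fields.
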
@@ -727,11 +699,11 @@ uint32_t vTensor::UniformData::write_attribute(
     return sizeof(member_name); \
   }
   switch (attr) {
+    WRITE_ATTRIBUTE_CASE(NUMEL, numel);
     WRITE_ATTRIBUTE_CASE(SIZES, sizes_v);
-    WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, whcn_dim_order_v);
+    WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, dim_order_v);
     WRITE_ATTRIBUTE_CASE(STRIDES, strides_v);
     WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits);
-    WRITE_ATTRIBUTE_CASE(NUMEL, numel);
     default:
       VK_THROW("Invalid Attribute");
   }
@@ -806,84 +778,25 @@ size_t vTensor::get_max_ubo_nbytes(const size_t nbytes_per_ubo) const {
 }
 
 const vkapi::BufferBindInfo vTensor::sizes_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (sizes_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    sizes_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), sizes_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(&sizes_uniform_offset_, uniform_data_->sizes_v);
 }
 
 const vkapi::BufferBindInfo vTensor::dim_order_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (dim_order_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    dim_order_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(
-        uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), dim_order_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(
+      &dim_order_uniform_offset_, uniform_data_->dim_order_v);
 }
 
 const vkapi::BufferBindInfo vTensor::strides_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (strides_uniform_offset == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    strides_uniform_offset = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(uniform_data_->strides_v, strides_uniform_offset);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), strides_uniform_offset, nbytes_per_ubo_);
+  return metadata_ubo_impl(&strides_uniform_offset, uniform_data_->strides_v);
 }
 
 const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (logical_limits_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    logical_limits_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(logical_limits(), logical_limits_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), logical_limits_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(
+      &logical_limits_uniform_offset_, uniform_data_->logical_limits);
 }
 
 const vkapi::BufferBindInfo vTensor::numel_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (numel_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    numel_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(numel(), numel_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), numel_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(&numel_uniform_offset_, uniform_data_->numel);
 }
 
 VkMemoryRequirements vTensor::get_memory_requirements() const {
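
The five getters above now delegate to a single metadata_ubo_impl helper, which is not defined in this file (it presumably lives in the accompanying header, one of the other two changed files). Below is a plausible sketch of its shape, reconstructed from the five removed bodies; the parameter names, the offset type, and the per-field size computation are assumptions, not the actual implementation.

// Sketch only: the shared lazily-allocate-then-bind pattern that the removed
// bodies repeated, parameterized over the cached offset and the payload.
template <typename T>
const vkapi::BufferBindInfo vTensor::metadata_ubo_impl(
    uint32_t* offset_ptr,
    const T& payload) {
  // Lazily create the shared params buffer on first use.
  if (!uniforms_.buffer()) {
    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
  }
  // Each field occupies an alignment-padded slice of the shared buffer.
  const size_t ubo_nbytes = utils::align_up(sizeof(T), min_nbytes_per_ubo_);
  if (*offset_ptr == kUniformOffsetUnset) {
    VK_CHECK_COND(
        (uniforms_size_ + ubo_nbytes) <= max_ubo_nbytes_,
        "Uniform data allocation has exceeded Tensor uniform buffer size");
    *offset_ptr = uniforms_size_;
    uniforms_size_ += ubo_nbytes;
    uniforms_.update(payload, *offset_ptr);
  }
  return vkapi::BufferBindInfo(uniforms_.buffer(), *offset_ptr, ubo_nbytes);
}
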
@@ -936,22 +849,21 @@ void vTensor::update_metadata() {
   strides_ = calculate_strides(sizes_, dim_order_);
 
   // Update uniform data if it has been modified
-  uniform_data_->numel = numel_;
-  uniform_data_->sizes_v = utils::make_whcn_ivec4(sizes_);
-  uniform_data_->whcn_dim_order_v =
-      utils::make_ivec4(create_whcn_dim_order(dim_order_));
-  uniform_data_->strides_v =
-      utils::make_whcn_ivec4(unsqueeze_strides(strides_, numel_));
   uniform_data_->numel = utils::safe_downcast<int32_t>(numel_);
+  uniform_data_->sizes_v =
+      flip_and_unsqueeze_ivec4(sizes_, kTensorSizes, numel_);
+  uniform_data_->dim_order_v =
+      flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_);
+  uniform_data_->strides_v =
+      flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_);
   uniform_data_->logical_limits.limits =
       calculate_logical_limits(sizes_, axis_map_, packed_dim_);
 
   if (sizes_uniform_offset_ != kUniformOffsetUnset) {
     uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_);
   }
   if (dim_order_uniform_offset_ != kUniformOffsetUnset) {
-    uniforms_.update(
-        uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_);
+    uniforms_.update(uniform_data_->dim_order_v, dim_order_uniform_offset_);
   }
   if (strides_uniform_offset != kUniformOffsetUnset) {
     uniforms_.update(uniform_data_->strides_v, strides_uniform_offset);
