Skip to content

Commit 425989f

Browse files
committed
Update on "[ET-VK] Replace Uniform buffers with push constants for view op"
This diff replaces uniform buffers with push constants for the view op in the Vulkan backend of ExecuTorch. The changes include updating the GLSL code to use push constants instead of uniform buffers, and updating the C++ code to pass the sizes as push constants to the shader. Differential Revision: [D66733658](https://our.internmc.facebook.com/intern/diff/D66733658/) [ghstack-poisoned]
2 parents 8d94305 + ad01ffa commit 425989f

File tree

21 files changed

+379
-103
lines changed

21 files changed

+379
-103
lines changed

.ci/scripts/setup-vulkan-linux-deps.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ install_swiftshader() {
2727

2828
install_vulkan_sdk() {
2929
VULKAN_SDK_VERSION=$1
30-
_vulkan_sdk_url="https://sdk.lunarg.com/sdk/download/${VULKAN_SDK_VERSION}/linux/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.gz"
30+
_vulkan_sdk_url="https://sdk.lunarg.com/sdk/download/${VULKAN_SDK_VERSION}/linux/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.xz"
3131

3232
_vulkan_sdk_dir=/tmp/vulkansdk
3333
mkdir -p $_vulkan_sdk_dir
@@ -37,12 +37,12 @@ install_vulkan_sdk() {
3737
curl --silent --show-error --location --fail --retry 3 \
3838
--output "${_tmp_archive}" "${_vulkan_sdk_url}"
3939

40-
tar -C "${_vulkan_sdk_dir}" -xzf "${_tmp_archive}"
40+
tar -C "${_vulkan_sdk_dir}" -xJf "${_tmp_archive}"
4141

4242
export PATH="${PATH}:${_vulkan_sdk_dir}/${VULKAN_SDK_VERSION}/x86_64/bin/"
4343
}
4444

45-
VULKAN_SDK_VERSION="1.2.198.1"
45+
VULKAN_SDK_VERSION="1.3.296.0"
4646

4747
install_swiftshader
4848
install_vulkan_sdk "${VULKAN_SDK_VERSION}"

backends/arm/quantizer/quantization_annotation/generic_annotator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
torch.ops.aten.tile.default,
5454
torch.ops.aten.flip.default,
5555
torch.ops.aten.cat.default,
56+
torch.ops.aten.concatenate.default,
5657
torch.ops.aten.stack.default,
5758
torch.ops.aten.chunk.default,
5859
torch.ops.aten.contiguous.default,

backends/arm/test/quantizer/test_generic_annotater.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,10 @@ def test_flip(self):
8686
self.check_annotation(
8787
SingleOpModel(torch.flip, (torch.randn(2, 4),), dims=(0, 1)),
8888
)
89+
90+
def test_concat(self):
91+
self.check_annotation(
92+
SingleOpModel(
93+
torch.concatenate, ((torch.randn(2, 3), torch.randn(2, 3)),), dim=0
94+
),
95+
)

backends/cadence/fusion_g3/operators/op_quantize.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,7 @@ Tensor& quantize_per_tensor_out(
570570
err == torch::executor::Error::Ok,
571571
"Failed to resize out Tensor in quantize_per_tensor_out");
572572

573-
check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out);
573+
// check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out);
574574

575575
float scale_data = (float)scale;
576576
int zero_point_data = (int)zero_point;
@@ -696,7 +696,7 @@ Tensor& quantize_per_channel_out(
696696
zero_point.numel(),
697697
input.size(axis));
698698

699-
check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out);
699+
// check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out);
700700

701701
const double* scale_dt = scale.const_data_ptr<double>();
702702
const int64_t* zero_point_dt = zero_point.const_data_ptr<int64_t>();

backends/vulkan/runtime/api/Context.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,9 @@ void Context::register_shader_dispatch(
119119
const vkapi::DescriptorSet& descriptors,
120120
vkapi::PipelineBarrier& pipeline_barrier,
121121
const vkapi::ShaderInfo& shader_descriptor,
122-
const utils::uvec3& global_workgroup_size) {
122+
const utils::uvec3& global_workgroup_size,
123+
const void* push_constants_data,
124+
const uint32_t push_constants_size) {
123125
// Adjust the global workgroup size based on the output tile size
124126
uint32_t global_wg_w = utils::div_up(
125127
global_workgroup_size[0u], shader_descriptor.out_tile_size[0u]);
@@ -145,6 +147,15 @@ void Context::register_shader_dispatch(
145147
cmd_.bind_descriptors(descriptors.get_bind_handle());
146148
cmd_.insert_barrier(pipeline_barrier);
147149

150+
if (push_constants_size > 0 && push_constants_data != nullptr) {
151+
const VkDescriptorSetLayout shader_layout =
152+
shader_layout_cache().retrieve(shader_descriptor.kernel_layout);
153+
const VkPipelineLayout pipeline_layout =
154+
pipeline_layout_cache().retrieve(shader_layout);
155+
cmd_.set_push_constants(
156+
pipeline_layout, push_constants_data, push_constants_size);
157+
}
158+
148159
cmd_.dispatch(effective_global_wg);
149160
}
150161

backends/vulkan/runtime/api/Context.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,9 @@ class Context final {
200200
const vkapi::DescriptorSet&,
201201
vkapi::PipelineBarrier&,
202202
const vkapi::ShaderInfo&,
203-
const utils::uvec3&);
203+
const utils::uvec3&,
204+
const void* = nullptr,
205+
const uint32_t = 0);
204206

205207
void register_blit(
206208
vkapi::PipelineBarrier&,

backends/vulkan/runtime/api/containers/StagingBuffer.h

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@ class StagingBuffer final {
2323
private:
2424
Context* context_p_;
2525
vkapi::ScalarType dtype_;
26-
size_t numel_;
27-
size_t nbytes_;
2826
vkapi::VulkanBuffer vulkan_buffer_;
2927

3028
void* mapped_data_;
@@ -36,10 +34,8 @@ class StagingBuffer final {
3634
const size_t numel)
3735
: context_p_(context_p),
3836
dtype_(dtype),
39-
numel_(numel),
40-
nbytes_(element_size(dtype_) * numel_),
41-
vulkan_buffer_(
42-
context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)),
37+
vulkan_buffer_(context_p_->adapter_ptr()->vma().create_staging_buffer(
38+
element_size(dtype_) * numel)),
4339
mapped_data_(nullptr) {}
4440

4541
StagingBuffer(const StagingBuffer&) = delete;
@@ -68,15 +64,15 @@ class StagingBuffer final {
6864
}
6965

7066
inline size_t numel() {
71-
return numel_;
67+
return nbytes() / element_size(dtype_);
7268
}
7369

7470
inline size_t nbytes() {
75-
return nbytes_;
71+
return vulkan_buffer_.mem_size();
7672
}
7773

7874
inline void copy_from(const void* src, const size_t nbytes) {
79-
VK_CHECK_COND(nbytes <= nbytes_);
75+
VK_CHECK_COND(nbytes <= this->nbytes());
8076
memcpy(data(), src, nbytes);
8177
vmaFlushAllocation(
8278
vulkan_buffer_.vma_allocator(),
@@ -86,7 +82,7 @@ class StagingBuffer final {
8682
}
8783

8884
inline void copy_to(void* dst, const size_t nbytes) {
89-
VK_CHECK_COND(nbytes <= nbytes_);
85+
VK_CHECK_COND(nbytes <= this->nbytes());
9086
vmaInvalidateAllocation(
9187
vulkan_buffer_.vma_allocator(),
9288
vulkan_buffer_.allocation(),
@@ -96,7 +92,7 @@ class StagingBuffer final {
9692
}
9793

9894
inline void set_staging_zeros() {
99-
memset(data(), 0, nbytes_);
95+
memset(data(), 0, nbytes());
10096
}
10197
};
10298

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 69 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
*/
88

99
#include <executorch/backends/vulkan/runtime/api/containers/Tensor.h>
10+
#include <cstring>
1011

1112
namespace vkcompute {
1213
namespace api {
@@ -446,11 +447,10 @@ vTensor::vTensor(
446447
dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)),
447448
axis_map_(default_axis_map()),
448449
strides_(calculate_strides(sizes, dim_order_)),
449-
numel_(utils::multiply_integers(sizes_)),
450450
padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)},
451-
unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
451+
unsqueezed_strides_{
452+
unsqueeze_strides(strides_, utils::multiply_integers(sizes_))},
452453
padded_numel_(utils::multiply_integers(padded_sizes_)),
453-
logical_limits_{{0, 0, 0}},
454454
uniforms_(),
455455
// Utility Uniform Buffers that can be passed to shaders as arguments
456456
uniforms_size_(0),
@@ -467,6 +467,11 @@ vTensor::vTensor(
467467
padded_sizes_,
468468
dtype_,
469469
allocate_memory) {
470+
uniform_data_ = std::make_shared<UniformData>(UniformData{
471+
sizes_,
472+
unsqueezed_strides_,
473+
{{0, 0, 0}},
474+
static_cast<size_t>(utils::multiply_integers(sizes_))});
470475
VK_CHECK_COND(
471476
dim_order_is_valid(dim_order_), "computed dim order is invalid");
472477

@@ -494,11 +499,9 @@ vTensor::vTensor(
494499
dim_order_(),
495500
axis_map_(default_axis_map()),
496501
strides_(),
497-
numel_(utils::multiply_integers(sizes_)),
498502
padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_)),
499503
unsqueezed_strides_(),
500504
padded_numel_(utils::multiply_integers(padded_sizes_)),
501-
logical_limits_(),
502505
uniforms_(),
503506
// Utility Uniform Buffers that can be passed to shaders as arguments
504507
uniforms_size_(0),
@@ -508,6 +511,11 @@ vTensor::vTensor(
508511
logical_limits_uniform_offset_(kUniformOffsetUnset),
509512
// Construct Tensor storage
510513
storage_(context, image) {
514+
uniform_data_ = std::make_shared<UniformData>(UniformData{
515+
sizes_,
516+
{0, 0, 0, 0},
517+
{{0, 0, 0}},
518+
static_cast<size_t>(utils::multiply_integers(sizes_))});
511519
set_logical_limits(storage_.image_extents_);
512520
}
513521

@@ -519,13 +527,11 @@ vTensor::vTensor(vTensor& other)
519527
dim_order_(other.dim_order_.begin(), other.dim_order_.end()),
520528
axis_map_(other.axis_map_.begin(), other.axis_map_.end()),
521529
strides_(other.strides_.begin(), other.strides_.end()),
522-
numel_(other.numel_),
523530
padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()},
524531
unsqueezed_strides_{
525532
other.unsqueezed_strides_.begin(),
526533
other.unsqueezed_strides_.end()},
527534
padded_numel_(other.padded_numel_),
528-
logical_limits_{other.logical_limits_},
529535
uniforms_(),
530536
// Empty initialize Utility Uniform Buffers
531537
uniforms_size_(0),
@@ -534,7 +540,9 @@ vTensor::vTensor(vTensor& other)
534540
numel_uniform_offset_(kUniformOffsetUnset),
535541
logical_limits_uniform_offset_(kUniformOffsetUnset),
536542
// Copy Tensor storage
537-
storage_(other.storage_) {}
543+
storage_(other.storage_) {
544+
uniform_data_ = std::make_shared<UniformData>(*other.get_uniform_data());
545+
}
538546

539547
vTensor::vTensor(
540548
vTensor& other,
@@ -548,11 +556,10 @@ vTensor::vTensor(
548556
dim_order_(dim_order.begin(), dim_order.end()),
549557
axis_map_(default_axis_map()),
550558
strides_(calculate_strides(sizes_, dim_order_)),
551-
numel_(utils::multiply_integers(sizes_)),
552559
padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)},
553-
unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
560+
unsqueezed_strides_{
561+
unsqueeze_strides(strides_, utils::multiply_integers(sizes_))},
554562
padded_numel_(utils::multiply_integers(padded_sizes_)),
555-
logical_limits_(other.logical_limits_),
556563
uniforms_(),
557564
// Empty initialize Utility Uniform Buffers
558565
uniforms_size_(0),
@@ -562,14 +569,45 @@ vTensor::vTensor(
562569
logical_limits_uniform_offset_(kUniformOffsetUnset),
563570
// Copy Tensor storage
564571
storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) {
572+
uniform_data_ = std::make_shared<UniformData>(UniformData{
573+
sizes_,
574+
unsqueezed_strides_,
575+
{other.logical_limits()},
576+
static_cast<size_t>(utils::multiply_integers(sizes_))});
577+
565578
VK_CHECK_COND(
566579
dim_order_is_valid(dim_order_), "new dim order provided is invalid");
567580
VK_CHECK_COND(
568-
offset_numel + numel_ <= other.numel(),
581+
offset_numel + numel() <= other.numel(),
569582
"Tensor alias cannot access more elements than available in the original"
570583
"tensor");
571584
}
572585

586+
uint32_t vTensor::UniformData::write_attribute(
587+
void* dst,
588+
const uint32_t dst_offset,
589+
const uint32_t max_dst_size,
590+
const Attribute attr) {
591+
#define WRITE_ATTRIBUTE_CASE(enum_name, member_name) \
592+
case vTensor::Attribute::enum_name: { \
593+
VK_CHECK_COND( \
594+
(dst_offset + sizeof(member_name)) <= max_dst_size, \
595+
"Attempting to write tensor attribute outside data boundary."); \
596+
memcpy((uint8_t*)dst + dst_offset, &member_name, sizeof(member_name)); \
597+
return sizeof(member_name); \
598+
}
599+
switch (attr) {
600+
WRITE_ATTRIBUTE_CASE(SIZES, sizes_v);
601+
WRITE_ATTRIBUTE_CASE(STRIDES, strides_v);
602+
WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits);
603+
WRITE_ATTRIBUTE_CASE(NUMEL, numel);
604+
default:
605+
VK_THROW("Invalid Attribute");
606+
}
607+
#undef WRITE_ATTRIBUTE_CASE
608+
return 0;
609+
}
610+
573611
vkapi::VulkanImage& vTensor::image(
574612
vkapi::PipelineBarrier& pipeline_barrier,
575613
const vkapi::PipelineStageFlags stage) & {
@@ -601,9 +639,9 @@ vkapi::VulkanBuffer& vTensor::buffer(
601639
}
602640

603641
void vTensor::set_logical_limits(const utils::uvec3& image_extents) {
604-
logical_limits_.limits[0] = image_extents[axis_map_.at(0)];
605-
logical_limits_.limits[1] = image_extents[axis_map_.at(1)];
606-
logical_limits_.limits[2] = image_extents[axis_map_.at(2)];
642+
uniform_data_->logical_limits.limits[0] = image_extents[axis_map_.at(0)];
643+
uniform_data_->logical_limits.limits[1] = image_extents[axis_map_.at(1)];
644+
uniform_data_->logical_limits.limits[2] = image_extents[axis_map_.at(2)];
607645
}
608646

609647
utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
@@ -661,7 +699,7 @@ const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
661699
"Uniform data allocation has exceeded Tensor uniform buffer size");
662700
logical_limits_uniform_offset_ = uniforms_size_;
663701
uniforms_size_ += kSizePerUniform;
664-
uniforms_.update(logical_limits_, logical_limits_uniform_offset_);
702+
uniforms_.update(logical_limits(), logical_limits_uniform_offset_);
665703
}
666704
return vkapi::BufferBindInfo(
667705
uniforms_.buffer(), logical_limits_uniform_offset_);
@@ -677,7 +715,7 @@ const vkapi::BufferBindInfo vTensor::numel_ubo() {
677715
"Uniform data allocation has exceeded Tensor uniform buffer size");
678716
numel_uniform_offset_ = uniforms_size_;
679717
uniforms_size_ += kSizePerUniform;
680-
uniforms_.update(numel_, numel_uniform_offset_);
718+
uniforms_.update(numel(), numel_uniform_offset_);
681719
}
682720
return vkapi::BufferBindInfo(uniforms_.buffer(), numel_uniform_offset_);
683721
}
@@ -687,10 +725,10 @@ size_t vTensor::staging_buffer_numel() const {
687725
const bool int8_supported =
688726
storage_.context_->adapter_ptr()->has_full_int8_buffers_support();
689727
if (is_int8 && !int8_supported) {
690-
return utils::align_up_4(numel_);
728+
return utils::align_up_4(numel());
691729
}
692730
if (storage_type() == utils::kBuffer) {
693-
return numel_;
731+
return numel();
694732
}
695733
return padded_numel_;
696734
}
@@ -720,30 +758,32 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) {
720758

721759
void vTensor::update_metadata() {
722760
strides_ = calculate_strides(sizes_, dim_order_);
723-
numel_ = utils::multiply_integers(sizes_);
761+
uniform_data_->numel = utils::multiply_integers(sizes_);
724762

725763
padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_);
726-
unsqueezed_strides_ = unsqueeze_strides(strides_, numel_);
764+
unsqueezed_strides_ = unsqueeze_strides(strides_, numel());
727765
padded_numel_ = utils::multiply_integers(padded_sizes_);
728766

767+
// Update uniform data if it has been modified
768+
uniform_data_->sizes_v = utils::make_whcn_ivec4(sizes_);
769+
uniform_data_->strides_v = utils::make_whcn_ivec4(unsqueezed_strides_);
770+
729771
// Calculate the image extents that would have been used to allocate a texture
730772
// withthe current sizes, and use that to set the logical limits.
731773
set_logical_limits(
732774
calculate_image_extents(padded_sizes_, axis_map_, packed_dim_));
733775

734776
if (sizes_uniform_offset_ != kUniformOffsetUnset) {
735-
uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
777+
uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_);
736778
}
737779
if (unsqueezed_strides_offset_ != kUniformOffsetUnset) {
738-
uniforms_.update(
739-
utils::make_whcn_ivec4(unsqueezed_strides_),
740-
unsqueezed_strides_offset_);
780+
uniforms_.update(uniform_data_->strides_v, unsqueezed_strides_offset_);
741781
}
742782
if (numel_uniform_offset_ != kUniformOffsetUnset) {
743-
uniforms_.update(numel_, numel_uniform_offset_);
783+
uniforms_.update(numel(), numel_uniform_offset_);
744784
}
745785
if (logical_limits_uniform_offset_ != kUniformOffsetUnset) {
746-
uniforms_.update(logical_limits_, logical_limits_uniform_offset_);
786+
uniforms_.update(logical_limits(), logical_limits_uniform_offset_);
747787
}
748788
}
749789

@@ -796,6 +836,8 @@ void vTensor::virtual_clone(const vTensor& other) {
796836
dim_order_ = other.dim_order_;
797837
axis_map_ = other.axis_map_;
798838
packed_dim_ = other.packed_dim_;
839+
840+
*uniform_data_ = *other.get_uniform_data();
799841
}
800842

801843
void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {

0 commit comments

Comments
 (0)