
Commit 9b4242e

Author: ssjia
[ET-VK] Introduce BufferMetadata GLSL struct to abstract tensor layout
Pull Request resolved: #13595

As title; introduce a consolidated metadata UBO for buffer storage that can be used to abstract tensor indexing operations for buffer-backed tensors. The new metadata UBO can represent tensors of up to 8 dimensions. This upper limit is hardcoded, but it can be increased later without updating callsites, since everything is abstracted behind the BufferMetadata struct.

Update the following ops to use the new metadata UBO:

* staging shaders (nchw_to_buffer and buffer_to_nchw)
* binary op

@imported-using-ghimport

Differential Revision: [D80800082](https://our.internmc.facebook.com/intern/diff/D80800082/)

ghstack-source-id: 305063054
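For reference, the shader diffs below consume the new UBO through helpers from indexing.glslh, which is not among the files shown on this page. Below is a minimal sketch of what its BufferMetadata declaration could look like, assuming the packing mirrors the C++ vTensor::BufferMetadata struct added in Tensor.h; size_at is a hypothetical helper, while numel and are_equal appear in binary_op.glsl below:

```glsl
// Sketch only: indexing.glslh is not shown in this diff, so the exact layout
// is an assumption based on the C++ struct (uint32_t sizes[8]; dim_order[8];
// strides[8]; ndim; numel).
struct BufferMetadata {
  uvec4 sizes[2];      // sizes of dims 0..7; dim i lives at sizes[i / 4][i % 4]
  uvec4 dim_order[2];  // permutation of dims, assumed outermost first
  uvec4 strides[2];    // per-dim strides into the buffer
  uint ndim;           // number of "real" (non-padded) dims
  uint numel;          // total number of elements
};

uint numel(const BufferMetadata meta) {
  return meta.numel;
}

// Hypothetical accessor for the size of a single dim.
uint size_at(const BufferMetadata meta, const uint dim) {
  return meta.sizes[dim / 4][dim % 4];
}

// True when two tensors have identical sizes, i.e. no broadcasting is needed.
// In GLSL, == on two uvec4s is true iff all components are equal.
bool are_equal(const BufferMetadata a, const BufferMetadata b) {
  return a.sizes[0] == b.sizes[0] && a.sizes[1] == b.sizes[1];
}
```

Packing the three 8-entry arrays as uvec4 pairs would keep a std140 UBO layout byte-compatible with the tightly packed C++ struct, since std140 rounds the array stride of scalar uint arrays up to 16 bytes.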
1 parent: b3af325

File tree

13 files changed: +374 -102 lines

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 50 additions & 0 deletions
@@ -566,6 +566,7 @@ vTensor::vTensor(
       max_ubo_nbytes_{
           calculate_max_ubo_nbytes(min_nbytes_per_ubo_, storage_type)},
       uniforms_(),
+      buffer_meta_(),
       // Construct Tensor storage
       storage_(std::make_shared<vTensorStorage>(
           context,
@@ -610,6 +611,7 @@ vTensor::vTensor(
       max_ubo_nbytes_{
           calculate_max_ubo_nbytes(min_nbytes_per_ubo_, utils::kTexture3D)},
       uniforms_(),
+      buffer_meta_(),
       // Construct Tensor storage
       storage_(std::make_shared<vTensorStorage>(context, image)) {
   uniform_data_ = std::make_shared<UniformData>(UniformData{
@@ -633,6 +635,7 @@ vTensor::vTensor(vTensor& other)
       min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
       max_ubo_nbytes_{other.max_ubo_nbytes_},
       uniforms_(),
+      buffer_meta_(),
       // Copy Tensor storage
       storage_(other.storage_) {
   uniform_data_ = std::make_shared<UniformData>(*other.get_uniform_data());
@@ -658,6 +661,7 @@ vTensor::vTensor(
       min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
       max_ubo_nbytes_{other.max_ubo_nbytes_},
       uniforms_(),
+      buffer_meta_(),
       // Copy Tensor storage
       storage_(other.storage_) {
   uniform_data_ = std::make_shared<UniformData>(UniformData{
@@ -710,6 +714,38 @@ uint32_t vTensor::UniformData::write_attribute(
   return 0;
 }
 
+vTensor::BufferMetadata::BufferMetadata(
+    std::vector<int64_t> src_sizes,
+    std::vector<int64_t> src_dim_order,
+    std::vector<int64_t> src_strides,
+    size_t src_numel) {
+  update(src_sizes, src_dim_order, src_strides, src_numel);
+}
+
+void vTensor::BufferMetadata::update(
+    std::vector<int64_t> src_sizes,
+    std::vector<int64_t> src_dim_order,
+    std::vector<int64_t> src_strides,
+    size_t src_numel) {
+  int32_t fixed_ndim = utils::safe_downcast<int32_t>(kTensorDimLimit);
+
+  std::vector<uint32_t> fu_sizes = flip_and_unsqueeze<uint32_t>(
+      src_sizes, kTensorSizes, src_numel, fixed_ndim);
+  std::vector<uint32_t> fu_dim_order = flip_and_unsqueeze<uint32_t>(
+      src_dim_order, kTensorDimOrder, src_numel, fixed_ndim);
+  std::vector<uint32_t> fu_strides = flip_and_unsqueeze<uint32_t>(
+      src_strides, kTensorStrides, src_numel, fixed_ndim);
+
+  for (int i = 0; i < fixed_ndim; ++i) {
+    sizes[i] = fu_sizes.at(i);
+    dim_order[i] = fu_dim_order.at(i);
+    strides[i] = fu_strides.at(i);
+  }
+
+  ndim = utils::safe_downcast<uint32_t>(src_sizes.size());
+  numel = utils::safe_downcast<uint32_t>(src_numel);
+}
+
 vkapi::VulkanImage& vTensor::image(
     vkapi::PipelineBarrier& pipeline_barrier,
     const vkapi::PipelineStageFlags stage) & {
@@ -798,6 +834,15 @@ const vkapi::BufferBindInfo vTensor::numel_ubo() {
   return metadata_ubo_impl(&numel_uniform_offset_, uniform_data_->numel);
 }
 
+const vkapi::BufferBindInfo vTensor::buffer_meta_ubo() {
+  size_t ubo_nbytes = std::min(sizeof(BufferMetadata), min_nbytes_per_ubo_);
+  if (!buffer_meta_.buffer()) {
+    BufferMetadata data(sizes_, dim_order_, strides_, numel_);
+    buffer_meta_ = ParamsBuffer(storage_->context_, data);
+  }
+  return vkapi::BufferBindInfo(buffer_meta_.buffer(), 0, ubo_nbytes);
+}
+
 VkMemoryRequirements vTensor::get_memory_requirements() const {
   switch (storage_type()) {
     case utils::kBuffer:
@@ -874,6 +919,11 @@ void vTensor::update_metadata() {
     uniforms_.update(
         uniform_data_->logical_limits.limits, logical_limits_uniform_offset_);
   }
+
+  if (buffer_meta_.buffer()) {
+    BufferMetadata data(sizes_, dim_order_, strides_, numel_);
+    buffer_meta_.update(data);
+  }
 }
 
 void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
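The flip_and_unsqueeze calls above suggest the metadata arrays are stored innermost-dim-first and padded out to kTensorDimLimit entries (with src_numel presumably used to fill the padded strides), so shaders can loop over a fixed 8 dims regardless of the tensor's true rank. Here is a sketch, under those assumptions, of the two index conversions the updated shaders below call into; the real implementations live in indexing.glslh, which this page does not show:

```glsl
// Sketch only; TensorIndex and these helpers are assumed to be defined in
// indexing.glslh. A TensorIndex holds one coordinate per dim, up to 8 dims.
#define NDIM_LIMIT 8

struct TensorIndex {
  uvec4 data[2];  // coordinate for dim i lives at data[i / 4][i % 4]
};

// Peel coordinates off a buffer index by walking dims from the largest
// stride to the smallest. Padded dims (size 1, stride assumed to be numel)
// would come first in dim_order and peel off a zero coordinate.
void linear_idx_to_tensor_idx(
    const BufferMetadata meta, const uint bufi, out TensorIndex tidx) {
  uint rem = bufi;
  for (uint i = 0; i < NDIM_LIMIT; ++i) {
    const uint dim = meta.dim_order[i / 4][i % 4];
    const uint stride = meta.strides[dim / 4][dim % 4];
    tidx.data[dim / 4][dim % 4] = rem / stride;
    rem %= stride;
  }
}

// Inverse mapping: sum of per-dim coordinates times per-dim strides.
uint tensor_idx_to_linear_idx(
    const BufferMetadata meta, const TensorIndex tidx) {
  uint bufi = 0;
  for (uint i = 0; i < NDIM_LIMIT; ++i) {
    bufi += tidx.data[i / 4][i % 4] * meta.strides[i / 4][i % 4];
  }
  return bufi;
}
```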

backends/vulkan/runtime/api/containers/Tensor.h

Lines changed: 29 additions & 0 deletions
@@ -19,6 +19,8 @@
 namespace vkcompute {
 namespace api {
 
+static constexpr size_t kTensorDimLimit = 8;
+
 /*
  * Given a GPUMemoryLayout value, produce a dim order vector that matches the
  * given memory layout. The produced dim order vector will be in the NCHW
@@ -262,6 +264,26 @@ class vTensor final {
         const Attribute attr);
   };
 
+  struct BufferMetadata {
+    uint32_t sizes[kTensorDimLimit];
+    uint32_t dim_order[kTensorDimLimit];
+    uint32_t strides[kTensorDimLimit];
+    uint32_t ndim;
+    uint32_t numel;
+
+    BufferMetadata(
+        std::vector<int64_t> sizes,
+        std::vector<int64_t> dim_order,
+        std::vector<int64_t> strides,
+        size_t numel);
+
+    void update(
+        std::vector<int64_t> sizes,
+        std::vector<int64_t> dim_order,
+        std::vector<int64_t> strides,
+        size_t numel);
+  };
+
  private:
   /*
    * "Core" tensor metadata. They are the minimum amount of information required
@@ -332,6 +354,11 @@ class vTensor final {
    */
   ParamsBuffer uniforms_;
 
+  /*
+   * TODO: explain
+   */
+  ParamsBuffer buffer_meta_;
+
   uint32_t uniforms_size_ = 0u;
   uint32_t sizes_uniform_offset_ = kUniformOffsetUnset;
   uint32_t dim_order_uniform_offset_ = kUniformOffsetUnset;
@@ -557,6 +584,8 @@ class vTensor final {
 
   const vkapi::BufferBindInfo numel_ubo();
 
+  const vkapi::BufferBindInfo buffer_meta_ubo();
+
  public:
   inline size_t staging_buffer_numel() const {
     return storage_->buffer_len();

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 8 additions & 0 deletions
@@ -357,6 +357,10 @@ class ComputeGraph final {
     return values_.at(idx).toConstTensor().has_buffer_storage();
   }
 
+  inline bool is_texture_storage(const ValueRef idx) const {
+    return !is_buffer_storage(idx);
+  }
+
   /*
    * Checks that the following is true:
    * 1. The value at `idx` is a tensor
@@ -411,6 +415,10 @@ class ComputeGraph final {
     return values_.at(idx).toTensor().sizes_ubo();
   }
 
+  inline vkapi::BufferBindInfo buffer_meta_ubo(const ValueRef idx) {
+    return values_.at(idx).toTensor().buffer_meta_ubo();
+  }
+
   inline vkapi::BufferBindInfo strides_ubo(const ValueRef idx) {
     return values_.at(idx).toTensor().strides_ubo();
   }

backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl

Lines changed: 20 additions & 15 deletions
@@ -34,6 +34,8 @@ $if IS_COMPARISON_OP:
 
 layout(std430) buffer;
 
+#include "indexing.glslh"
+
 $if IS_COMPARISON_OP:
   ${layout_declare_tensor(B, "w", "t_out", "uint8", STORAGE)}
 $else:
@@ -43,13 +45,11 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
 ${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)}
 
 $if STORAGE == "buffer":
+  ${layout_declare_ubo(B, "BufferMetadata", "outp")}
+  ${layout_declare_ubo(B, "BufferMetadata", "inp")}
+  ${layout_declare_ubo(B, "BufferMetadata", "other")}
+
   layout(push_constant) uniform restrict Block {
-    ivec4 in_sizes;
-    ivec4 other_sizes;
-    ivec4 out_strides;
-    ivec4 in_strides;
-    ivec4 other_strides;
-    int out_numel;
     float alpha;
   };
 $else:
@@ -83,25 +83,30 @@ $else:
 #ifdef USING_BUFFER
 
 void main() {
-  const int out_bufi = ivec3(gl_GlobalInvocationID).x;
-  if (out_bufi >= out_numel) {
+  const uint out_bufi = gl_GlobalInvocationID.x;
+  if (out_bufi >= numel(outp)) {
     return;
   }
 
   // Simple case; no broadcasting
-  if (in_sizes == other_sizes) {
+  if (are_equal(inp, other)) {
     t_out[out_bufi] = T(op(t_in[out_bufi], t_other[out_bufi], T(alpha)));
     return;
   }
 
-  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);
-  const ivec4 in_tidx = min(out_tidx, in_sizes - 1);
-  const ivec4 other_tidx = min(out_tidx, other_sizes - 1);
+  TensorIndex outp_tidx;
+  linear_idx_to_tensor_idx(outp, out_bufi, outp_tidx);
+
+  TensorIndex inp_tidx = outp_tidx;
+  clamp_tensor_idx(inp, inp_tidx);
+
+  TensorIndex other_tidx = outp_tidx;
+  clamp_tensor_idx(other, other_tidx);
 
-  const int in_bufi = tidx_to_bufi(in_tidx, in_strides);
-  const int other_bufi = tidx_to_bufi(other_tidx, other_strides);
+  uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx);
+  uint other_bufi = tensor_idx_to_linear_idx(other, other_tidx);
 
-  t_out[out_bufi] = T(op(t_in[in_bufi], t_other[other_bufi], T(alpha)));
+  t_out[out_bufi] = T(op(t_in[inp_bufi], t_other[other_bufi], T(alpha)));
 }
 
 #else // USING_TEXTURE
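Note how broadcasting is now handled entirely by clamp_tensor_idx: wherever the input has size 1 but the output index runs larger, clamping the coordinate to size - 1 pins it to 0. This is the 8-dim generalization of the old min(out_tidx, in_sizes - 1) trick. A possible implementation, again only a sketch of what indexing.glslh might contain:

```glsl
// Sketch only. Clamp each coordinate to the tensor's size - 1 in that dim;
// broadcast (size-1) dims collapse to coordinate 0, matching the old
// min(out_tidx, in_sizes - 1) behavior but for up to 8 dims. min() on uvec4
// operates componentwise, so two statements cover all 8 dims.
void clamp_tensor_idx(const BufferMetadata meta, inout TensorIndex tidx) {
  tidx.data[0] = min(tidx.data[0], meta.sizes[0] - 1);
  tidx.data[1] = min(tidx.data[1], meta.sizes[1] - 1);
}
```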

backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl

Lines changed: 13 additions & 20 deletions
@@ -4,40 +4,33 @@
 
 #define T ${buffer_scalar_type(DTYPE)}
 
-#include "indexing_utils.h"
-
 ${define_required_extensions(DTYPE)}
 
 layout(std430) buffer;
 
-${layout_declare_tensor(0, "w", "nchw_buf", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "nchw_buf", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_inp", DTYPE, STORAGE)}
 
-$if USE_PUSH_CONST:
-  layout(push_constant) uniform restrict Block {
-    ivec4 in_sizes;
-    ivec4 in_strides;
-    int numel;
-  };
-$else:
-  ${layout_declare_ubo(2, "ivec4", "in_sizes")}
-  ${layout_declare_ubo(3, "ivec4", "in_strides")}
-  ${layout_declare_ubo(4, "int", "numel")}
+${layout_declare_ubo(B, "BufferMetadata", "inp")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 // This constant is unused in this shader but is kept so that the signature is
 // consistent with image_to_nchw.
-layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM;
+${layout_declare_spec_const(C, "int", "unused", "0")}
 
 void main() {
-  int nchwi = int(gl_GlobalInvocationID.x);
-  if (nchwi >= numel) {
+  uint inp_bufi = gl_GlobalInvocationID.x;
+  if (inp_bufi >= numel(inp)) {
     return;
   }
 
-  ivec4 in_tidx = nchwi_to_tidx(nchwi, in_sizes);
-  const int in_bufi = tidx_to_bufi(in_tidx, in_strides);
+  TensorIndex inp_tidx;
+  linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx);
+
+  uint nchwi = tensor_idx_to_contiguous_idx(inp, inp_tidx);
 
-  nchw_buf[nchwi] = t_in[in_bufi];
+  nchw_buf[nchwi] = t_inp[inp_bufi];
 }
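The shader now iterates over the tensor's buffer elements rather than NCHW positions: it recovers a tensor index from the buffer index, then re-linearizes it with tensor_idx_to_contiguous_idx, which presumably uses running products of sizes (rather than the tensor's actual strides) to compute the element's offset in the contiguous NCHW staging buffer. A sketch under the same innermost-dim-first assumption as above:

```glsl
// Sketch only. Linearize a tensor index as if the tensor were contiguous:
// the running product of sizes plays the role of the stride. With metadata
// stored innermost-dim-first, dim 0 is the fastest-moving dim, which yields
// the NCHW (contiguous) ordering of the staging buffer.
uint tensor_idx_to_contiguous_idx(
    const BufferMetadata meta, const TensorIndex tidx) {
  uint nchwi = 0;
  uint contig_stride = 1;
  for (uint i = 0; i < NDIM_LIMIT; ++i) {
    nchwi += tidx.data[i / 4][i % 4] * contig_stride;
    contig_stride *= meta.sizes[i / 4][i % 4];
  }
  return nchwi;
}
```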

backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml

Lines changed: 0 additions & 2 deletions
@@ -19,5 +19,3 @@ buffer_to_nchw:
       - VALUE: int32
   shader_variants:
     - NAME: buffer_to_nchw
-    - NAME: buffer_to_nchw_no_pc
-      USE_PUSH_CONST: False
