
Commit 728a29d

Pack buffer-backed tensors correctly when moving into and out of staging
Differential Revision: D61150844
Pull Request resolved: #4673
Parent: 8f46971
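In short: buffer-backed tensors can carry arbitrary WHCN strides (e.g. channels-packed layouts), so moving them into and out of NCHW staging buffers cannot be a plain element-for-element copy. This commit replaces the generic buffer_to_buffer staging shader with dedicated nchw_to_buffer and buffer_to_nchw shaders that translate between NCHW staging order and the tensor's strided layout using its sizes and strides.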

File tree: 10 files changed (+177, -22 lines)

backends/vulkan/runtime/api/containers/Tensor.h

Lines changed: 8 additions & 0 deletions
```diff
@@ -277,6 +277,14 @@ class vTensor final {
     return sizes_.size();
   }
 
+  inline const std::vector<int64_t>& strides() const {
+    return strides_;
+  }
+
+  inline const std::vector<int64_t>& unsqueezed_strides() const {
+    return unsqueezed_strides_;
+  }
+
   /*
    * Returns a GPU buffer containing the sizes of the tensor in WHCN order.
    * Note that dimensions that are not present in the tensor's sizes are set to
```
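The two new accessors expose the tensor's strides to host code (the staging logic below binds them to shaders through strides_ubo). As a minimal illustrative sketch, not part of this commit, and assuming strides() is ordered consistently with sizes(), strided addressing with these values looks like:

```cpp
#include <cstdint>
#include <vector>

// Hypothetical helper for illustration: maps a dim-ordered index to a flat
// buffer offset using the classic dot product of index and strides.
int64_t flat_offset(
    const std::vector<int64_t>& idx,
    const std::vector<int64_t>& strides) {
  int64_t offset = 0;
  for (size_t d = 0; d < idx.size(); ++d) {
    offset += idx[d] * strides[d];
  }
  return offset;
}
```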

backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl

Lines changed: 0 additions & 1 deletion
```diff
@@ -1,4 +1,3 @@
-
 #version 450 core
 
 #define PRECISION ${PRECISION}
```
backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl (new file)

Lines changed: 35 additions & 0 deletions

```glsl
#version 450 core

#define PRECISION ${PRECISION}

#define T ${buffer_scalar_type(DTYPE)}

#include "indexing_utils.h"

${define_required_extensions(DTYPE)}

layout(std430) buffer;

${layout_declare_tensor(0, "w", "nchw_buf", DTYPE, STORAGE)}
${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
${layout_declare_ubo(2, "ivec4", "in_sizes")}
${layout_declare_ubo(3, "ivec4", "in_strides")}
${layout_declare_ubo(4, "int", "numel")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// This constant is unused in this shader but is kept so that the signature is
// consistent with image_to_nchw.
layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM;

void main() {
  int out_id = int(gl_GlobalInvocationID.x);
  if (out_id >= numel) {
    return;
  }

  ivec4 t_in_idx = from_nchw_buffer_i(out_id, in_sizes);
  const int in_id = to_buffer_id(t_in_idx, in_strides);

  nchw_buf[out_id] = t_in[in_id];
}
```
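Each invocation of this shader owns one element of the NCHW staging buffer: it unravels its output index into a (w, h, c, n) tensor index via the sizes, then resolves that index to an offset in the strided tensor buffer. Below is a host-side reference of the same gather, a sketch only: it mirrors the GLSL helpers from indexing_utils.h (shown later in this diff) and assumes the standard NCHW unraveling for from_nchw_buffer_i.

```cpp
#include <array>
#include <vector>

using ivec4 = std::array<int, 4>; // WHCN order: (W, H, C, N)

// Mirrors from_nchw_buffer_i: NCHW-contiguous offset -> (w, h, c, n) index.
ivec4 from_nchw_buffer_i(int buf_i, const ivec4& sizes) {
  return {
      buf_i % sizes[0],
      (buf_i / sizes[0]) % sizes[1],
      (buf_i / (sizes[0] * sizes[1])) % sizes[2],
      buf_i / (sizes[0] * sizes[1] * sizes[2])};
}

// Mirrors to_buffer_id: (w, h, c, n) index -> strided buffer offset.
int to_buffer_id(const ivec4& idx, const ivec4& strides) {
  return idx[0] * strides[0] + idx[1] * strides[1] + idx[2] * strides[2] +
      idx[3] * strides[3];
}

// One gather per staging element, like the shader's main().
void buffer_to_nchw_reference(
    std::vector<float>& nchw_buf,
    const std::vector<float>& t_in,
    const ivec4& in_sizes,
    const ivec4& in_strides) {
  const int numel = in_sizes[0] * in_sizes[1] * in_sizes[2] * in_sizes[3];
  for (int out_id = 0; out_id < numel; ++out_id) {
    const ivec4 t_in_idx = from_nchw_buffer_i(out_id, in_sizes);
    nchw_buf[out_id] = t_in[to_buffer_id(t_in_idx, in_strides)];
  }
}
```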
backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml (new file)

Lines changed: 18 additions & 0 deletions

```yaml
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

buffer_to_nchw:
  parameter_names_with_default_values:
    DTYPE: float
    STORAGE: buffer
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
      - VALUE: int
      - VALUE: int8
  shader_variants:
    - NAME: buffer_to_nchw
```
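Presumably the shader codegen expands generate_variant_forall into one compiled variant per DTYPE, with names along the lines of buffer_to_nchw_half and buffer_to_nchw_float (the exact suffixes come from add_dtype_suffix, used in StagingUtils.cpp below), so the runtime can select the variant matching the tensor's dtype.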

backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h

Lines changed: 46 additions & 9 deletions
```diff
@@ -41,6 +41,21 @@
  */
 #define alignup4(x) ((x + 3) & -4)
 
+/*
+ * Input: (W, H, C, N) strides of a tensor
+ * Returns: the WHCN index of the fastest moving dimension
+ */
+int find_packed_dim(const ivec4 strides) {
+  int packed_dim = 0;
+  for (int i = 0; i <= 3; i++) {
+    if (strides[i] == 1) {
+      packed_dim = i;
+      break;
+    }
+  }
+  return packed_dim;
+}
+
 //
 // (w, h, c, n) Tensor Index <-> Contiguous Buffer Index Conversion
 //
@@ -74,27 +89,49 @@ ivec4 from_nchw_buffer_i(int buf_i, ivec4 sizes) {
       (buf_i / (sizes.x * sizes.y * sizes.z)));
 }
 
+int to_nchw_buffer_i(const ivec4 tensor_idx, const ivec4 sizes) {
+  return tensor_idx.w * sizes.x * sizes.y * sizes.z +
+      tensor_idx.z * sizes.x * sizes.y + tensor_idx.y * sizes.x + tensor_idx.x;
+}
+
 /*
  * Input: Texel buffer index, (W, H, C, N) strides of a tensor, which dim is
  * packed along a texel
- * Returns: The (x, y, z, n) texel position corresponding to the first element
- * of the texel at the specified buffer index
+ * Returns: The (w, h, c, n) tensor index corresponding to the buffer element
  */
-ivec4 to_tensor_idx(int buf_i, ivec4 strides, int packed_dim) {
+ivec4 to_tensor_idx(int buffer_id, const ivec4 strides, const int packed_dim) {
   ivec4 idx;
   for (int i = 3; i >= 0; i--) {
     if (i != packed_dim) {
-      idx[i] = buf_i / strides[i];
-      buf_i %= strides[i];
+      idx[i] = buffer_id / strides[i];
+      buffer_id %= strides[i];
     }
   }
-  idx[packed_dim] = buf_i;
+  idx[packed_dim] = buffer_id;
   return idx;
 }
 
-int to_texel_idx(const ivec4 texel_pos, ivec4 strides) {
-  return texel_pos.x * strides.x + texel_pos.y * strides.y +
-      texel_pos.z * strides.z + texel_pos.w * strides.w;
+/*
+ * Input: Texel buffer index, (W, H, C, N) strides of a tensor
+ * Returns: The (w, h, c, n) tensor index corresponding to the buffer element
+ *
+ * This is a convenience overload of the above function. If the packed dim is
+ * not known, it can be found by finding the first dimension with a stride of 1.
+ * However, this process adds some overhead, so if performance is a concern then
+ * the above function should be used instead so that the packed dim is provided.
+ */
+ivec4 to_tensor_idx(int buffer_id, const ivec4 strides) {
+  int packed_dim = find_packed_dim(strides);
+  return to_tensor_idx(buffer_id, strides, packed_dim);
+}
+
+/*
+ * Input: (w, h, c, n) tensor index, (W, H, C, N) strides of the tensor buffer
+ * Returns: the buffer index corresponding to the specified tensor index
+ */
+int to_buffer_id(const ivec4 tensor_idx, ivec4 strides) {
+  return tensor_idx.x * strides.x + tensor_idx.y * strides.y +
+      tensor_idx.z * strides.z + tensor_idx.w * strides.w;
 }
 
 //
```
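A worked example of the round trip these helpers enable (sizes and strides chosen for illustration): take WHCN sizes (4, 3, 2, 1) with channels-packed strides (2, 8, 1, 24), so find_packed_dim returns 2 because C is the stride-1 dimension. The tensor index (w, h, c, n) = (1, 2, 1, 0) maps to the strided buffer offset to_buffer_id = 1*2 + 2*8 + 1*1 + 0*24 = 19, while the same element sits at NCHW staging offset to_nchw_buffer_i = 0*24 + 1*12 + 2*4 + 1 = 21. Conversely, to_tensor_idx(19, strides) recovers (1, 2, 1, 0). The new staging shaders simply chain these two mappings in opposite orders.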
backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl (new file)

Lines changed: 35 additions & 0 deletions

```glsl
#version 450 core

#define PRECISION ${PRECISION}

#define T ${buffer_scalar_type(DTYPE)}

#include "indexing_utils.h"

${define_required_extensions(DTYPE)}

layout(std430) buffer;

${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)}
${layout_declare_ubo(2, "ivec4", "out_sizes")}
${layout_declare_ubo(3, "ivec4", "out_strides")}
${layout_declare_ubo(4, "int", "numel")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// This constant is unused in this shader but is kept so that the signature is
// consistent with nchw_to_image.
layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM;

void main() {
  int out_id = int(gl_GlobalInvocationID.x);
  if (out_id >= numel) {
    return;
  }

  ivec4 out_idx = to_tensor_idx(out_id, out_strides);
  const int in_id = to_nchw_buffer_i(out_idx, out_sizes);

  t_out[out_id] = nchw_in[in_id];
}
```
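Note that both new shaders are gathers over their write target: each invocation computes exactly one output element (a staging element in buffer_to_nchw, a tensor buffer element here), so no two invocations write the same location. This direction uses the two-argument to_tensor_idx overload, paying the small per-invocation find_packed_dim cost instead of receiving the packed dim as a specialization constant.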
backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml (new file)

Lines changed: 18 additions & 0 deletions

```yaml
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

nchw_to_buffer:
  parameter_names_with_default_values:
    DTYPE: float
    STORAGE: buffer
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
      - VALUE: int
      - VALUE: int8
  shader_variants:
    - NAME: nchw_to_buffer
```

backends/vulkan/runtime/graph/ops/impl/Staging.cpp

Lines changed: 9 additions & 3 deletions
```diff
@@ -26,7 +26,10 @@ void add_staging_to_tensor_node(
 
   vkapi::ParamsBindList ubos;
   if (graph.is_buffer_storage(out_tensor)) {
-    ubos.append(graph.numel_ubo(out_tensor));
+    ubos.append(
+        {graph.sizes_ubo(out_tensor),
+         graph.strides_ubo(out_tensor),
+         graph.numel_ubo(out_tensor)});
   } else {
     ubos.append(graph.sizes_ubo(out_tensor));
   }
@@ -61,7 +64,10 @@ void add_tensor_to_staging_node(
 
   vkapi::ParamsBindList ubos;
   if (graph.is_buffer_storage(in_tensor)) {
-    ubos.append(graph.numel_ubo(in_tensor));
+    ubos.append(
+        {graph.sizes_ubo(in_tensor),
+         graph.strides_ubo(in_tensor),
+         graph.numel_ubo(in_tensor)});
   } else {
     ubos.append(graph.sizes_ubo(in_tensor));
   }
@@ -105,7 +111,7 @@ ValueRef prepack(
 
   vkapi::ParamsBindList ubos;
   if (graph.is_buffer_storage(v)) {
-    ubos.append(graph.numel_ubo(v));
+    ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)});
   } else {
     ubos.append(graph.sizes_ubo(v));
   }
```
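The appended UBO order (sizes, strides, numel) lines up with binding slots 2 through 4 declared via layout_declare_ubo in the new buffer shaders; texture-backed tensors keep binding only the sizes UBO.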

backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -107,7 +107,7 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader(
   }
 
   if (v_dst.storage_type() == utils::kBuffer) {
-    kernel_name = "buffer_to_buffer";
+    kernel_name = "nchw_to_buffer";
     add_dtype_suffix(kernel_name, v_dst);
     return VK_KERNEL_FROM_STR(kernel_name);
   }
@@ -131,7 +131,7 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader(
   }
 
   if (v_src.storage_type() == utils::kBuffer) {
-    kernel_name = "buffer_to_buffer";
+    kernel_name = "buffer_to_nchw";
     add_dtype_suffix(kernel_name, v_src);
     return VK_KERNEL_FROM_STR(kernel_name);
   }
```

backends/vulkan/test/utils/test_utils.cpp

Lines changed: 6 additions & 7 deletions
```diff
@@ -23,22 +23,22 @@ void record_nchw_to_buffer_op(
     vkapi::VulkanBuffer& src_buffer,
     api::vTensor& v_dst) {
   vkapi::PipelineBarrier pipeline_barrier{};
-  vkapi::SpecVarList specialization_constants = {
-      SV(v_dst.packed_dim_whcn_idx())};
 
   context->submit_compute_job(
       get_nchw_to_tensor_shader(v_dst),
       pipeline_barrier,
       {uint32_t(v_dst.numel()), 1, 1},
       {64, 1, 1},
-      specialization_constants,
+      {},
       VK_NULL_HANDLE,
       0,
       v_dst.buffer(
           pipeline_barrier,
           vkapi::PipelineStage::COMPUTE,
           vkapi::MemoryAccessType::WRITE),
       src_buffer,
+      v_dst.sizes_ubo(),
+      v_dst.strides_ubo(),
       v_dst.numel_ubo());
 }
 
@@ -47,19 +47,18 @@ void record_buffer_to_nchw_op(
     api::vTensor& v_src,
     vkapi::VulkanBuffer& dst_buffer) {
   vkapi::PipelineBarrier pipeline_barrier{};
-  vkapi::SpecVarList specialization_constants = {
-      SV(v_src.packed_dim_whcn_idx())};
-
   context->submit_compute_job(
       get_tensor_to_nchw_shader(v_src),
       pipeline_barrier,
       {uint32_t(v_src.numel()), 1, 1},
       {64, 1, 1},
-      specialization_constants,
+      {},
       VK_NULL_HANDLE,
       0,
       dst_buffer,
       v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
+      v_src.sizes_ubo(),
+      v_src.strides_ubo(),
       v_src.numel_ubo());
 }
```
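With the packed dim either unused (UNUSED_packed_dim) or derived in-shader via find_packed_dim, the test helpers no longer pass a specialization constant and instead bind the sizes and strides UBOs alongside numel.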
