Commit 6f01e12

Update on "[ET-VK] Migrate ops to use DynamicDispatchNode"
## Changes

* Migrate the operators used in the llama model to use `DynamicDispatchNode` instead of `DispatchNode`.

## Motivation

`DynamicDispatchNode` is a subclass of `DispatchNode` that allows the compute shader, global work group size, and local work group size to be selected dynamically each time the command buffer is encoded. This is critical for achieving optimal performance when input shapes are dynamic, since it lets operators pick the best compute shader for the current input conditions and adjust the global work group size so that only the minimum number of work groups is launched.

Without this change, performance of llama 3.2 1B with dynamic shapes enabled is terrible (< 1 tok/s) because global work group sizes are determined from maximum tensor sizes, which are derived from the maximum sequence length. In practice, the sequence length dimension of tensors (even during the prefill phase) will not approach the maximum, so many inactive threads are launched during compute shader dispatches.

Differential Revision: [D75878398](https://our.internmc.facebook.com/intern/diff/D75878398/)

[ghstack-poisoned]
2 parents: fd00dac + a155072 · commit 6f01e12
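To make the motivation concrete, here is a minimal, self-contained sketch (not part of this commit; the local size, sequence lengths, and helper function are hypothetical) contrasting a dispatch sized once from the maximum sequence length with one sized at encode time from the actual extent, which is what the `DynamicDispatchNode` migration enables.

```cpp
// Illustrative only: compares how many work groups are launched when the
// global work group size is derived from the maximum sequence length
// (static DispatchNode-style sizing) versus the actual sequence length
// observed at encode time (DynamicDispatchNode-style sizing).
#include <cstdint>
#include <iostream>

// Hypothetical helper: work groups needed to cover `extent` elements with
// `local_size` threads per group, rounded up.
uint32_t num_workgroups(uint32_t extent, uint32_t local_size) {
  return (extent + local_size - 1) / local_size;
}

int main() {
  const uint32_t local_size = 64;      // threads per work group along one axis
  const uint32_t max_seq_len = 2048;   // sizing input for the static approach
  const uint32_t actual_seq_len = 17;  // typical extent during decode

  // Sized from maximum tensor sizes: most launched threads are inactive.
  std::cout << "static:  " << num_workgroups(max_seq_len, local_size) << " work groups\n";
  // Re-sized at encode time from the actual extent: minimal launch.
  std::cout << "dynamic: " << num_workgroups(actual_seq_len, local_size) << " work groups\n";
  return 0;
}
```

With these hypothetical numbers, static sizing launches 32 work groups per dispatch while encode-time sizing launches 1; that reduction along the sequence-length dimension is the effect the migration to `DynamicDispatchNode` is after.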

33 files changed: +453 −200 lines

.ci/scripts/utils.sh

Lines changed: 1 addition & 2 deletions
@@ -158,8 +158,7 @@ build_executorch_runner() {
 cmake_install_executorch_lib() {
   echo "Installing libexecutorch.a and libportable_kernels.a"
   clean_executorch_install_folders
-  retry cmake -DBUCK2="$BUCK" \
-    -DCMAKE_INSTALL_PREFIX=cmake-out \
+  retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE=Release \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out .

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
#define T ${buffer_scalar_type(DTYPE)}

${define_active_storage_type(STORAGE)}

#include "indexing_utils.h"

${define_required_extensions(DTYPE)}

layout(std430) buffer;

${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
$if STORAGE == "buffer":
  ${layout_declare_ubo(2, "int", "numel")}
$else:
  ${layout_declare_ubo(2, "ivec3", "out_limits")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

#include "activations.h"

#ifdef USING_BUFFER

void main() {
  const int i = int(gl_GlobalInvocationID.x);
  if (i >= numel) {
    return;
  }

  float in_val = float(t_in[i]);
  t_out[i] = T(tan(in_val));
}

#else

void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  if (any(greaterThanEqual(pos, out_limits))) {
    return;
  }

  VEC4_T in_texel = texelFetch(t_in, pos, 0);
  imageStore(t_out, pos, VEC4_T(tan(in_texel)));
}

#endif
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
tan:
  parameter_names_with_default_values:
    DTYPE: float
    STORAGE: texture3d
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
    STORAGE:
      - VALUE: texture3d
      - VALUE: buffer
  shader_variants:
    - NAME: tan

backends/vulkan/runtime/graph/ops/impl/Clone.cpp

Lines changed: 12 additions & 1 deletion
@@ -61,6 +61,17 @@ void add_clone_node(
       resize_clone_node));
 }
 
+utils::uvec3 clone_image_to_buffer_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+  const ValueRef image = args.at(1).refs.at(0);
+  return graph->create_global_wg_size(image);
+}
+
 void add_image_to_buffer_node(
     ComputeGraph& graph,
     const ValueRef image,
@@ -72,7 +83,7 @@ void add_image_to_buffer_node(
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       shader,
-      default_pick_global_wg_size,
+      clone_image_to_buffer_global_wg_size,
       default_pick_local_wg_size,
       // Input and Outputs
       {{buffer, vkapi::kWrite}, {image, vkapi::kRead}},

backends/vulkan/runtime/graph/ops/impl/Common.cpp

Lines changed: 5 additions & 2 deletions
@@ -14,8 +14,9 @@ utils::uvec3 default_pick_global_wg_size(
     ComputeGraph* graph,
     const vkapi::ShaderInfo& shader,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& additional_args) {
+    const std::vector<ValueRef>& resize_args) {
   (void)shader;
+  (void)resize_args;
   const ValueRef out = args.at(0).refs.at(0);
   return graph->create_global_wg_size(out);
 }
@@ -25,8 +26,10 @@ utils::uvec3 default_pick_local_wg_size(
     const vkapi::ShaderInfo& shader,
     const utils::uvec3& global_workgroup_size,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& additional_args) {
+    const std::vector<ValueRef>& resize_args) {
   (void)shader;
+  (void)args;
+  (void)resize_args;
   return graph->create_local_wg_size(global_workgroup_size);
 }
 

backends/vulkan/runtime/graph/ops/impl/Common.h

Lines changed: 2 additions & 10 deletions
@@ -17,31 +17,23 @@ namespace vkcompute {
  * Creates a global workgroup size based on the first output tensor in the args.
  * This is a utility function that extracts the output tensor from
  * args.at(0).refs.at(0) and calls graph->create_global_wg_size(out) on it.
- *
- * @param graph The ComputeGraph instance
- * @param args Vector of ArgGroup containing the output tensor reference
- * @return utils::uvec3 The global workgroup size
  */
 utils::uvec3 default_pick_global_wg_size(
     ComputeGraph* graph,
     const vkapi::ShaderInfo& shader,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& additional_args);
+    const std::vector<ValueRef>& resize_args);
 
 /**
  * Creates a local workgroup size based on the first output tensor in the args.
  * This is a utility function that extracts the output tensor from
  * args.at(0).refs.at(0) and calls graph->create_local_wg_size(out) on it.
- *
- * @param graph The ComputeGraph instance
- * @param args Vector of ArgGroup containing the output tensor reference
- * @return utils::uvec3 The local workgroup size
  */
 utils::uvec3 default_pick_local_wg_size(
     ComputeGraph* graph,
     const vkapi::ShaderInfo& shader,
     const utils::uvec3& global_workgroup_size,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& additional_args);
+    const std::vector<ValueRef>& resize_args);
 
 } // namespace vkcompute

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

using namespace utils;

void resize_tan_node(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& extra_args) {
  (void)extra_args;
  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
  vTensorPtr self = graph->get_tensor(args[1].refs[0]);

  out->virtual_resize(self->sizes());
}

void add_tan_node(ComputeGraph& graph, const ValueRef in, const ValueRef out) {
  std::string kernel_name = "tan";
  add_dtype_suffix(kernel_name, graph.dtype_of(out));
  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));

  vkapi::ParamsBindList ubos({});
  ubos.append({graph.logical_limits_ubo(out)});

  graph.execute_nodes().emplace_back(new DispatchNode(
      graph,
      VK_KERNEL_FROM_STR(kernel_name),
      graph.create_global_wg_size(out),
      graph.create_local_wg_size(out),
      // Inputs and Outputs
      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
      // Shader params buffers
      ubos,
      // Push Constants
      {},
      // Specialization Constants
      {},
      // Resize Args
      {},
      // Resizing Logic
      resize_tan_node));
}

void tan(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  return add_tan_node(graph, args[0], args[1]);
}

REGISTER_OPERATORS {
  VK_REGISTER_OP(aten.tan.default, tan);
}

} // namespace vkcompute

backends/vulkan/test/op_tests/cases.py

Lines changed: 16 additions & 0 deletions
@@ -1171,6 +1171,22 @@ def get_unary_ops_inputs():
     return test_suite
 
 
+# separate test suite from unary_ops for learning purposes
+@register_test_suite("aten.tan.default")
+def get_tan_inputs():
+    test_suite = VkTestSuite(
+        [
+            (M1,),
+            (M1, M2),
+            (S1, M1, M2),
+            (S1, S2, S2, M2),
+        ]
+    )
+    test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
+    test_suite.dtypes = ["at::kFloat", "at::kHalf"]
+    return test_suite
+
+
 @register_test_suite("aten._native_batch_norm_legit_no_training.default")
 def get_native_batch_norm_inputs():
     Test = namedtuple(

backends/vulkan/test/utils/test_utils.cpp

Lines changed: 3 additions & 6 deletions
@@ -547,8 +547,8 @@ vkcompute::ComputeGraph build_mm_graph(
     vkcompute::vkapi::ScalarType dtype,
     vkcompute::utils::StorageType in_out_stype,
     vkcompute::utils::GPUMemoryLayout memory_layout,
-    const bool prepack_mat2,
-    const float mat2_val) {
+    const std::vector<float>& mat2_data,
+    const bool prepack_mat2) {
   using namespace vkcompute;
   GraphConfig config;
   ComputeGraph graph(config);
@@ -569,10 +569,7 @@ vkcompute::ComputeGraph build_mm_graph(
       graph.add_input_tensor(mat1_size, dtype, in_out_stype, memory_layout);
   IOValueRef mat2{};
 
-  CREATE_RAND_WEIGHT_TENSOR(mat2_w, mat2_size, dtype);
-  if (mat2_val != 0.0f) {
-    std::fill(data_mat2_w.begin(), data_mat2_w.end(), mat2_val);
-  }
+  ValueRef mat2_w = graph.add_tensorref(mat2_size, dtype, mat2_data.data());
 
   if (prepack_mat2) {
     mat2.value = mat2_w;

backends/vulkan/test/utils/test_utils.h

Lines changed: 2 additions & 2 deletions
@@ -265,8 +265,8 @@ vkcompute::ComputeGraph build_mm_graph(
     vkcompute::vkapi::ScalarType dtype,
     vkcompute::utils::StorageType in_out_stype,
     vkcompute::utils::GPUMemoryLayout memory_layout,
-    const bool prepack_mat2 = false,
-    const float mat2_val = 0.0f);
+    const std::vector<float>& mat2_data,
+    const bool prepack_mat2 = false);
 
 //
 // Debugging Utilities
