Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions backends/vulkan/runtime/graph/ComputeGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,15 @@ ValueRef ComputeGraph::add_symint(const int32_t val) {
return idx;
}

ValueRef ComputeGraph::get_or_add_value_for_int(const int64_t val) {
  // Linear scan over the graph's value list: if an Int value equal to `val`
  // already exists, reuse it instead of appending a duplicate. The index type
  // matches values_.size() to avoid a signed/unsigned comparison; ValueRef is
  // an index type, so the match is cast back explicitly.
  for (size_t i = 0; i < values_.size(); ++i) {
    const auto& value = values_.at(i);
    if (value.isInt() && value.toInt() == val) {
      return static_cast<ValueRef>(i);
    }
  }
  // No existing Int matches; append a new scalar value and return its index.
  return add_scalar(val);
}

ValueRef ComputeGraph::set_input_tensor(
const ValueRef idx,
const bool use_staging) {
Expand Down
7 changes: 7 additions & 0 deletions backends/vulkan/runtime/graph/ComputeGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,13 @@ class ComputeGraph final {

ValueRef add_symint(const int32_t val);

/*
 * Searches the graph's value list for an Int value equal to the specified
 * value. If one is found, returns its index. Otherwise, adds a new scalar
 * value to the graph and returns the index of the newly added value.
 */
ValueRef get_or_add_value_for_int(const int64_t val);

ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true);
ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true);

Expand Down
17 changes: 9 additions & 8 deletions backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h>
Expand All @@ -30,8 +31,8 @@ void check_binary_op_args(
void resize_binary_op_node(
ComputeGraph* graph,
const std::vector<ArgGroup>& args,
const std::vector<ValueRef>& extra_args) {
(void)extra_args;
const std::vector<ValueRef>& resize_args) {
(void)resize_args;
vTensorPtr out = graph->get_tensor(args[0].refs[0]);

// TODO(T183442143): Verify tensors are broadcastable.
Expand Down Expand Up @@ -78,11 +79,11 @@ void add_binary_op_texture_node(
add_storage_type_suffix(kernel_name, *t_out);
add_dtype_suffix(kernel_name, *t_out);

graph.execute_nodes().emplace_back(new DispatchNode(
graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
graph.create_global_wg_size(out),
graph.create_local_wg_size(out),
default_pick_global_wg_size,
default_pick_local_wg_size,
// Inputs and Outputs
{{out, vkapi::kWrite}, {{arg1, arg2}, vkapi::kRead}},
// Shader params buffers
Expand Down Expand Up @@ -122,11 +123,11 @@ void add_binary_op_buffer_node(
add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
add_dtype_suffix(kernel_name, graph.dtype_of(out));

graph.execute_nodes().emplace_back(new DispatchNode(
graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
graph.create_global_wg_size(out),
graph.create_local_wg_size(out),
default_pick_global_wg_size,
default_pick_local_wg_size,
// Inputs and Outputs
{{out, vkapi::kWrite}, {{in1, in2}, vkapi::kRead}},
// Shader params buffers
Expand Down
25 changes: 12 additions & 13 deletions backends/vulkan/runtime/graph/ops/impl/Clone.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <executorch/backends/vulkan/runtime/graph/Logging.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/View.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
Expand All @@ -21,8 +22,8 @@ namespace vkcompute {
void resize_clone_node(
ComputeGraph* graph,
const std::vector<ArgGroup>& args,
const std::vector<ValueRef>& extra_args) {
(void)extra_args;
const std::vector<ValueRef>& resize_args) {
(void)resize_args;
vTensorPtr out = graph->get_tensor(args[0].refs[0]);
vTensorPtr in = graph->get_tensor(args[1].refs[0]);
// TODO: support for when dimensionality doesn't match, i.e. clone is used to
Expand All @@ -41,11 +42,11 @@ void add_clone_node(
std::string kernel_name = "clone";
add_dtype_suffix(kernel_name, *t_out);

graph.execute_nodes().emplace_back(new DispatchNode(
graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
graph.create_global_wg_size(out),
graph.create_local_wg_size(out),
default_pick_global_wg_size,
default_pick_local_wg_size,
// Inputs and Outputs
{{out, vkapi::kWrite}, {in, vkapi::kRead}},
// Parameter Buffers
Expand All @@ -68,12 +69,11 @@ void add_image_to_buffer_node(
add_dtype_suffix(kernel_name, graph.dtype_of(image));
vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);

utils::uvec3 global_wg_size = graph.create_global_wg_size(image);
graph.execute_nodes().emplace_back(new DispatchNode(
graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
shader,
global_wg_size,
graph.create_local_wg_size(global_wg_size),
default_pick_global_wg_size,
default_pick_local_wg_size,
// Input and Outputs
{{buffer, vkapi::kWrite}, {image, vkapi::kRead}},
// Parameter Buffers
Expand All @@ -96,12 +96,11 @@ void add_buffer_to_image_node(
add_dtype_suffix(kernel_name, graph.dtype_of(image));
vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);

utils::uvec3 global_wg_size = graph.create_global_wg_size(image);
graph.execute_nodes().emplace_back(new DispatchNode(
graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
shader,
global_wg_size,
graph.create_local_wg_size(global_wg_size),
default_pick_global_wg_size,
default_pick_local_wg_size,
// Input and Outputs
{{image, vkapi::kWrite}, {buffer, vkapi::kRead}},
// Parameter Buffers
Expand Down
161 changes: 100 additions & 61 deletions backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/MatMul.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>

Expand Down Expand Up @@ -37,12 +38,12 @@ void check_matmul_args(
void resize_matmul_node(
ComputeGraph* graph,
const std::vector<ArgGroup>& args,
const std::vector<ValueRef>& extra_args) {
const std::vector<ValueRef>& resize_args) {
vTensorPtr out = graph->get_tensor(args[0].refs[0]);
vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]);
vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]);

bool mat2_is_transposed = graph->get_bool(extra_args[0]);
bool mat2_is_transposed = graph->get_bool(resize_args[0]);

const int out_cols = utils::val_at(-2, mat1->sizes());
const int out_rows = mat2_is_transposed ? utils::val_at(-2, mat2->sizes())
Expand All @@ -56,6 +57,22 @@ void resize_matmul_node(
out->virtual_resize(new_out_sizes);
}

/**
 * Custom global workgroup size function for naive buffer matmul operations.
 * Dispatches one invocation per output element: X covers the width dim, Y the
 * height dim, and Z folds together the channel and batch dims.
 */
utils::uvec3 matmul_naive_buffer_global_wg_size(
    ComputeGraph* graph,
    const vkapi::ShaderInfo& shader,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& resize_args) {
  (void)shader;
  (void)resize_args;
  const ValueRef out = args.at(0).refs.at(0);
  const uint32_t out_width = graph->size_at<uint32_t>(-1, out);
  const uint32_t out_height = graph->size_at<uint32_t>(-2, out);
  const uint32_t out_channels = graph->size_at<uint32_t>(-3, out);
  const uint32_t out_batches = graph->size_at<uint32_t>(-4, out);
  return {out_width, out_height, out_channels * out_batches};
}

void add_matmul_naive_buffer_node(
ComputeGraph& graph,
const ValueRef mat1,
Expand All @@ -72,21 +89,16 @@ void add_matmul_naive_buffer_node(
std::string kernel_name = "matmul_naive_buffer";
add_dtype_suffix(kernel_name, graph.dtype_of(out));

utils::uvec3 global_size = {
graph.size_at<uint32_t>(-1, out),
graph.size_at<uint32_t>(-2, out),
graph.size_at<uint32_t>(-3, out) * graph.size_at<uint32_t>(-4, out)};

int mat2_is_transposed_val = (mat2_is_transposed != kDummyValueRef &&
graph.get_bool(mat2_is_transposed))
? 1
: 0;

graph.execute_nodes().emplace_back(new DispatchNode(
graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
global_size,
graph.create_local_wg_size(global_size),
matmul_naive_buffer_global_wg_size,
default_pick_local_wg_size,
// Inputs and Outputs
{{out, vkapi::kWrite}, {{mat1, mat2}, vkapi::kRead}},
// Shader params buffers
Expand All @@ -109,6 +121,22 @@ void add_matmul_naive_buffer_node(
resize_matmul_node));
}

/*
 * Selects the naive texture3d matmul shader, choosing the transposed variant
 * when resize_args[0] (the mat2_is_transposed flag) is set, and appending the
 * output tensor's storage-type and dtype suffixes.
 */
vkapi::ShaderInfo pick_matmul_naive_texture3d_shader(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& resize_args) {
  const ValueRef out = args.at(0).refs.at(0);

  std::string kernel_name = graph->get_bool(resize_args.at(0))
      ? "matmul_transposed_naive"
      : "matmul_naive";
  kernel_name.reserve(kShaderNameReserve);
  add_storage_type_suffix(kernel_name, graph->storage_type_of(out));
  add_dtype_suffix(kernel_name, graph->dtype_of(out));

  return VK_KERNEL_FROM_STR(kernel_name);
}

void add_matmul_naive_texture3d_node(
ComputeGraph& graph,
const ValueRef mat1,
Expand All @@ -122,19 +150,11 @@ void add_matmul_naive_texture3d_node(
utils::kHeightPacked,
/*passthrough = */ true);

std::string kernel_name = graph.get_bool(mat2_is_transposed)
? "matmul_transposed_naive"
: "matmul_naive";
kernel_name.reserve(kShaderNameReserve);
add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
add_dtype_suffix(kernel_name, graph.dtype_of(out));

utils::uvec3 global_wg_size = graph.logical_limits_of(out);
graph.execute_nodes().emplace_back(new DispatchNode(
graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
global_wg_size,
graph.create_local_wg_size(global_wg_size),
pick_matmul_naive_texture3d_shader,
default_pick_global_wg_size,
default_pick_local_wg_size,
// Inputs and Outputs
{{out, vkapi::kWrite}, {{mat1, mat2}, vkapi::kRead}},
// Shader params buffers
Expand All @@ -156,6 +176,59 @@ void add_matmul_naive_texture3d_node(
resize_matmul_node));
}

/*
 * Selects the optimized matmul shader variant. resize_args[0] holds the
 * mat2_is_transposed flag and resize_args[1] references the W-packed mat1
 * tensor; the variant is chosen from the transpose flag, whether mat1 is
 * batched (3-dim), and the size of mat1's second-to-last dim.
 */
vkapi::ShaderInfo pick_matmul_optimized_shader(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& resize_args) {
  const ValueRef out = args.at(0).refs.at(0);
  const ValueRef mat1_W_packed = resize_args.at(1);

  std::string kernel_name = graph->get_bool(resize_args.at(0))
      ? "matmul_transposed_optimized"
      : "matmul_optimized";

  const std::vector<int64_t> mat1_sizes = graph->sizes_of(mat1_W_packed);
  const int ndim = static_cast<int>(mat1_sizes.size());
  // Batched (3-dim) mat1 uses the batch_ variant of the shader.
  if (ndim == 3) {
    kernel_name = "batch_" + kernel_name;
  }
  // Small second-to-last dim (< 8) uses the shorter 2-row tile variant.
  kernel_name += mat1_sizes.at(ndim - 2) < 8 ? "_tile_row_2" : "_tile_row_4";

  add_dtype_suffix(kernel_name, graph->dtype_of(out));

  return VK_KERNEL_FROM_STR(kernel_name);
}

/*
 * Computes the global workgroup size for the optimized matmul shaders by
 * dividing the output's logical limits by the per-invocation tile shape:
 * width is always divided by 4, and height by 2 or 4 depending on the size of
 * mat1's second-to-last dim (mirroring the shader selection logic).
 */
utils::uvec3 matmul_optimized_global_wg_size(
    ComputeGraph* graph,
    const vkapi::ShaderInfo& shader,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& resize_args) {
  (void)shader;

  const ValueRef out = args.at(0).refs.at(0);
  const ValueRef mat1_W_packed = resize_args.at(1);

  const std::vector<int64_t> mat1_sizes = graph->sizes_of(mat1_W_packed);
  const int ndim = static_cast<int>(mat1_sizes.size());

  const utils::uvec3 limits = graph->logical_limits_of(out);
  if (mat1_sizes.at(ndim - 2) < 8) {
    // Use `logical_extents` instead of `image_extents` because the workgroup
    // axes need to correspond to tensor dimensions.
    return utils::divup_vec(limits, {4, 2, 1});
  }
  return utils::divup_vec(limits, {4, 4, 1});
}

void add_matmul_optimized_node(
ComputeGraph& graph,
const ValueRef mat1,
Expand Down Expand Up @@ -192,45 +265,11 @@ void add_matmul_optimized_node(
viewFn(graph, {mat2, graph.add_none(), mat2_packed});
}

std::string kernel_name = mat2_is_transposed_val
? "matmul_transposed_optimized"
: "matmul_optimized";

std::vector<int64_t> mat1_sizes = graph.sizes_of(mat1_W_packed);
int mat1_dims = mat1_sizes.size();
if (mat1_dims == 3) {
kernel_name = "batch_" + kernel_name;
}
if (mat1_sizes.at(mat1_dims - 2) < 8) {
kernel_name += "_tile_row_2";
} else {
kernel_name += "_tile_row_4";
}

add_dtype_suffix(kernel_name, graph.dtype_of(out));

// Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the
// total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is
// channels packed, C does not need to be divided by 4. The "identity" of each
// thread is the (x, y, z) coordinate of the output tile it is computing, and
// this identity can be used to compute the tensor index of the top left
// element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0]
utils::uvec3 global_size = graph.logical_limits_of(out);
if (mat1_sizes.at(mat1_dims - 2) < 8) {
// Use `logical_extents` instead of `image_extents` because the workgroup
// axes need to correspond to tensor dimensions.
global_size = utils::divup_vec(global_size, {4, 2, 1});
} else {
global_size = utils::divup_vec(global_size, {4, 4, 1});
}

utils::uvec3 local_size = adaptive_work_group_size(global_size);

graph.execute_nodes().emplace_back(new DispatchNode(
graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
global_size,
local_size,
pick_matmul_optimized_shader,
matmul_optimized_global_wg_size,
default_pick_local_wg_size,
// Inputs and Outputs
{{out, vkapi::kWrite}, {{mat1_W_packed, mat2_packed}, vkapi::kRead}},
// Shader params buffers
Expand All @@ -246,7 +285,7 @@ void add_matmul_optimized_node(
graph.hashed_layout_of(mat1_W_packed),
graph.hashed_layout_of(mat2_packed)},
// Resize Args
{mat2_is_transposed},
{mat2_is_transposed, mat1_W_packed},
// Resizing Logic
resize_matmul_node));
}
Expand Down
Loading
Loading