Merged · Changes from 4 commits
backends/vulkan/op_registry.py (3 changes: 1 addition & 2 deletions)
@@ -527,8 +527,6 @@ def register_view_op(features: OpFeatures):
         exir_ops.edge.aten.flip.default,
         exir_ops.edge.aten.index_select.default,
         exir_ops.edge.aten.select_copy.int,
-        # Tensor combination
-        exir_ops.edge.aten.repeat.default,
         # Tensor creation
         exir_ops.edge.aten.arange.start_step,
         exir_ops.edge.aten.clone.default,
@@ -561,6 +559,7 @@ def register_ported_op(features: OpFeatures):
         exir_ops.edge.aten.permute_copy.default,
         # Tensor combination
         exir_ops.edge.aten.cat.default,
+        exir_ops.edge.aten.repeat.default,
         exir_ops.edge.aten.split_with_sizes_copy.default,
         exir_ops.edge.aten.split.Tensor,
     ]
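This moves `exir_ops.edge.aten.repeat.default` out of the view-op registration and into the ported-op list, alongside the other tensor-combination ops (`cat`, `split_with_sizes_copy`, `split`) that have dedicated Vulkan kernels. For reference, the semantics the backend must reproduce are those of `torch.Tensor.repeat`; a minimal sketch below uses one of the test shapes from `cases.py` further down (`reference_repeat` is an illustrative name, not a codebase helper):

```python
import torch

# Illustrative sketch: torch.Tensor.repeat tiles the tensor along each
# dimension; when len(repeats) > x.dim(), leading singleton dimensions
# are added first, so a 2-D input can take a 3-element repeat list.
def reference_repeat(x: torch.Tensor, repeats: list) -> torch.Tensor:
    return x.repeat(*repeats)

x = torch.arange(6).reshape(2, 3)
y = reference_repeat(x, [3, 1, 4])  # (2, 3) -> (3, 2, 12)

assert y.shape == (3, 2, 12)
assert torch.equal(y[0, :, :3], x)  # each tile is a copy of the input
```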
backends/vulkan/runtime/graph/ops/impl/Repeat.cpp (147 changes: 14 additions & 133 deletions)
@@ -23,8 +23,7 @@ void check_args(
     const api::vTensor& in,
     const std::vector<int64_t>& repeats,
     const api::vTensor& out) {
-  VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim));
-  VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim));
+  VK_CHECK_COND(check_same_packed_dim(in, out));
 
   VK_CHECK_COND(in.storage_type() == out.storage_type());
   if (in.storage_type() == utils::kTexture2D) {
@@ -59,147 +58,29 @@ void check_args(
 
 } // namespace
 
-void add_repeat_channel_node(
-    ComputeGraph& graph,
-    ValueRef in,
-    int64_t repeat_channel,
-    ValueRef out,
-    utils::ivec3& running_range) {
-  vTensorPtr t_in = graph.get_tensor(in);
-  vTensorPtr t_out = graph.get_tensor(out);
-
-  std::string kernel_name = "repeat_channel";
-  kernel_name.reserve(kShaderNameReserve);
-  add_dtype_suffix(kernel_name, *t_out);
-
-  const std::vector<int64_t>& in_sizes = t_in->sizes();
-
-  int32_t in_width = utils::safe_downcast<int32_t>(dim_at<kWidth4D>(in_sizes));
-  int32_t in_height =
-      utils::safe_downcast<int32_t>(dim_at<kHeight4D>(in_sizes));
-  int32_t in_channel =
-      utils::safe_downcast<int32_t>(dim_at<kChannel4D>(in_sizes));
-  int32_t in_batch = utils::safe_downcast<int32_t>(dim_at<kBatch4D>(in_sizes));
-
-  int32_t out_channel = repeat_channel * in_channel;
-
-  utils::ivec4 out_whcn_sizes{in_width, in_height, out_channel, in_batch};
-
-  utils::ivec4 in_whcn_sizes{in_width, in_height, in_channel, in_batch};
-
-  // Channel packed global work ids
-  running_range[2] = out_whcn_sizes[3] * utils::div_up_4(out_whcn_sizes[2]);
-  utils::uvec3 global_size = utils::make_uvec3(running_range);
-  utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
-  const struct Block final {
-    utils::ivec4 out_sizes;
-    utils::ivec4 in_size;
-  } repeat_channel_args{
-      out_whcn_sizes,
-      in_whcn_sizes,
-  };
-
-  auto shader = VK_KERNEL_FROM_STR(kernel_name);
-
-  graph.execute_nodes().emplace_back(new DispatchNode(
-      graph,
-      VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
-      // Inputs and Outputs
-      {{out, vkapi::MemoryAccessType::WRITE},
-       {in, vkapi::MemoryAccessType::READ}},
-      // Parameter buffers
-      {graph.create_params_buffer(repeat_channel_args)},
-      // Specialization Constants
-      {SV(t_out->packed_dim())}));
-}
-
 void add_repeat_node(
     ComputeGraph& graph,
     ValueRef in,
     ValueRef repeats_ref,
     ValueRef out) {
-  std::vector<int64_t> repeats = *(graph.get_int_list(repeats_ref));
+  const std::vector<int64_t> repeats = *(graph.get_int_list(repeats_ref));
 
   vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
   check_args(*t_in, repeats, *t_out);
 
-  // In this function, we expand the dimensions in the following order:
-  // 1. Channel
-  // 2. Width
-  // 3. Height
-  // 4. Batch
-  // After expanding a dimension, we will update the "running_range" since we
-  // will need to copy the "expanded" area.
-
-  utils::ivec3 running_range = t_in->logical_limits();
-
-  const std::vector<int64_t>& in_sizes = t_in->sizes();
-
-  // Since we use channel packing, repeating the channel dimension is the most
-  // complicated and time-consuming, as we need to reason over misaligned
-  // channels. Hence we expand it first to minimize cost. Also, in this first
-  // dimension, we copy over the input texure to the output. In subsequent
-  // dimensions, we read and write from the same tensor.
-
-  if (int64_t channel_repeat = dim_at<kChannel4D>(repeats);
-      channel_repeat == 1) {
-    // If no repeat, short-cut to a direct copy
-    utils::ivec4 src_offset{0, 0, 0, 0};
-    utils::ivec4 dst_offset{0, 0, 0, 0};
-
-    add_copy_offset_node(
-        graph, in, running_range, src_offset, dst_offset, out, false, false);
-
-  } else {
-    add_repeat_channel_node(graph, in, channel_repeat, out, running_range);
-  }
-
-  // TODO: refactor width, height, and batch into a common helper function.
-  // Width
-  if (int64_t width_repeat = dim_at<kWidth4D>(repeats); width_repeat > 1) {
-    utils::ivec4 src_offset{0, 0, 0, 0};
-
-    for (int i = 1; i < width_repeat; ++i) {
-      utils::ivec4 dst_offset{i * dim_at<kWidth4D>(in_sizes), 0, 0, 0};
-
-      add_copy_offset_node(
-          graph, out, running_range, src_offset, dst_offset, out, true, false);
-    }
-
-    running_range[0] = running_range[0] * width_repeat;
-  }
-
-  // Height
-  if (int64_t height_repeat = dim_at<kHeight4D>(repeats); height_repeat > 1) {
-    utils::ivec4 src_offset{0, 0, 0, 0};
-
-    for (int i = 1; i < height_repeat; ++i) {
-      utils::ivec4 dst_offset = {0, i * dim_at<kHeight4D>(in_sizes), 0, 0};
-
-      add_copy_offset_node(
-          graph, out, running_range, src_offset, dst_offset, out, true, false);
-    }
-
-    running_range[1] = running_range[1] * height_repeat;
-  }
-
-  // Batch
-  if (int64_t batch_repeat = dim_at<kBatch4D>(repeats); batch_repeat > 1) {
-    utils::ivec4 src_offset{0, 0, 0, 0};
-
-    for (int i = 1; i < batch_repeat; ++i) {
-      utils::ivec4 dst_offset = {0, 0, i * running_range[2], 0};
-
-      add_copy_offset_node(
-          graph, out, running_range, src_offset, dst_offset, out, true, false);
-    }
-
-    running_range[2] = running_range[2] * batch_repeat;
-  }
+  const utils::ivec4 src_offset{
+      dim_at<kWidth4D>(t_in->sizes()),
+      dim_at<kHeight4D>(t_in->sizes()),
+      dim_at<kChannel4D>(t_in->sizes()),
+      dim_at<kBatch4D>(t_in->sizes())};
+  const utils::ivec4 dst_offset{
+      dim_at<kWidth4D>(repeats),
+      dim_at<kHeight4D>(repeats),
+      dim_at<kChannel4D>(repeats),
+      dim_at<kBatch4D>(repeats)};
+  add_copy_packed_dim_offset_node(
+      graph, in, t_out->logical_limits(), src_offset, dst_offset, out, true);
 }
 
 void repeat(ComputeGraph& graph, const std::vector<ValueRef>& args) {
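The net effect of this rewrite: the deleted path expanded one dimension at a time (channel first, since channel-packed texels can be misaligned, then width, height, and batch), growing a `running_range` and issuing a separate copy dispatch per repeated dimension. The new path packs the input's WHCN extents into `src_offset` and the per-dimension repeat counts into `dst_offset`, then delegates everything to a single `add_copy_packed_dim_offset_node` dispatch; handling the packed dimension generically is also what lets `check_args` relax from requiring channels-packing to only requiring matching packed dims. Element-wise, repeat is just a per-dimension modulo from output coordinates back to input coordinates. A minimal pure-Python model of that mapping (names here are illustrative, not from the codebase):

```python
# Sketch of the index arithmetic a single-pass repeat must implement:
# every WHCN output coordinate wraps back into the input via modulo.
def repeat_index_map(out_coord, in_sizes_whcn):
    return tuple(c % s for c, s in zip(out_coord, in_sizes_whcn))

in_sizes = (3, 2, 1, 1)  # W, H, C, N extents of the input
repeats = (4, 1, 1, 1)   # per-dimension repeat counts
out_sizes = tuple(s * r for s, r in zip(in_sizes, repeats))

assert out_sizes == (12, 2, 1, 1)
# Output column 5 reads from input column 5 % 3 == 2.
assert repeat_index_map((5, 1, 0, 0), in_sizes) == (2, 1, 0, 0)
```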
backends/vulkan/test/op_tests/cases.py (12 changes: 10 additions & 2 deletions)
@@ -754,7 +754,11 @@ def get_repeat_inputs():
             ((2, 3), [3, 1, 4]),
         ]
     )
-    test_suite_2d.layouts = ["utils::kChannelsPacked"]
+    test_suite_2d.layouts = [
+        "utils::kWidthPacked",
+        "utils::kHeightPacked",
+        "utils::kChannelsPacked",
+    ]
     test_suite_2d.storage_types = ["utils::kTexture2D"]
     test_suite_2d.data_gen = "make_seq_tensor"
     test_suite_2d.dtypes = ["at::kFloat"]
@@ -795,7 +799,11 @@ def get_repeat_inputs():
             ((2, 3), [3, 3, 2, 4]),
         ]
     )
-    test_suite_3d.layouts = ["utils::kChannelsPacked"]
+    test_suite_3d.layouts = [
+        "utils::kWidthPacked",
+        "utils::kHeightPacked",
+        "utils::kChannelsPacked",
+    ]
     test_suite_3d.storage_types = ["utils::kTexture3D"]
     test_suite_3d.data_gen = "make_seq_tensor"
     test_suite_3d.dtypes = ["at::kFloat"]
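Relaxing the packed-dim check in `Repeat.cpp` is what makes the widened `layouts` lists above meaningful: each suite now exercises width-, height-, and channels-packed layouts instead of channels-packed only. A rough sketch of how this multiplies the generated test matrix, assuming one generated test per (case, layout) combination (generator details are simplified here):

```python
from itertools import product

# Simplified: three layouts instead of one triples the generated tests
# for each (shape, repeats) case in the suite.
cases_3d = [((2, 3), [3, 3, 2, 4])]  # one (shape, repeats) case from above
layouts = [
    "utils::kWidthPacked",
    "utils::kHeightPacked",
    "utils::kChannelsPacked",
]

print(len(list(product(cases_3d, layouts))))  # 3 combinations, was 1
```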