Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions backends/vulkan/op_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,8 +528,6 @@ def register_view_op(features: OpFeatures):
exir_ops.edge.aten.index_select.default,
exir_ops.edge.aten.select_copy.int,
# Tensor combination
exir_ops.edge.aten.split_with_sizes_copy.default,
exir_ops.edge.aten.split.Tensor,
exir_ops.edge.aten.repeat.default,
# Tensor creation
exir_ops.edge.aten.arange.start_step,
Expand Down Expand Up @@ -563,6 +561,8 @@ def register_ported_op(features: OpFeatures):
exir_ops.edge.aten.permute_copy.default,
# Tensor combination
exir_ops.edge.aten.cat.default,
exir_ops.edge.aten.split_with_sizes_copy.default,
exir_ops.edge.aten.split.Tensor,
]
)
def register_ported_op_all_packed_dims(features: OpFeatures):
Expand Down
20 changes: 14 additions & 6 deletions backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -35,21 +35,29 @@ const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

${layout_declare_spec_const(C, "int", "batch_index_function", "0")}

void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);

if (any(greaterThanEqual(pos, range))) {
return;
}

const ivec3 in_pos = pos + src_offset.xyz;
ivec3 in_pos = pos + src_offset.xyz;
ivec3 out_pos = pos + dst_offset.xyz;

// If source channel size is specified compose output z based on channel and batch index
if (src_offset.w > 0) {
const int channel_index = in_pos.z % src_offset.w;
const int batch_index = in_pos.z / src_offset.w;
out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w;
if (batch_index_function == 1) {
// batch index is calculated using source channel size
const int channel_index = pos.z % src_offset.w;
const int batch_index = pos.z / src_offset.w;
out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w;
} else if (batch_index_function == 2) {
// batch index is calculated using destination channel size
const int channel_index = pos.z % dst_offset.w;
const int batch_index = pos.z / dst_offset.w;
in_pos.z = channel_index + src_offset.z + batch_index * src_offset.w;
}
}

write_texel_lpos(
Expand Down
48 changes: 41 additions & 7 deletions backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -44,23 +44,57 @@ void main() {
return;
}

// Starting offset to write at within a texel
const int out_lane_offset = dst_offset[packed_dim] & 0x3;
const bool has_lane_offset = out_lane_offset != 0;

// Position in input tensor
const ivec3 in_pos = pos + src_offset.xyz;
ivec3 in_pos = pos + src_offset.xyz;
in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2);

// Read input value mapping to this output texel
const VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map);
VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map);

// Starting offset to read from a texel
const int src_lane_offset = src_offset[packed_dim] & 0x3;
const bool has_src_lane_offset = src_lane_offset != 0;

// If input lane offset is non zero i.e packed texel is composed from multiple sources
if (has_src_lane_offset) {
// Boundary values will come from next input texel in the packed dim.
ivec3 next_in_pos = in_pos;
next_in_pos[packed_dim] = in_pos[packed_dim] + 1;
VEC4_T next_value = load_texel_lpos(t_in, next_in_pos, in_axis_map);

// Keep input values from the end of current input pixel based on src_lane_offset
// offset 1 means the first lane of current input texel is not a part of the output texel
// offset 2 means first 2 lanes are not and so on
if (src_lane_offset == 1) {
in_value.xyz = in_value.yzw;
} else if (src_lane_offset == 2) {
in_value.xy = in_value.zw;
} else {
in_value.x = in_value.w;
}
// Copy next texel's values towards the end of input texel, based on lane offset
// offset 1 means the first lane from next texel is part of the input texel
// offset 2 means first 2 lanes from next texel is part of the input texel and so on
if (src_lane_offset == 1) {
in_value.w = next_value.x;
} else if (src_lane_offset == 2) {
in_value.zw = next_value.xy;
} else {
in_value.yzw = next_value.xyz;
}
}

// Starting offset to write at within a texel
const int out_lane_offset = dst_offset[packed_dim] & 0x3;
const bool has_dst_lane_offset = out_lane_offset != 0;

ivec3 out_pos = pos + dst_offset.xyz;
out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2);

VEC4_T out_value;

// If lane offset is non zero i.e packed texel is composed from multiple sources
if (has_lane_offset) {
if (has_dst_lane_offset) {
// When position in packed dim is > 0
if (pos[packed_dim] > 0) {
// Boundary values will come from previous input texel in the packed dim.
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/impl/Cat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ void add_cat_default_node(
// concatenating channels
src_offset[3] = is_concat_channel ? in_channel_size : 0;
add_copy_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
graph, input_ref, range, src_offset, dst_offset, out, true, false);
dst_offset[dim_xyz_index] +=
is_concat_channel ? in_channel_size : range[dim_xyz_index];
}
Expand Down
41 changes: 33 additions & 8 deletions backends/vulkan/runtime/graph/ops/impl/Copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ void add_copy_offset_node(
const ivec3& range,
const ivec4& src_offset,
const ivec4& dst_offset,
const ValueRef out) {
const ValueRef out,
bool calc_out_pos_using_src_chnl,
bool calc_in_pos_using_dst_chnl) {
vTensorPtr t_in = graph.get_tensor(in);
vTensorPtr t_out = graph.get_tensor(out);

Expand All @@ -49,7 +51,11 @@ void add_copy_offset_node(
// Parameter buffers
{},
// Specialization Constants
{graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
{graph.hashed_layout_of(out),
graph.hashed_layout_of(in),
(calc_out_pos_using_src_chnl ? 1
: calc_in_pos_using_dst_chnl ? 2
: 0)},
nullptr,
{},
{
Expand Down Expand Up @@ -86,19 +92,37 @@ void add_copy_packed_dim_offset_node(
ivec4 final_range = {
range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
ivec3 global_wg_size = t_out->logical_limits();
// The starting offset in a texel where this tensor will start copying from
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
// The starting offset in a texel where this tensor will start copying to
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;

// The total packed texels this tensor will be copied from
// The first texel of tensor data in the packed dimension will be copied from
// the remaining lanes of the current source. Hence, (4 - src_lane_offset) is
// added to the tensor size in the packed dimension.
const auto src_packed_size = utils::div_up_4(
(4 - src_lane_offset) +
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));

// The total packed texels this tensor will be copied to
// The first texel of tensor data in packed dimension will be copied to remain
// lanes from previous write Hence (4 - dst_lane_offset) is added to tensor
// size in packed dimension
// The first texel of tensor data in the packed dimension will be copied to
// the remaining lanes from the previous write. Hence, (4 - dst_lane_offset) is
// added to the tensor size in the packed dimension.
const auto dst_packed_size = utils::div_up_4(
(4 - dst_lane_offset) +
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));

// If the starting offset is not 0, and the total packed texels is greater
// If the starting src offset is not 0, and the total packed texels is greater
// than the source texel range
const bool has_additional_src_work =
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
// If the starting dst offset is not 0, and the total packed texels is greater
// than the source texel range
if (dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]) {
const bool has_additional_dst_work =
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];

if (has_additional_src_work || has_additional_dst_work) {
global_wg_size[packed_dim]++; // Increase the global work group size in
// packed dimension
final_range[packed_dim]++; // Increase the range in packed dimension
Expand Down Expand Up @@ -256,7 +280,8 @@ void add_copy_offset_node(
ivec4 src_offset = {src[0], src[1], src[2], 0};
ivec4 dst_offset = {dst[0], dst[1], dst[2], 0};

add_copy_offset_node(graph, in, range, src_offset, dst_offset, out);
add_copy_offset_node(
graph, in, range, src_offset, dst_offset, out, false, false);
}

void copy_offset(ComputeGraph& graph, const std::vector<ValueRef>& args) {
Expand Down
17 changes: 16 additions & 1 deletion backends/vulkan/runtime/graph/ops/impl/Copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,28 @@ namespace vkcompute {
// It is possible to have input and output to point to the same image
// object. But when the source range and destination range overlap, the behavior
// is undefined.
//
// boolean flags calc_out_pos_using_src_chnl and calc_in_pos_using_dst_chnl
// can be used to specify an indexing function in the shader
// If calc_out_pos_using_src_chnl is set to true, the channel and batch index
// will be calculated based on the source channel size and used to determine
// the destination texel position.
//
// If calc_in_pos_using_dst_chnl is set to true, the channel and batch index
// will be calculated based on the destination channel size and used to
// determine the source texel position.
//
// If both are true calc_out_pos_using_src_chnl is picked. If both are false no
// index calculation happens.
void add_copy_offset_node(
ComputeGraph& graph,
const ValueRef in,
const utils::ivec3& range,
const utils::ivec4& src_offset,
const utils::ivec4& dst_offset,
const ValueRef out);
const ValueRef out,
bool calc_out_pos_using_src_chnl,
bool calc_in_pos_using_dst_chnl);

// add_copy_packed_dim_offset_node behaves similar to add_copy_node, except that
// its used when copying packed dimension, if tensor is width or height packed.
Expand Down
9 changes: 5 additions & 4 deletions backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,8 @@ void add_repeat_node(
utils::ivec4 src_offset{0, 0, 0, 0};
utils::ivec4 dst_offset{0, 0, 0, 0};

add_copy_offset_node(graph, in, running_range, src_offset, dst_offset, out);
add_copy_offset_node(
graph, in, running_range, src_offset, dst_offset, out, false, false);

} else {
add_repeat_channel_node(graph, in, channel_repeat, out, running_range);
Expand All @@ -166,7 +167,7 @@ void add_repeat_node(
utils::ivec4 dst_offset{i * dim_at<kWidth4D>(in_sizes), 0, 0, 0};

add_copy_offset_node(
graph, out, running_range, src_offset, dst_offset, out);
graph, out, running_range, src_offset, dst_offset, out, true, false);
}

running_range[0] = running_range[0] * width_repeat;
Expand All @@ -180,7 +181,7 @@ void add_repeat_node(
utils::ivec4 dst_offset = {0, i * dim_at<kHeight4D>(in_sizes), 0, 0};

add_copy_offset_node(
graph, out, running_range, src_offset, dst_offset, out);
graph, out, running_range, src_offset, dst_offset, out, true, false);
}

running_range[1] = running_range[1] * height_repeat;
Expand All @@ -194,7 +195,7 @@ void add_repeat_node(
utils::ivec4 dst_offset = {0, 0, i * running_range[2], 0};

add_copy_offset_node(
graph, out, running_range, src_offset, dst_offset, out);
graph, out, running_range, src_offset, dst_offset, out, true, false);
}

running_range[2] = running_range[2] * batch_repeat;
Expand Down
91 changes: 45 additions & 46 deletions backends/vulkan/runtime/graph/ops/impl/Split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@ void add_split_with_sizes_default_node(
ValueRef out_list_ref) {
vTensorPtr t_in = graph.get_tensor(in);

VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));

ValueListPtr out_list = graph.get_value_list(out_list_ref);

DimIndex dim_index = normalize_to_dim_index(*t_in, dim);
Expand All @@ -38,59 +36,60 @@ void add_split_with_sizes_default_node(
ValueRef out_ref = (*out_list)[split_idx];

vTensorPtr t_out = graph.get_tensor(out_ref);
VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim));
VK_CHECK_COND(dim_at(*t_out, dim_index) == split_size);
}

if (dim_index == kWidth4D) {
utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);

for (ValueRef out_ref : *out_list) {
// Doesn't need to use split_size since we have already verified that the
// output tensor's size matches with the split_size.
vTensorPtr t_out = graph.get_tensor(out_ref);
utils::ivec3 range = t_out->logical_limits();
add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref);
const auto packed_dim = t_in->packed_dim();
const auto packed_dim_index = static_cast<DimIndex>(kWidth4D - packed_dim);

src_offset[0] += range[0];
}
} else if (dim_index == kHeight4D) {
utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
// Index of dimension to be concatenated in (w, h, c * b) coordinate system
const auto dim_xyz_index = std::min(2, -dim_index - 1);

for (ValueRef out_ref : *out_list) {
vTensorPtr t_out = graph.get_tensor(out_ref);
utils::ivec3 range = t_out->logical_limits();
add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref);
utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);

src_offset[1] += range[1];
}
} else if (dim_index == kBatch4D) {
utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
const bool is_splitting_channel = (dim_index == kChannel4D);

for (ValueRef out_ref : *out_list) {
vTensorPtr t_out = graph.get_tensor(out_ref);
utils::ivec3 range = t_out->logical_limits();
add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref);
// if splitting channels
if (is_splitting_channel) {
// set source offset w as channel size of the input tensor
src_offset[3] = dim_at(t_in->sizes(), kChannel4D);
}

src_offset[2] += range[2];
}
} else if (dim_index == kChannel4D) {
int32_t src_offset = 0;
int32_t dst_offset = 0;

for (ValueRef out_ref : *out_list) {
vTensorPtr t_out = graph.get_tensor(out_ref);
int32_t range = dim_at<kChannel4D>(t_out->sizes());
add_copy_channel_offset_node(
graph, in, range, src_offset, dst_offset, out_ref);
src_offset += range;
for (ValueRef out_ref : *out_list) {
// Doesn't need to use split_size since we have already verified that the
// output tensor's size matches with the split_size.
vTensorPtr t_out = graph.get_tensor(out_ref);
const auto out_channel_size = dim_at(t_out->sizes(), kChannel4D);
utils::ivec3 range = t_out->logical_limits();

if (dim_index == packed_dim_index) {
// if splitting channels, use add_copy_channel_offset_node function as
// add_copy_packed_dim_offset_node does not support channel packing
if (is_splitting_channel) {
add_copy_channel_offset_node(
graph, in, out_channel_size, src_offset[2], dst_offset[2], out_ref);
src_offset[dim_xyz_index] += out_channel_size;
} else {
// dst_offset[3] is not used now but will be used in the future when
// add_copy_packed_dim_offset_node will support channel packing
//
// set destination offset w as channel size of the output tensor if
// splitting channel
dst_offset[3] = is_splitting_channel ? out_channel_size : 0;
add_copy_packed_dim_offset_node(
graph, in, range, src_offset, dst_offset, out_ref);
src_offset[dim_xyz_index] += dim_at(t_out->sizes(), packed_dim_index);
}
} else {
// set destination offset w as channel size of the output tensor if
// splitting channels
dst_offset[3] = is_splitting_channel ? out_channel_size : 0;
add_copy_offset_node(
graph, in, range, src_offset, dst_offset, out_ref, false, true);
src_offset[dim_xyz_index] +=
is_splitting_channel ? out_channel_size : range[dim_xyz_index];
}

} else {
VK_THROW("not ipmlemented");
}
}

Expand Down
Loading
Loading