Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ void main() {
div_by_x % out_limits.y,
div_by_x / out_limits.y);

if (any(greaterThanEqual(pos, out_limits))) {
if (pos.z >= out_limits.z) {
return;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ void main() {
pos.y *= BATCH_SIZE_Y;

// do not process if the z position is outside the output range
if (any(greaterThanEqual(pos, out_limits))) {
if (pos.z >= out_limits.z) {
return;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ void main() {
div_by_x % out_limits.y,
div_by_x / out_limits.y);

if (any(greaterThanEqual(pos, out_limits))) {
if (pos.z >= out_limits.z) {
return;
}

Expand Down
53 changes: 31 additions & 22 deletions backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@

#define VEC4_T ${texel_type(DTYPE)}

#define TILE_SIZE ${TILE_SIZE}
#define TILE_SIZE_X ${TILE_SIZE_X}
#define TILE_SIZE_Y ${TILE_SIZE_Y}

#define op(X, A, B) ${OPERATOR}

Expand All @@ -24,17 +25,26 @@ ${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
${layout_declare_ubo(4, "ivec3", "out_limits")}
${layout_declare_ubo(5, "ivec4", "in_sizes")}
${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}

// Push-constant block for the pointwise conv2d shader. The field order and
// sizes must line up with the push-constant entries that add_conv2d_node
// (Convolution.cpp) passes to DispatchNode for Conv2dMethod::Pointwise.
layout(push_constant) uniform restrict Block {
// Logical limits of the output; declared as ivec4, main() uses the xyz components.
ivec4 out_limits;
// Sizes of the input tensor (host side: graph.sizes_pc_of(in)).
ivec4 in_sizes;
// kernel_size and stride are pushed from the host as a single ivec4
// {kernel_size.x, kernel_size.y, stride.x, stride.y}.
ivec2 kernel_size;
ivec2 stride;
// padding and dilation are pushed from the host as a single ivec4
// {padding.x, padding.y, dilation.x, dilation.y}.
ivec2 padding;
ivec2 dilation;
// overlay_region and in_group_size come from the host's extra_params, which is
// pushed with an explicit sizeof(ivec4) argument.
ivec2 overlay_region;
int in_group_size;
// NOTE(review): presumably fills the remaining 4 bytes of that ivec4-sized
// entry so out_min starts on the next 16-byte slot — confirm against
// PushConstantDataInfo.
int dummy_padding;
// Passed to op(X, A, B) when the result is written to t_out (host side: out_params).
float out_min;
float out_max;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Shared memory to hold the calculated positions; this reduces register usage, thus improving performance.
// 64 is the number of threads in the local wg
$num_shared = 64 * TILE_SIZE * TILE_SIZE
$num_shared = 64 * TILE_SIZE_X * TILE_SIZE_Y
shared ivec2 pos_shared[${num_shared}];

/*
Expand All @@ -43,8 +53,8 @@ shared ivec2 pos_shared[${num_shared}];
* size is only 1x1, making it easier to re-use loaded texels from t_kernel.
*/
void main() {
const ivec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE;
const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
const ivec2 out_limits_scaled = (out_limits.xy + ivec2(TILE_SIZE_X - 1, TILE_SIZE_Y - 1)) / ivec2(TILE_SIZE_X, TILE_SIZE_Y);
const uint shared_mem_stride = 64;

const uint div_by_x = gl_GlobalInvocationID.x / out_limits_scaled.x;
const ivec3 gpos = ivec3(
Expand All @@ -58,33 +68,32 @@ void main() {
// +--------+--------+
// | pos[2] | pos[3] |
// +--------+--------+
ivec2 pos[TILE_SIZE * TILE_SIZE];
for (int y = 0, i = 0; y < TILE_SIZE; ++y) {
for (int x = 0; x < TILE_SIZE; ++x) {
pos[i] = ivec2(
gpos.x * TILE_SIZE + x, gpos.y * TILE_SIZE + y);
ivec2 pos[TILE_SIZE_X * TILE_SIZE_Y];
for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) {
for (int x = 0; x < TILE_SIZE_X; ++x) {
pos[i] = ivec2(gpos.x * TILE_SIZE_X + x, gpos.y * TILE_SIZE_Y + y);
pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex] = pos[i];
i++;
}
}

// If the z position is out of bounds, then this invocation will have no work
// to do. The x/y bounds are checked per position before the final store.
if (any(greaterThanEqual(ivec3(pos[0], gpos.z), out_limits))) {
if (gpos.z >= out_limits.z) {
return;
}

// Compute the index of the input texture that needs to be loaded for each
// output position. Note that negative indices can be produced indicating that
// the top-left element is in a region added by padding.
ivec2 ipos[TILE_SIZE * TILE_SIZE];
for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
ivec2 ipos[TILE_SIZE_X * TILE_SIZE_Y];
for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
ipos[i] = pos[i] * stride - padding;
}

vec4 sum[TILE_SIZE * TILE_SIZE];
vec4 sum[TILE_SIZE_X * TILE_SIZE_Y];
sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
for (int i = 1; i < TILE_SIZE * TILE_SIZE; ++i) {
for (int i = 1; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
sum[i] = sum[0];
}

Expand All @@ -100,7 +109,7 @@ void main() {
const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0));

#pragma unroll
for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0);
// For 2x2 tile size algorithm works as follows.
// To explain the calculations below, the contents of one in_tex and the
Expand Down Expand Up @@ -142,9 +151,9 @@ void main() {
}
}

for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
const ivec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
if (all(lessThan(ivec3(pos, gpos.z), out_limits))) {
if (all(lessThan(ivec3(pos, gpos.z), out_limits.xyz))) {
imageStore(t_out, ivec3(pos, gpos.z), op(sum[i], out_min, out_max));
}
}
Expand Down
3 changes: 2 additions & 1 deletion backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ conv2d_pw:
OPERATOR: X
NDIM: 3
DTYPE: float
TILE_SIZE: 2
TILE_SIZE_X: 2
TILE_SIZE_Y: 2
generate_variant_forall:
DTYPE:
- VALUE: half
Expand Down
83 changes: 62 additions & 21 deletions backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -407,27 +407,68 @@ void add_conv2d_node(
wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1};
}

graph.execute_nodes().emplace_back(new DispatchNode(
graph,
shader,
wg_size,
graph.create_local_wg_size(wg_size),
// Inputs and Outputs
{{out, vkapi::MemoryAccessType::WRITE},
{{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
// Shader params buffers
{
t_out->logical_limits_ubo(),
t_in->sizes_ubo(),
graph.create_params_buffer(kernel_params),
graph.create_params_buffer(extra_params),
graph.create_params_buffer(out_params),
},
// Specialization Constants
{},
// Resizing Logic
resize_conv2d_node,
{weight_data, stride, padding, dilation, transposed, output_padding}));
if (method == Conv2dMethod::Pointwise) {
const utils::ivec4 kernel_param_size_stride = {
kernel_params.kernel_size[0],
kernel_params.kernel_size[1],
kernel_params.stride[0],
kernel_params.stride[1]};

const utils::ivec4 kernel_param_pad_dial = {
kernel_params.padding[0],
kernel_params.padding[1],
kernel_params.dilation[0],
kernel_params.dilation[1]};

graph.execute_nodes().emplace_back(new DispatchNode(
graph,
shader,
wg_size,
graph.create_local_wg_size(wg_size),
// Inputs and Outputs
{{out, vkapi::MemoryAccessType::WRITE},
{{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
// Shader params buffers
{},
// Specialization Constants
{},
// Resizing Logic
resize_conv2d_node,
{weight_data, stride, padding, dilation, transposed, output_padding},
{
graph.logical_limits_pc_of(out),
graph.sizes_pc_of(in),
PushConstantDataInfo(
&kernel_param_size_stride, sizeof(kernel_param_size_stride)),
PushConstantDataInfo(
&kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
PushConstantDataInfo(
&extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
PushConstantDataInfo(&out_params, sizeof(out_params)),
}));
} else {
graph.execute_nodes().emplace_back(new DispatchNode(
graph,
shader,
wg_size,
graph.create_local_wg_size(wg_size),
// Inputs and Outputs
{{out, vkapi::MemoryAccessType::WRITE},
{{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
// Shader params buffers
{
t_out->logical_limits_ubo(),
t_in->sizes_ubo(),
graph.create_params_buffer(kernel_params),
graph.create_params_buffer(extra_params),
graph.create_params_buffer(out_params),
},
// Specialization Constants
{},
// Resizing Logic
resize_conv2d_node,
{weight_data, stride, padding, dilation, transposed, output_padding}));
}
}

void add_conv1d_node(
Expand Down
Loading