Skip to content

Commit c43ffb9

Browse files
pytorchbot authored and hinriksnaer committed
[ET-VK] New Implementation of `permute` operator (pytorch#11971)
## Changes

* Introduce `permute_buffer.glsl` and `permute_texture.glsl` compute shader templates to implement the permute operator.

## Motivation

The existing implementation of permute produced incorrect outputs for width-packed textures. Furthermore, there was no buffer implementation for the permute operator. My goal with this diff is to introduce a more flexible implementation of permute that works for any tensor representation.

## Performance impact

None expected.

Differential Revision: [D76483755](https://our.internmc.facebook.com/intern/diff/D76483755/)
1 parent 25d460b commit c43ffb9

File tree

8 files changed

+253
-124
lines changed

8 files changed

+253
-124
lines changed

backends/vulkan/runtime/graph/ops/glsl/permute.glsl

Lines changed: 0 additions & 89 deletions
This file was deleted.
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
#define PRECISION ${PRECISION}
12+
13+
#define VEC4_T ${texel_type(DTYPE)}
14+
#define T ${buffer_scalar_type(DTYPE)}
15+
16+
${define_active_storage_type("buffer")}
17+
${define_required_extensions(DTYPE)}
18+
19+
layout(std430) buffer;
20+
21+
#include "indexing_utils.h"
22+
23+
${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
24+
${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
25+
26+
${layout_declare_ubo(B, "ivec4", "in_sizes")}
27+
${layout_declare_ubo(B, "ivec4", "out_strides")}
28+
${layout_declare_ubo(B, "int", "out_numel")}
29+
30+
// Push-constant block for the buffer permute shader. The member order here
// must match whatever the host pushes; verify against the dispatch code,
// which is not visible in this file.
layout(push_constant) uniform restrict Block {
31+
// Strides handed to tidx_to_bufi() in main() to turn the permuted tensor
// index into an index into t_in.
ivec4 in_strides;
32+
// Each component is used as a dynamic index into an ivec4 by
// out_tidx_to_in_tidx(), so every value must lie in [0, 3].
ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j
33+
};
34+
35+
${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
36+
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
37+
38+
const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
39+
40+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
41+
42+
// Maps an output tensor index to the input tensor index it is sourced from.
//
// For each output dim `d`, the coordinate out_tidx[d] is scattered into slot
// permute_dims[d] of the result. Dims are visited in x, y, z, w order, so if
// permute_dims were to repeat a slot the last write would win; slots that
// permute_dims never names are left unwritten.
ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
  ivec4 src_tidx;

  for (int d = 0; d < 4; ++d) {
    src_tidx[permute_dims[d]] = out_tidx[d];
  }

  return src_tidx;
}
54+
55+
// Buffer permute entry point: each invocation copies one element.
//
// The x component of the global invocation id is the output buffer index.
// Invocations at or past out_numel do nothing.
void main() {
  const int dst_bufi = int(gl_GlobalInvocationID.x);

  if (dst_bufi < out_numel) {
    // Output buffer index -> output tensor index -> input tensor index.
    const ivec4 dst_tidx = bufi_to_tidx(dst_bufi, out_strides, out_dim_order);
    const ivec4 src_tidx = out_tidx_to_in_tidx(dst_tidx);

    // Input tensor index -> input buffer index, then copy the element across.
    t_out[dst_bufi] = t_in[tidx_to_bufi(src_tidx, in_strides)];
  }
}
Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
permute:
1+
permute_buffer:
22
parameter_names_with_default_values:
33
DTYPE: float
4-
NDIM: 3
5-
STORAGE: texture3d
64
generate_variant_forall:
75
DTYPE:
86
- VALUE: half
97
- VALUE: float
108
- VALUE: int32
119
shader_variants:
12-
- NAME: permute
10+
- NAME: permute_buffer
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
#define PRECISION ${PRECISION}
12+
13+
#define VEC4_T ${texel_type(DTYPE)}
14+
#define T ${buffer_scalar_type(DTYPE)}
15+
16+
${define_active_storage_type("texture3d")}
17+
${define_required_extensions(DTYPE)}
18+
19+
layout(std430) buffer;
20+
21+
#include "indexing_utils.h"
22+
23+
${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
24+
${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}
25+
26+
// Push-constant block for the texture permute shader. The member order here
// must match whatever the host pushes; verify against the dispatch code,
// which is not visible in this file.
layout(push_constant) uniform restrict Block {
27+
// Used by main() both to derive the output tensor index from the invocation
// position and to reject invocations whose index falls outside the output.
ivec4 out_sizes;
28+
// Passed to tidx_to_pos() when locating the input texel to read.
ivec4 in_sizes;
29+
// Each component is used as a dynamic index into an ivec4 by
// out_tidx_to_in_tidx(), so every value must lie in [0, 3].
ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j
30+
};
31+
32+
${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
33+
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
34+
const lowp int out_packed_dim = unhash_packed_dim(out_layout);
35+
36+
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
37+
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
38+
const lowp int in_packed_dim = unhash_packed_dim(in_layout);
39+
40+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
41+
42+
// Translates an output tensor index into the input tensor index that
// supplies it.
//
// Output dim `dim` contributes its coordinate to input slot
// permute_dims[dim]. Dims are processed in x, y, z, w order; any slot that
// permute_dims does not name is left unwritten.
ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
  ivec4 permuted;

  for (int dim = 0; dim < 4; ++dim) {
    permuted[permute_dims[dim]] = out_tidx[dim];
  }

  return permuted;
}
54+
55+
// Reports whether whole texels can be copied from input to output unchanged.
//
// That is the case when the permutation keeps the packed dimension in place:
// the input dim feeding the output's packed dim is itself the input's packed
// dim, so the four lanes of an output texel are the four lanes of a single
// input texel.
bool can_use_fast_path() {
  const int src_dim_for_out_packed = permute_dims[out_packed_dim];
  return src_dim_for_out_packed == in_packed_dim;
}
65+
66+
// Texture permute entry point: each invocation produces one output texel.
//
// The invocation id is treated as a logical texel position in the output and
// converted to a tensor index via lpos_to_tidx(); invocations whose index
// falls outside out_sizes exit early.
//
// Fast path (packed dim preserved by the permutation): the matching input
// texel is loaded and written out as-is.
//
// Slow path: each of the up-to-4 lanes of the output texel is gathered from
// its own input texel. Lanes whose index along the packed dim lies past the
// end of the output are padding; they are skipped and keep the value 0, so no
// input position outside the input tensor is ever fetched for them.
void main() {
  const ivec3 lpos = ivec3(gl_GlobalInvocationID);
  ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim);

  if (any(greaterThanEqual(out_tidx, out_sizes))) {
    return;
  }

  if (can_use_fast_path()) {
    // Fast path: packed dimension is preserved, so we can copy texels directly
    ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
    ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
    VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));

    write_texel_lpos(t_out, lpos, in_texel, out_axis_map);
  }
  else {
    // Slow path: packed dimension is not preserved, so each element of the
    // output texel may be "sourced" from a different texel in the input tensor.
    // Therefore each output texel element is processed individually.
    VEC4_T out_texel = VEC4_T(0);

    for (int texel_i = 0; texel_i < 4; ++texel_i) {
      // When the packed-dim size is not a multiple of 4 the last texel is
      // only partially filled. The index only grows from here, so once it
      // leaves the tensor every remaining lane is padding: stop and leave
      // those lanes at zero rather than fetching out of bounds.
      if (out_tidx[out_packed_dim] >= out_sizes[out_packed_dim]) {
        break;
      }

      ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
      ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
      // Lane within the input texel that holds the wanted element.
      int element_idx = in_tidx[in_packed_dim] % 4;

      VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
      T selected_value = T(in_texel[element_idx]);

      out_texel[texel_i] = selected_value;

      // Advance to the next element along the output's packed dim.
      out_tidx[out_packed_dim]++;
    }

    write_texel_lpos(t_out, lpos, out_texel, out_axis_map);
  }
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Variant configuration for the permute_texture shader template.
permute_texture:
2+
# Template parameters and the value each takes when a variant does not
# override it.
parameter_names_with_default_values:
3+
DTYPE: float
4+
# DTYPE values to expand over; presumably one shader is generated per value
# listed here (confirm against the shader codegen script).
generate_variant_forall:
5+
DTYPE:
6+
- VALUE: half
7+
- VALUE: float
8+
- VALUE: int32
9+
# Base name(s) of the generated shaders.
shader_variants:
10+
- NAME: permute_texture3d

0 commit comments

Comments
 (0)