Skip to content

Commit 1785346

Browse files
pytorchbot and ssjia authored
[ET-VK] Conv2d quantize/dequantize ops for conv2d activations (#14611)
This PR was created by the merge bot to help merge the original PR into the main branch. ghstack PR number: #14330 by @SS-JIA ^ Please use this as the source of truth for the PR details, comments, and reviews ghstack PR base: https://github.com/pytorch/executorch/tree/gh/SS-JIA/330/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/SS-JIA/330/head Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/SS-JIA/331/orig Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/SS-JIA/330/orig Differential Revision: [D82542335](https://our.internmc.facebook.com/intern/diff/D82542335/) @diff-train-skip-merge Co-authored-by: ssjia <[email protected]>
1 parent 54f5ffe commit 1785346

13 files changed

+741
-13
lines changed

.github/workflows/pull.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,7 @@ jobs:
10091009
./cmake-out/backends/vulkan/test/custom_ops/q8csw_conv2d
10101010
./cmake-out/backends/vulkan/test/custom_ops/q4gsw_linear
10111011
./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row
1012+
./cmake-out/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations
10121013
10131014
# "Classic" Operator tests
10141015
PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build

backends/vulkan/runtime/graph/ops/glsl/common.glslh

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,30 @@ struct TensorIndex4D {
3333
ivec4 data;
3434
};
3535

36+
// Sign-extends an 8-bit value held in the low byte of `val`.
//
// If bit 7 (0x80) of `val` is set, every bit above the low byte is set to 1,
// yielding the negative two's-complement value. Otherwise `val` is returned
// unchanged; the upper bits are not cleared here, so callers are expected to
// mask with 0xFF before calling.
int sign_extend_8bit(const int val) {
37+
if ((val & 0x80) != 0) {
38+
return val | (~0xFF);
39+
}
40+
return val;
41+
}
42+
43+
// Extracts the `i`-th byte of `packed` as a sign-extended int.
//
// Byte 0 is the least significant 8 bits, byte 1 the next 8 bits, and so on.
// The value is shifted right by 8 * i bits, masked to the low byte, and then
// passed through sign_extend_8bit().
int extract_8bit_from_packed_int_le(const int packed, const int i) {
44+
// account for little endian
45+
// The shift is evaluated before the `& 0xFF` mask (shift binds tighter).
int byte = sign_extend_8bit(packed >> (8 * i) & 0xFF);
46+
return byte;
47+
}
48+
49+
// Packs the low 8 bits of four ints into a single int.
//
// val0 occupies bits 0-7, val1 bits 8-15, val2 bits 16-23 and val3 bits
// 24-31. Bits above the low byte of each input are discarded by the 0xFF
// mask. This is the same byte order that extract_8bit_from_packed_int_le()
// reads back.
int pack_4xqint_into_int32(
50+
const int val0,
51+
const int val1,
52+
const int val2,
53+
const int val3) {
54+
int packed = (val0 & 0xFF) | ((val1 & 0xFF) << 8) | ((val2 & 0xFF) << 16) |
55+
((val3 & 0xFF) << 24);
56+
57+
return packed;
58+
}
59+
3660
#ifdef DEBUG_MODE
3761

3862
#extension GL_EXT_debug_printf : require

backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,48 @@ struct Conv2DParams {
2727
int K4;
2828
};
2929

30+
// Index of a location in a conv2d tensor.
//
// data:    3-component integer index. The shaders in this change pass it
//          directly as the coordinate to texelFetch() / imageStore().
// texel_i: set to 0 by block_idx_to_tensor_idx(); presumably the component
//          offset within a texel - TODO confirm against other users.
struct Conv2dTensorIndex {
31+
ivec3 data;
32+
int texel_i;
33+
};
34+
35+
// Index of a block in a conv2d tensor. The shaders in this change initialize
// `data` from gl_GlobalInvocationID, i.e. one invocation handles one block.
// block_idx_to_tensor_idx() maps it to a tensor index by scaling only the x
// component (via mul_4).
struct Conv2dBlockIndex {
36+
ivec3 data;
37+
};
38+
39+
// Converts a block index to the tensor index at which the block starts.
//
// The x component is scaled with mul_4() (defined elsewhere; presumably
// multiplies by 4 - confirm), while y and z are copied through unchanged.
// texel_i is always initialized to 0.
Conv2dTensorIndex block_idx_to_tensor_idx(const Conv2dBlockIndex block_idx) {
40+
Conv2dTensorIndex tensor_idx;
41+
tensor_idx.data.x = mul_4(block_idx.data.x);
42+
tensor_idx.data.y = block_idx.data.y;
43+
tensor_idx.data.z = block_idx.data.z;
44+
tensor_idx.texel_i = 0;
45+
return tensor_idx;
46+
}
47+
48+
// Number of blocks along each dimension of a conv2d tensor.
//
// data:    per-dimension block counts, as computed by make_block_extents().
// data_xz: cached product data.x * data.z, used as the y stride when
//          computing a linear buffer index for a block.
struct Conv2dBlockExtents {
49+
ivec3 data;
50+
int data_xz;
51+
};
52+
53+
// Computes the block extents for a tensor with the given sizes.
//
// The x and z extents are derived with div_up_4() (defined elsewhere;
// presumably ceil(n / 4) - confirm), and the y extent equals tensor_sizes.y.
// tensor_sizes.w is not used. data_xz is precomputed as data.x * data.z.
Conv2dBlockExtents make_block_extents(const ivec4 tensor_sizes) {
54+
Conv2dBlockExtents block_sizes;
55+
block_sizes.data.x = div_up_4(tensor_sizes.x);
56+
block_sizes.data.y = tensor_sizes.y;
57+
block_sizes.data.z = div_up_4(tensor_sizes.z);
58+
59+
block_sizes.data_xz = block_sizes.data.x * block_sizes.data.z;
60+
61+
return block_sizes;
62+
}
63+
64+
// Returns true if any component of `block_idx` is greater than or equal to
// the corresponding component of `block_extents`. Only the upper bound is
// checked; negative indices are not rejected.
bool block_idx_out_of_bounds(
65+
const Conv2dBlockIndex block_idx,
66+
const Conv2dBlockExtents block_extents) {
67+
return block_idx.data.x >= block_extents.data.x ||
68+
block_idx.data.y >= block_extents.data.y ||
69+
block_idx.data.z >= block_extents.data.z;
70+
}
71+
3072
#ifdef DEBUG_MODE
3173

3274
void printConv2DParams(const Conv2DParams params) {
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#ifndef CONV2D_FP_INPUT_TILE_LOAD
10+
#define CONV2D_FP_INPUT_TILE_LOAD
11+
12+
#extension GL_EXT_control_flow_attributes : require
13+
14+
#include "linear_fp_input_tile.glslh"
15+
16+
// Fetches one texel from t_fp_input at coordinate tidx.data (LOD 0).
// tidx.texel_i is not used.
VEC4_T load_fp_input_texel(const Conv2dTensorIndex tidx) {
17+
return texelFetch(t_fp_input, tidx.data, 0);
18+
}
19+
20+
// Loads the floating-point input tile for the block at `block_idx`.
//
// Only the TILE_M == 4 && TILE_K4 == 1 configuration is implemented: TILE_M
// texels are fetched at consecutive x coordinates, starting from the tensor
// index returned by block_idx_to_tensor_idx(), and stored in tile.data[w][0].
//
// NOTE(review): no check against the tensor width is visible here, so the
// last block along x may fetch past the width when it is not a multiple of
// TILE_M - confirm that this is acceptable for the bound image.
void load_fp_input_tile(
21+
out FPInputTile tile,
22+
const Conv2dBlockIndex block_idx) {
23+
#if TILE_M == 4 && TILE_K4 == 1
24+
Conv2dTensorIndex load_tidx = block_idx_to_tensor_idx(block_idx);
25+
[[unroll]] for (int w = 0; w < TILE_M; w++) {
26+
tile.data[w][0] = load_fp_input_texel(load_tidx);
27+
load_tidx.data.x++;
28+
}
29+
#else
30+
// Undeclared identifier; presumably intended to fail compilation for
// unsupported tile configurations.
not_implemented;
31+
#endif
32+
}
33+
34+
#endif // CONV2D_FP_INPUT_TILE_LOAD

backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,6 @@
1616

1717
#include "common.glslh"
1818

19-
int sign_extend_8bit(const int val) {
20-
if ((val & 0x80) != 0) {
21-
return val | (~0xFF);
22-
}
23-
return val;
24-
}
25-
26-
int extract_8bit_from_packed_int_le(const int packed, const int i) {
27-
// account for little endian
28-
int byte = sign_extend_8bit(packed >> (8 * i) & 0xFF);
29-
return byte;
30-
}
31-
3219
// Extract a 4-bit value from a packed int (little endian)
3320
// It is assumed that the 4-bit value is in the range [0, 15]
3421
int extract_4bit_from_packed_int_le(const int packed, const int col) {
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
#define PRECISION ${PRECISION}
12+
#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)}
13+
#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)}
14+
15+
// corresponds to the input width dim
16+
#define TILE_M4 1
17+
// corresponds to the input channels dim
18+
#define TILE_K4 1
19+
20+
#define TILE_M 4
21+
22+
$if OUTPUT_STORAGE == "buffer":
23+
#define OUTPUT_BUFFER
24+
$if INPUT_STORAGE == "buffer":
25+
#define INPUT_BUFFER
26+
27+
${define_required_extensions(DTYPE)}
28+
29+
layout(std430) buffer;
30+
31+
#include "conv2d_common.glslh"
32+
33+
${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is_scalar_array=False)}
34+
${layout_declare_tensor(B, "r", "t_fp_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)}
35+
36+
${layout_declare_ubo(B, "ivec4", "input_sizes")}
37+
38+
layout(push_constant) uniform restrict Block {
39+
float inv_scale;
40+
int zp;
41+
};
42+
43+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
44+
45+
#include "conv2d_fp_input_tile_load.glslh"
46+
#include "linear_int8_input_block.glslh"
47+
48+
// Writes one packed int8 block to t_packed_int8_input.
//
// With OUTPUT_BUFFER defined, the block is written at the linear index
//   y * data_xz + x * data.z + z
// where data_xz = data.x * data.z of the block extents. Otherwise the block
// is written with imageStore() at coordinate block_idx.data (block_extents is
// unused in that path).
void store_packed_int8_block(
49+
const Conv2dBlockIndex block_idx,
50+
const Conv2dBlockExtents block_extents,
51+
const Int8InputBlock packed_int8_block) {
52+
#ifdef OUTPUT_BUFFER
53+
const int buffer_idx = block_idx.data.y * block_extents.data_xz +
54+
block_idx.data.x * block_extents.data.z + block_idx.data.z;
55+
t_packed_int8_input[buffer_idx] = packed_int8_block.data;
56+
#else
57+
imageStore(t_packed_int8_input, block_idx.data, packed_int8_block.data);
58+
#endif
59+
}
60+
61+
// Entry point of the quantize-and-pack shader. Each invocation handles the
// block identified by gl_GlobalInvocationID:
//   1. exit early if the block lies outside the extents of input_sizes,
//   2. load the floating-point input tile for the block,
//   3. quantize and pack it with inv_scale and zp (quantize_and_pack() is
//      declared outside this file),
//   4. store the packed block to t_packed_int8_input.
void main() {
62+
Conv2dBlockIndex block_idx;
63+
block_idx.data = ivec3(gl_GlobalInvocationID);
64+
65+
Conv2dBlockExtents block_extents = make_block_extents(input_sizes);
66+
if (block_idx_out_of_bounds(block_idx, block_extents)) {
67+
return;
68+
}
69+
70+
FPInputTile fp_tile;
71+
load_fp_input_tile(fp_tile, block_idx);
72+
73+
Int8InputBlock int8_block;
74+
quantize_and_pack(int8_block, fp_tile, inv_scale, zp);
75+
76+
store_packed_int8_block(block_idx, block_extents, int8_block);
77+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
quantize_and_pack_q8ta_conv2d_input:
8+
parameter_names_with_default_values:
9+
DTYPE: float
10+
OUTPUT_STORAGE: texture3d
11+
INPUT_STORAGE: texture3d
12+
generate_variant_forall:
13+
combination:
14+
parameter_names: [OUTPUT_STORAGE, INPUT_STORAGE]
15+
combos:
16+
- parameter_values: [texture3d, texture3d]
17+
- parameter_values: [buffer, texture3d]
18+
DTYPE:
19+
- VALUE: float
20+
shader_variants:
21+
- NAME: quantize_and_pack_q8ta_conv2d_input
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
#define PRECISION ${PRECISION}
12+
#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)}
13+
#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)}
14+
15+
// corresponds to the output width dim
16+
#define TILE_M4 1
17+
// corresponds to the output channels dim
18+
#define TILE_K4 1
19+
20+
#define TILE_M 4
21+
22+
$if OUTPUT_STORAGE == "buffer":
23+
#define OUTPUT_BUFFER
24+
$if INPUT_STORAGE == "buffer":
25+
#define INPUT_BUFFER
26+
27+
${define_required_extensions(DTYPE)}
28+
29+
layout(std430) buffer;
30+
31+
#define DEBUG_MODE
32+
#include "conv2d_common.glslh"
33+
34+
${layout_declare_tensor(B, "w", "t_fp_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)}
35+
${layout_declare_tensor(B, "r", "t_packed_int8_output", "int", INPUT_STORAGE, is_scalar_array=False)}
36+
37+
${layout_declare_ubo(B, "ivec4", "output_sizes")}
38+
39+
layout(push_constant) uniform restrict Block {
40+
float scale;
41+
int zp;
42+
};
43+
44+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
45+
46+
#include "linear_fp_input_tile.glslh"
47+
#include "linear_int8_input_tile.glslh"
48+
49+
// Reads one packed int8 block from t_packed_int8_output into
// int8_tile.data[0][0].
//
// With INPUT_BUFFER defined, the block is read from the linear index
//   y * data_xz + x * data.z + z
// where data_xz = data.x * data.z of the block extents. Otherwise it is read
// with texelFetch() at coordinate block_idx.data, LOD 0 (block_extents is
// unused in that path).
void load_packed_int8_tile(
50+
out Int8InputTile int8_tile,
51+
const Conv2dBlockIndex block_idx,
52+
const Conv2dBlockExtents block_extents) {
53+
#ifdef INPUT_BUFFER
54+
const int buffer_idx = block_idx.data.y * block_extents.data_xz +
55+
block_idx.data.x * block_extents.data.z + block_idx.data.z;
56+
int8_tile.data[0][0] = t_packed_int8_output[buffer_idx];
57+
#else
58+
int8_tile.data[0][0] = texelFetch(t_packed_int8_output, block_idx.data, 0);
59+
#endif
60+
}
61+
62+
// Dequantizes four integer values at once:
//   result = VEC4_T(val - q_zero_point) * q_scale
// The zero point is subtracted in integer arithmetic before the conversion
// to VEC4_T.
VEC4_T
63+
dequantize_8bit(const ivec4 val, const float q_scale, const int q_zero_point) {
64+
return VEC4_T(val - q_zero_point) * q_scale;
65+
}
66+
67+
// Unpacks int8_tile.data[0][0] and dequantizes it into fp_tile.
//
// For each component w in [0, 4) of the packed element, the four bytes of
// that int are extracted (byte 0 first) with sign extension, dequantized with
// q_scale and q_zero_point, and written to fp_tile.data[w][0].
void unpack_and_dequantize(
68+
out FPInputTile fp_tile,
69+
const Int8InputTile int8_tile,
70+
const float q_scale,
71+
const int q_zero_point) {
72+
[[unroll]] for (int w = 0; w < 4; ++w) {
73+
int packed = int8_tile.data[0][0][w];
74+
fp_tile.data[w][0] = dequantize_8bit(
75+
ivec4(
76+
extract_8bit_from_packed_int_le(packed, 0),
77+
extract_8bit_from_packed_int_le(packed, 1),
78+
extract_8bit_from_packed_int_le(packed, 2),
79+
extract_8bit_from_packed_int_le(packed, 3)),
80+
q_scale,
81+
q_zero_point);
82+
}
83+
}
84+
85+
// Writes `out_texel` to t_fp_output at coordinate tidx.data.
// tidx.texel_i is not used.
void store_fp_output_texel(
86+
const Conv2dTensorIndex tidx,
87+
const VEC4_T out_texel) {
88+
imageStore(t_fp_output, tidx.data, out_texel);
89+
}
90+
91+
// Stores the four texels block.data[0..3][0] to t_fp_output at consecutive
// x coordinates, starting from the tensor index returned by
// block_idx_to_tensor_idx().
//
// NOTE(review): no check against the output width is visible here, so the
// last block along x may store past the width when it is not a multiple of
// 4 - confirm that this is acceptable for the bound image.
void store_fp_tile(
92+
const FPInputTile block,
93+
const Conv2dBlockIndex block_idx) {
94+
Conv2dTensorIndex store_tidx = block_idx_to_tensor_idx(block_idx);
95+
[[unroll]] for (int w = 0; w < 4; w++) {
96+
store_fp_output_texel(store_tidx, block.data[w][0]);
97+
store_tidx.data.x++;
98+
}
99+
}
100+
101+
// Entry point of the unpack-and-dequantize shader. Each invocation handles
// the block identified by gl_GlobalInvocationID:
//   1. exit early if the block lies outside the extents of output_sizes,
//   2. load the packed int8 block from t_packed_int8_output,
//   3. unpack and dequantize it with scale and zp,
//   4. store the resulting floating-point tile to t_fp_output.
void main() {
102+
Conv2dBlockIndex block_idx;
103+
block_idx.data = ivec3(gl_GlobalInvocationID);
104+
105+
Conv2dBlockExtents block_extents = make_block_extents(output_sizes);
106+
if (block_idx_out_of_bounds(block_idx, block_extents)) {
107+
return;
108+
}
109+
110+
Int8InputTile int8_tile;
111+
load_packed_int8_tile(int8_tile, block_idx, block_extents);
112+
113+
FPInputTile fp_tile;
114+
unpack_and_dequantize(
115+
fp_tile, int8_tile, scale, zp);
116+
117+
store_fp_tile(fp_tile, block_idx);
118+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
unpack_and_dequantize_q8ta_conv2d_output:
8+
parameter_names_with_default_values:
9+
DTYPE: float
10+
OUTPUT_STORAGE: texture3d
11+
INPUT_STORAGE: texture3d
12+
generate_variant_forall:
13+
combination:
14+
parameter_names: [OUTPUT_STORAGE, INPUT_STORAGE]
15+
combos:
16+
- parameter_values: [texture3d, texture3d]
17+
- parameter_values: [texture3d, buffer]
18+
DTYPE:
19+
- VALUE: float
20+
shader_variants:
21+
- NAME: unpack_and_dequantize_q8ta_conv2d_output

0 commit comments

Comments
 (0)