Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/quantize.glslh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#ifndef QUANTIZE_GLSLH
#define QUANTIZE_GLSLH

/*
 * Quantizes a single value.
 *
 * The value is multiplied by the reciprocal of scale_val, rounded to the
 * nearest integer, offset by zero_point_val, and finally limited to the range
 * [quant_min, quant_max] before being converted to OUT_T.
 *
 * quant_min and quant_max are not parameters: they are read from the scope of
 * the shader that includes this header, so they must be declared before the
 * include.
 *
 * Ties are rounded with roundEven() rather than round(). GLSL leaves the
 * direction in which round() resolves a fractional part of exactly 0.5 up to
 * the implementation, so the quantized result could differ between drivers;
 * roundEven() always picks the nearest even integer.
 * NOTE(review): half-to-even is presumably also what the CPU reference uses
 * (nearbyint-style rounding) - confirm against the reference implementation.
 *
 * NOTE(review): assumes scale_val is non-zero; a zero scale makes inv_scale
 * infinite and the float-to-int conversion below is then unspecified.
 */
OUT_T quantize_val(IN_T value, float scale_val, int zero_point_val) {
  const float inv_scale = 1.0 / scale_val;

  const float rounded_float = roundEven(inv_scale * float(value));

  int qvalue = zero_point_val + int(rounded_float);

  // Apply the lower bound first and the upper bound second, exactly as the
  // two separate max/min statements did, so quant_max still wins if the range
  // is ever inverted.
  qvalue = min(max(qvalue, quant_min), quant_max);

  return OUT_T(qvalue);
}

#endif // QUANTIZE_GLSLH
119 changes: 119 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

// This file is a shader template: the placeholders below are filled in at
// shader generation time from parameters such as IN_DTYPE, OUT_DTYPE and MODE
// (see quantize_buffer.yaml).
#define PRECISION ${PRECISION}

// IN_T / OUT_T are the types used for elements read from t_in and written to
// t_out in the functions below.
#define IN_T ${buffer_scalar_type(IN_DTYPE)}
#define OUT_T ${buffer_scalar_type(OUT_DTYPE)}

// Defines the selected mode name (per_tensor or per_token) as a preprocessor
// symbol; the "#ifdef per_tensor" further down keys off it.
#define ${MODE}

${define_active_storage_type("buffer")}
${define_required_extensions(IN_DTYPE)}
${define_required_extensions(OUT_DTYPE)}

layout(std430) buffer;

#include "indexing_utils.h"

// Output tensor (declared "w") and input tensor (declared "r"), both using
// buffer storage.
${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")}

// per_tensor: one scale / zero_point pair and the clamp range arrive as push
// constants.
$if MODE == "per_tensor":
layout(push_constant) uniform restrict Block {
float scale;
int zero_point;
int quant_min;
int quant_max;
};
// per_token: scale and zero_point are read per token from the t_scale and
// t_zero_point buffers; the push constants carry the token count and the
// clamp range.
$if MODE == "per_token":
${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}

layout(push_constant) uniform restrict Block {
int num_tokens;
int quant_min;
int quant_max;
};

// Number of output elements, followed by the sizes and strides of the input
// and output tensors.
${layout_declare_ubo(B, "int", "out_numel")}
${layout_declare_ubo(B, "ivec4", "t_in_sizes")}
${layout_declare_ubo(B, "ivec4", "t_in_strides")}
${layout_declare_ubo(B, "ivec4", "t_out_sizes")}
${layout_declare_ubo(B, "ivec4", "t_out_strides")}

${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}

// Included after the push-constant block because quantize_val() reads
// quant_min and quant_max from the enclosing shader scope.
#include "quantize.glslh"

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Dimension orders derived from the layout specialization constants via
// unhash_dim_order(). Only out_dim_order is referenced by the functions
// visible below.
const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
const lowp ivec4 in_dim_order = unhash_dim_order(in_layout);

#ifdef per_tensor

// Per-tensor quantization of one output element per invocation, using the
// single scale / zero_point pair supplied through push constants.
void quantize_per_tensor() {
  const int dst_idx = int(gl_GlobalInvocationID.x);

  // Only invocations that map onto a real output element do any work.
  if (dst_idx < out_numel) {
    // Turn the output buffer index into a tensor coordinate, then map that
    // coordinate through the input strides to find the source element.
    const ivec4 coord = bufi_to_tidx(dst_idx, t_out_strides, out_dim_order);
    const int src_idx = tidx_to_bufi(coord, t_in_strides);

    t_out[dst_idx] = quantize_val(t_in[src_idx], scale, zero_point);
  }
}

#else

// Per-token quantization of one output element per invocation. The token
// index is built from every tensor coordinate except x and selects the
// scale / zero point entry in t_scale and t_zero_point.
void quantize_per_token() {
  const int dst_idx = int(gl_GlobalInvocationID.x);

  // Only invocations that map onto a real output element do any work.
  if (dst_idx < out_numel) {
    const ivec4 coord = bufi_to_tidx(dst_idx, t_out_strides, out_dim_order);
    const int src_idx = tidx_to_bufi(coord, t_in_strides);

    // The highest dimension whose size exceeds 1 decides how many coordinates
    // are flattened into the token index; a tensor with only an x extent
    // always uses token 0.
    const int rows = t_out_sizes.y;
    const int flat_row =
        (t_out_sizes.w > 1)
            ? (coord.w * (t_out_sizes.z * rows) + coord.z * rows + coord.y)
            : (t_out_sizes.z > 1)
                  ? (coord.z * rows + coord.y)
                  : (rows > 1) ? coord.y : 0;

    // Never index past the last scale / zero point entry.
    const int token = min(flat_row, num_tokens - 1);

    t_out[dst_idx] =
        quantize_val(t_in[src_idx], t_scale[token], t_zero_point[token]);
  }
}

#endif

// Shader entry point. The placeholder in the call below is filled in with the
// MODE value at shader generation time, so this calls quantize_per_tensor()
// or quantize_per_token(), whichever the #ifdef above compiled in.
void main() {
quantize_${MODE}();
}
18 changes: 18 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Template parameters for the quantize_buffer shader.
quantize_buffer:
# Default value for each template parameter.
parameter_names_with_default_values:
IN_DTYPE: float
OUT_DTYPE: int32
MODE: per_tensor
# NOTE(review): presumably one shader is generated for every combination of
# the IN_DTYPE and OUT_DTYPE values listed here - confirm against the codegen
# script.
generate_variant_forall:
IN_DTYPE:
- VALUE: half
- VALUE: float
OUT_DTYPE:
- VALUE: uint8
- VALUE: int8
- VALUE: int32
# Named variants. Each sets MODE, which the template uses to pick between the
# per_tensor and per_token code paths.
shader_variants:
- NAME: quantize_per_tensor_buffer
MODE: per_tensor
- NAME: quantize_per_token_buffer
MODE: per_token
120 changes: 120 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

// This file is a shader template: the placeholders below are filled in at
// shader generation time from parameters such as IN_DTYPE, OUT_DTYPE and MODE
// (see quantize_texture.yaml).
#define PRECISION ${PRECISION}

// IN_T is the per-component type handed to quantize_val(); FVEC4_T is the
// type of a texel loaded from t_in.
#define IN_T ${buffer_scalar_type(IN_DTYPE)}
#define FVEC4_T ${texel_load_type(IN_DTYPE, "texture3d")}

// OUT_T is the per-component type returned by quantize_val(); IVEC4_T is the
// type of a texel written to t_out.
#define OUT_T ${buffer_scalar_type(OUT_DTYPE)}
#define IVEC4_T ${texel_load_type(OUT_DTYPE, "texture3d")}

// Defines the selected mode name (per_tensor or per_token) as a preprocessor
// symbol; the "#ifdef per_tensor" further down keys off it.
#define ${MODE}

${define_active_storage_type("texture3d")}
${define_required_extensions(IN_DTYPE)}
${define_required_extensions(OUT_DTYPE)}

// Needed for the [[unroll]] attribute on the per-component loops below.
#extension GL_EXT_control_flow_attributes : require

layout(std430) buffer;

// Output tensor (declared "w") and input tensor (declared "r"), both using
// texture3d storage.
${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "texture3d")}
${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")}

// per_tensor: one scale / zero_point pair and the clamp range arrive as push
// constants.
$if MODE == "per_tensor":
layout(push_constant) uniform restrict Block {
float scale;
int zero_point;
int quant_min;
int quant_max;
};
// per_token: scale and zero_point are read per token from the t_scale and
// t_zero_point buffers; the push constants carry the token count and the
// clamp range.
$if MODE == "per_token":
${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}

layout(push_constant) uniform restrict Block {
int num_tokens;
int quant_min;
int quant_max;
};

// Texture extents. The functions visible below bounds-check against
// t_in_limits only; t_out_limits is declared but not referenced by them.
${layout_declare_ubo(B, "ivec3", "t_in_limits")}
${layout_declare_ubo(B, "ivec3", "t_out_limits")}

// quantize.glslh is included after the push-constant block because
// quantize_val() reads quant_min and quant_max from the enclosing shader
// scope.
#include "indexing_utils.h"
#include "quantize.glslh"

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

#ifdef per_tensor

// Per-tensor quantization of one texel per invocation: each of the four
// components is quantized with the single scale / zero_point pair supplied
// through push constants.
void quantize_per_tensor() {
  const ivec3 texel_pos = ivec3(gl_GlobalInvocationID);

  // Proceed only when every coordinate lies inside the input extents.
  if (all(lessThan(texel_pos, t_in_limits))) {
    const FVEC4_T src = load_texel(t_in, texel_pos);

    IVEC4_T dst;
    [[unroll]] for (int lane = 0; lane < 4; ++lane) {
      dst[lane] = quantize_val(IN_T(src[lane]), scale, zero_point);
    }

    write_texel(t_out, texel_pos, dst);
  }
}

#else

// Per-token quantization of one texel per invocation. The token index is
// built from the y and z texel coordinates and selects the scale / zero point
// entry shared by all four components of the texel.
void quantize_per_token() {
  const ivec3 texel_pos = ivec3(gl_GlobalInvocationID);

  // Proceed only when every coordinate lies inside the input extents.
  if (all(lessThan(texel_pos, t_in_limits))) {
    const FVEC4_T src = load_texel(t_in, texel_pos);

    // With a z extent above 1 both y and z contribute; with only a y extent
    // above 1 the token is the y coordinate; otherwise token 0 is used.
    const int flat_row =
        (t_in_limits.z > 1)
            ? (texel_pos.z * t_in_limits.y + texel_pos.y)
            : (t_in_limits.y > 1) ? texel_pos.y : 0;

    // Never index past the last scale / zero point entry.
    const int token = min(flat_row, num_tokens - 1);

    // t_scale and t_zero_point are buffers, so they are indexed directly.
    const float token_scale = t_scale[token];
    const int token_zero_point = t_zero_point[token];

    IVEC4_T dst;
    [[unroll]] for (int lane = 0; lane < 4; ++lane) {
      dst[lane] = quantize_val(IN_T(src[lane]), token_scale, token_zero_point);
    }

    write_texel(t_out, texel_pos, dst);
  }
}

#endif

// Shader entry point. The placeholder in the call below is filled in with the
// MODE value at shader generation time, so this calls quantize_per_tensor()
// or quantize_per_token(), whichever the #ifdef above compiled in.
void main() {
quantize_${MODE}();
}
18 changes: 18 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Template parameters for the quantize_texture shader.
quantize_texture:
# Default value for each template parameter.
parameter_names_with_default_values:
IN_DTYPE: float
OUT_DTYPE: int32
MODE: per_tensor
# NOTE(review): presumably one shader is generated for every combination of
# the IN_DTYPE and OUT_DTYPE values listed here - confirm against the codegen
# script.
generate_variant_forall:
IN_DTYPE:
- VALUE: half
- VALUE: float
OUT_DTYPE:
- VALUE: uint8
- VALUE: int8
- VALUE: int32
# Named variants. Each sets MODE, which the template uses to pick between the
# per_tensor and per_token code paths.
shader_variants:
- NAME: quantize_per_tensor_texture3d
MODE: per_tensor
- NAME: quantize_per_token_texture3d
MODE: per_token
Loading
Loading