Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backends/vulkan/_passes/fuse_quantized_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
continue

# Check for linear_qta8a_qga4w pattern (dynamic activation + grouped weight quantization)
qta8a_qga4w_details = matches_linear_qta8a_qga4w_pattern(self.program, node)
qta8a_qga4w_details = None
if qta8a_qga4w_details is not None:
group_size, weight_bits = qta8a_qga4w_details
fuse_into_linear_qta8a_qga4w_node(
Expand Down
26 changes: 15 additions & 11 deletions backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,22 @@
#define PRECISION ${PRECISION}

#define IN_T ${buffer_scalar_type(IN_DTYPE)}
#define SCALE_OUT_T ${buffer_scalar_type(SCALE_OUT_DTYPE)}
#define ZP_OUT_T ${buffer_scalar_type(ZP_OUT_DTYPE)}

#define ${MODE}

${define_active_storage_type("buffer")}
${define_required_extensions(IN_DTYPE)}
${define_required_extensions(SCALE_OUT_DTYPE)}
${define_required_extensions(ZP_OUT_DTYPE)}

#extension GL_EXT_control_flow_attributes : require

layout(std430) buffer;

${layout_declare_tensor(B, "w", "t_scale", "float", "buffer")}
${layout_declare_tensor(B, "w", "t_zero_point", "int", "buffer")}
${layout_declare_tensor(B, "w", "t_scale", SCALE_OUT_DTYPE, "buffer")}
${layout_declare_tensor(B, "w", "t_zero_point", ZP_OUT_DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")}

$if MODE == "per_tensor":
Expand Down Expand Up @@ -254,8 +258,8 @@ void choose_qparams_per_tensor() {
// Use default values: mapping_type=0 (ASYMMETRIC), eps from push constant
calc_scale_zp(global_min, global_max, quant_min, quant_max, 0, eps, scale_val, zero_point_val);

t_scale[0] = scale_val;
t_zero_point[0] = zero_point_val;
t_scale[0] = SCALE_OUT_T(scale_val);
t_zero_point[0] = ZP_OUT_T(zero_point_val);
}
}

Expand Down Expand Up @@ -306,8 +310,8 @@ void choose_qparams_per_token() {
calc_scale_zp(lo, hi, quant_min, quant_max, 0, 1e-5, scale_val, zero_point_val);

// Write results
t_scale[token_id] = scale_val;
t_zero_point[token_id] = zero_point_val;
t_scale[token_id] = SCALE_OUT_T(scale_val);
t_zero_point[token_id] = ZP_OUT_T(zero_point_val);
}
}

Expand Down Expand Up @@ -380,12 +384,12 @@ void choose_qparams_block_wise() {
hi = 0.0;
}

float scale;
int zp;
calc_scale_zp(lo, hi, quant_min, quant_max, mapping_type, eps, scale, zp);
float scale_val;
int zero_point_val;
calc_scale_zp(lo, hi, quant_min, quant_max, mapping_type, eps, scale_val, zero_point_val);

t_zero_point[block_id] = zp;
t_scale[block_id] = scale;
t_scale[block_id] = SCALE_OUT_T(scale_val);
t_zero_point[block_id] = ZP_OUT_T(zero_point_val);
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
choose_qparams_buffer:
parameter_names_with_default_values:
IN_DTYPE: float
SCALE_OUT_DTYPE: float
ZP_OUT_DTYPE: int32
MODE: per_tensor
generate_variant_forall:
IN_DTYPE:
- VALUE: float
SCALE_OUT_DTYPE:
- VALUE: float
ZP_OUT_DTYPE:
- VALUE: int32
- VALUE: int8
- VALUE: float
shader_variants:
- NAME: choose_qparams_tensor_buffer
MODE: per_tensor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,26 @@

#define IN_T ${buffer_scalar_type(IN_DTYPE)}
#define FVEC4_T ${texel_load_type(IN_DTYPE, "texture3d")}
#define SCALE_OUT_T ${buffer_scalar_type(SCALE_OUT_DTYPE)}
#define ZP_OUT_T ${buffer_scalar_type(ZP_OUT_DTYPE)}

#define ${MODE}

${define_active_storage_type("texture3d")}
${define_required_extensions(IN_DTYPE)}
${define_required_extensions(SCALE_OUT_DTYPE)}
${define_required_extensions(ZP_OUT_DTYPE)}

#extension GL_EXT_control_flow_attributes : require

layout(std430) buffer;

$if MODE != "block_wise":
${layout_declare_tensor(B, "w", "t_scale", "float", "texture3d")}
${layout_declare_tensor(B, "w", "t_zero_point", "int", "texture3d")}
${layout_declare_tensor(B, "w", "t_scale", SCALE_OUT_DTYPE, "texture3d")}
${layout_declare_tensor(B, "w", "t_zero_point", ZP_OUT_DTYPE, "texture3d")}
$else:
${layout_declare_tensor(B, "w", "t_scale", "float", "buffer")}
${layout_declare_tensor(B, "w", "t_zero_point", "int", "buffer")}
${layout_declare_tensor(B, "w", "t_scale", SCALE_OUT_DTYPE, "buffer")}
${layout_declare_tensor(B, "w", "t_zero_point", ZP_OUT_DTYPE, "buffer")}

${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")}

Expand Down Expand Up @@ -273,8 +277,8 @@ void choose_qparams_per_tensor() {
int zero_point_val;
calc_scale_zp(global_min, global_max, quant_min, quant_max, 0, eps, scale_val, zero_point_val);

write_texel(t_scale, ivec3(0, 0, 0), vec4(scale_val, 0.0, 0.0, 0.0));
write_texel(t_zero_point, ivec3(0, 0, 0), ivec4(zero_point_val, 0, 0, 0));
write_texel(t_scale, ivec3(0, 0, 0), vec4(SCALE_OUT_T(scale_val), 0.0, 0.0, 0.0));
write_texel(t_zero_point, ivec3(0, 0, 0), ivec4(ZP_OUT_T(zero_point_val), 0, 0, 0));
}
}

Expand Down Expand Up @@ -419,8 +423,8 @@ void choose_qparams_per_token() {
uint out_x = out_remainder % uint(t_scale_limits.x);
ivec3 out_pos = ivec3(int(out_x), int(out_y), int(out_z));

write_texel(t_scale, out_pos, vec4(scale_val, 0.0, 0.0, 0.0));
write_texel(t_zero_point, out_pos, ivec4(zero_point_val, 0, 0, 0));
write_texel(t_scale, out_pos, vec4(SCALE_OUT_T(scale_val), 0.0, 0.0, 0.0));
write_texel(t_zero_point, out_pos, ivec4(ZP_OUT_T(zero_point_val), 0, 0, 0));
}

// Synchronize before processing next token
Expand Down Expand Up @@ -517,8 +521,8 @@ void choose_qparams_block_wise() {
calc_scale_zp(vmin, vmax, quant_min, quant_max, mapping_type, eps, scale, zp);

// Write the scalar values directly to buffer using linear index
t_scale[blkIdx] = scale;
t_zero_point[blkIdx] = zp;
t_scale[blkIdx] = SCALE_OUT_T(scale);
t_zero_point[blkIdx] = ZP_OUT_T(zp);
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
choose_qparams_texture:
parameter_names_with_default_values:
IN_DTYPE: float
SCALE_OUT_DTYPE: float
ZP_OUT_DTYPE: int32
MODE: per_tensor
generate_variant_forall:
IN_DTYPE:
- VALUE: float
SCALE_OUT_DTYPE:
- VALUE: float
ZP_OUT_DTYPE:
- VALUE: int32
- VALUE: int8
- VALUE: float
shader_variants:
- NAME: choose_qparams_tensor_texture3d
MODE: per_tensor
Expand Down
28 changes: 16 additions & 12 deletions backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@

#define IN_T ${buffer_scalar_type(IN_DTYPE)}
#define OUT_T ${buffer_scalar_type(OUT_DTYPE)}
#define SCALE_T ${buffer_scalar_type(SCALE_DTYPE)}
#define ZP_T ${buffer_scalar_type(ZP_DTYPE)}

#define ${MODE}

${define_active_storage_type("buffer")}
${define_required_extensions(IN_DTYPE)}
${define_required_extensions(OUT_DTYPE)}
${define_required_extensions(SCALE_DTYPE)}
${define_required_extensions(ZP_DTYPE)}

layout(std430) buffer;

Expand All @@ -27,25 +31,25 @@ ${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")}

$if MODE == "per_tensor":
${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}
${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")}

layout(push_constant) uniform restrict Block {
int quant_min;
int quant_max;
};
$if MODE == "per_token":
${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}
${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")}

layout(push_constant) uniform restrict Block {
int num_tokens;
int quant_min;
int quant_max;
};
$if MODE == "per_channel":
${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}
${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")}

layout(push_constant) uniform restrict Block {
int axis;
Expand All @@ -54,8 +58,8 @@ $if MODE == "per_channel":
int quant_max;
};
$if MODE == "block_wise":
${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}
${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")}

layout(push_constant) uniform restrict Block {
ivec4 blockSize; // bW, bH, bC, bN
Expand Down Expand Up @@ -150,7 +154,7 @@ void dequantize_per_tensor() {
const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides);

IN_T qvalue = t_in[in_bufi];
OUT_T value = dequantize_val(qvalue, t_scale[0], t_zero_point[0]);
OUT_T value = dequantize_val(qvalue, float(t_scale[0]), int(t_zero_point[0]));

t_out[out_bufi] = value;
}
Expand Down Expand Up @@ -185,7 +189,7 @@ void dequantize_per_token() {

token_idx = min(token_idx, num_tokens - 1);

OUT_T value = dequantize_val(qvalue, t_scale[token_idx], t_zero_point[token_idx]);
OUT_T value = dequantize_val(qvalue, float(t_scale[token_idx]), int(t_zero_point[token_idx]));

t_out[out_bufi] = value;
}
Expand Down Expand Up @@ -224,7 +228,7 @@ void dequantize_per_channel() {

channel_idx = min(channel_idx, num_channels - 1);

OUT_T value = dequantize_val(qvalue, t_scale[channel_idx], t_zero_point[channel_idx]);
OUT_T value = dequantize_val(qvalue, float(t_scale[channel_idx]), int(t_zero_point[channel_idx]));

t_out[out_bufi] = value;
}
Expand All @@ -247,7 +251,7 @@ void dequantize_block_wise() {

const int block_id = bcoord.x * blockStride.x + bcoord.y * blockStride.y + bcoord.z * blockStride.z + bcoord.w * blockStride.w;

const OUT_T value = dequantize_val(qvalue, t_scale[block_id], t_zero_point[block_id]);
const OUT_T value = dequantize_val(qvalue, float(t_scale[block_id]), int(t_zero_point[block_id]));

t_out[out_bufi] = value;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ dequantize_buffer:
parameter_names_with_default_values:
IN_DTYPE: int32
OUT_DTYPE: float
SCALE_DTYPE: float
ZP_DTYPE: int32
MODE: per_tensor
generate_variant_forall:
IN_DTYPE:
Expand All @@ -12,6 +14,12 @@ dequantize_buffer:
- VALUE: half
- VALUE: float
- VALUE: double
SCALE_DTYPE:
- VALUE: float
ZP_DTYPE:
- VALUE: int8
- VALUE: int32
- VALUE: float
shader_variants:
- NAME: dequantize_per_tensor_buffer
MODE: per_tensor
Expand Down
Loading
Loading