diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl
index ea0c2f7dce7..c3e58286efe 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl
@@ -42,6 +42,16 @@ $if MODE == "per_token":
     int quant_min;
     int quant_max;
   };
+$if MODE == "per_channel":
+  ${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
+  ${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}
+
+  layout(push_constant) uniform restrict Block {
+    int axis;
+    int num_channels;
+    int quant_min;
+    int quant_max;
+  };

 ${layout_declare_ubo(B, "int", "out_numel")}
 ${layout_declare_ubo(B, "ivec4", "t_in_sizes")}
@@ -137,7 +147,7 @@ void quantize_per_tensor() {
   t_out[out_bufi] = qvalue;
 }

-#else
+#elif defined(per_token)

 void quantize_per_token() {
   const int out_bufi = int(gl_GlobalInvocationID.x);
@@ -172,6 +182,45 @@ void quantize_per_token() {
   t_out[out_bufi] = qvalue;
 }

+#else // per_channel
+
+void quantize_per_channel() {
+  const int out_bufi = int(gl_GlobalInvocationID.x);
+
+  if (out_bufi >= out_numel) {
+    return;
+  }
+
+  const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order);
+  const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides);
+
+  IN_T value = t_in[in_bufi];
+
+  // Calculate channel index based on the quantization axis (already converted to WHCN)
+  // The axis parameter is now in WHCN coordinate system:
+  //   axis 0 -> W dimension (tidx.x)
+  //   axis 1 -> H dimension (tidx.y)
+  //   axis 2 -> C dimension (tidx.z)
+  //   axis 3 -> N dimension (tidx.w)
+  int channel_idx = 0;
+
+  if (axis == 0) {
+    channel_idx = out_tidx.x;
+  } else if (axis == 1) {
+    channel_idx = out_tidx.y;
+  } else if (axis == 2) {
+    channel_idx = out_tidx.z;
+  } else if (axis == 3) {
+    channel_idx = out_tidx.w;
+  }
+
+  channel_idx = min(channel_idx, num_channels - 1);
+
+  OUT_T qvalue = quantize_val(value, t_scale[channel_idx], t_zero_point[channel_idx]);
+
+  t_out[out_bufi] = qvalue;
+}
+
 #endif

 void main() {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml
index 4d95d610314..1dd8e6e2ffe 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml
@@ -17,3 +17,5 @@ quantize_buffer:
       MODE: per_tensor
     - NAME: quantize_per_token_buffer
       MODE: per_token
+    - NAME: quantize_per_channel_buffer
+      MODE: per_channel
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl
index 9ba7074f75b..bdaba3ffaf9 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl
@@ -26,6 +26,8 @@ ${define_required_extensions(OUT_DTYPE)}

 layout(std430) buffer;

+#include "indexing_utils.h"
+
 ${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "texture3d")}
 ${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")}

@@ -45,11 +47,23 @@ $if MODE == "per_token":
     int quant_min;
     int quant_max;
   };
+$if MODE == "per_channel":
+  ${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
+  ${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}
+
+  layout(push_constant) uniform restrict Block {
+    int axis;
+    int num_channels;
+    int quant_min;
+    int quant_max;
+  };

 ${layout_declare_ubo(B, "ivec3", "t_in_limits")}
 ${layout_declare_ubo(B, "ivec3", "t_out_limits")}

-#include "indexing_utils.h"
+${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
+${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
+
 #include "quantize.glslh"

 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
@@ -138,7 +152,7 @@ void quantize_per_tensor() {
   write_texel(t_out, pos, outtex);
 }

-#else
+#elif defined(per_token)

 void quantize_per_token() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
@@ -177,6 +191,84 @@ void quantize_per_token() {
   write_texel(t_out, pos, outtex);
 }

+#else // per_channel
+
+void quantize_per_channel() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, t_in_limits))) {
+    return;
+  }
+
+  FVEC4_T intex = load_texel(t_in, pos);
+  IVEC4_T outtex;
+
+  // Calculate channel index based on the quantization axis (already converted to WHCN)
+  // The axis parameter is now in WHCN coordinate system:
+  //   axis 0 -> W dimension (width-packed, so the channel is pos.x * 4 + texel component)
+  //   axis 1 -> H dimension (pos.y)
+  //   axis 2 -> C dimension (pos.z; for 4D tensors the Z coordinate folds batch and channel)
+  //   axis 3 -> N dimension (pos.z; for 4D tensors the Z coordinate folds batch and channel)
+
+  if (axis == 0) {
+    // Width dimension - each texel component has a different channel index
+    [[unroll]] for (int i = 0; i < 4; ++i) {
+      IN_T value = IN_T(intex[i]);
+      int channel_idx = pos.x * 4 + i;
+      channel_idx = min(channel_idx, num_channels - 1);
+
+      float scale_val = t_scale[channel_idx];
+      int zero_point_val = t_zero_point[channel_idx];
+      OUT_T qvalue = quantize_val(value, scale_val, zero_point_val);
+      outtex[i] = qvalue;
+    }
+  } else if (axis == 1) {
+    // Height dimension - all texel components use the same channel index
+    int channel_idx = pos.y;
+    channel_idx = min(channel_idx, num_channels - 1);
+    float scale_val = t_scale[channel_idx];
+    int zero_point_val = t_zero_point[channel_idx];
+
+    [[unroll]] for (int i = 0; i < 4; ++i) {
+      IN_T value = IN_T(intex[i]);
+      OUT_T qvalue = quantize_val(value, scale_val, zero_point_val);
+      outtex[i] = qvalue;
+    }
+  } else if (axis == 2) {
+    // Channel dimension - for 4D tensors, account for batch-channel folding:
+    // the Z coordinate contains folded batch*channel information, so extract
+    // the actual channel index from the folded dimension
+    int folded_idx = pos.z;
+    int channel_idx = folded_idx % num_channels;
+
+    float scale_val = t_scale[channel_idx];
+    int zero_point_val = t_zero_point[channel_idx];
+
+    [[unroll]] for (int i = 0; i < 4; ++i) {
+      IN_T value = IN_T(intex[i]);
+      OUT_T qvalue = quantize_val(value, scale_val, zero_point_val);
+      outtex[i] = qvalue;
+    }
+  } else if (axis == 3) {
+    // Batch dimension - for 4D tensors, account for batch-channel folding:
+    // the Z coordinate contains folded batch*channel information, so extract
+    // the actual batch index from the folded dimension
+    int folded_idx = pos.z;
+    int batch_idx = folded_idx / num_channels;
+
+    float scale_val = t_scale[batch_idx];
+    int zero_point_val = t_zero_point[batch_idx];
+
+    [[unroll]] for (int i = 0; i < 4; ++i) {
+      IN_T value = IN_T(intex[i]);
+      OUT_T qvalue = quantize_val(value, scale_val, zero_point_val);
+      outtex[i] = qvalue;
+    }
+  }
+
+  write_texel(t_out, pos, outtex);
+}
+
 #endif

 void main() {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml
index 65002ce26b6..47e532be8b9 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml
@@ -17,3 +17,5 @@ quantize_texture:
       MODE: per_tensor
     - NAME: quantize_per_token_texture3d
       MODE: per_token
+    - NAME: quantize_per_channel_texture3d
+      MODE: per_channel
diff --git a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp
index f8f930bf0fb..74dee249b0a 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp
@@ -12,11 +12,10 @@
 #include
 #include
-#include

-namespace vkcompute {
+#include

-namespace {
+namespace vkcompute {

 void resize_quantize_output(
     ComputeGraph* graph,
@@ -28,7 +27,52 @@ void resize_quantize_output(
   graph->virtual_resize(out, graph->sizes_of(in));
 }

-} // namespace
+utils::uvec3 quantize_per_channel_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+
+  utils::uvec3 global_wg_size = graph->create_global_wg_size(out);
+
+  return global_wg_size;
+}
+
+utils::uvec3 quantize_per_channel_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+
+  const ValueRef input = args.at(1).refs.at(0);
+
+  utils::uvec3 local_wg_size =
+      graph->create_local_wg_size(global_workgroup_size);
+
+  // WORKAROUND: The CommandBuffer::dispatch function divides
+  // global_workgroup_size by local_workgroup_size to get the number of
+  // workgroups to dispatch. For per-channel quantization along the batch axis,
+  // we need to ensure that we dispatch the correct number of workgroups in the
+  // Z dimension to cover all batch-channel combinations.
+  //
+  // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2],
+  // local_wg_size[2]) might reduce the number of workgroups dispatched. To
+  // ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension,
+  // we set local_wg_size[2] = 1.
+  const auto input_sizes = graph->sizes_of(input);
+  if (global_workgroup_size[2] > 1 && input_sizes[3] > 0) {
+    local_wg_size[2] = 1;
+  }
+
+  return local_wg_size;
+}
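A small numeric illustration of the workgroup arithmetic the workaround above guards against, assuming (as the comment states) that CommandBuffer::dispatch computes div_up(global, local) per dimension; the helpers are illustrative only:

#include <cstdint>

inline uint32_t div_up(uint32_t a, uint32_t b) {
  return (a + b - 1) / b;
}

// Number of workgroups dispatched along Z for a given local size.
inline uint32_t workgroups_z(uint32_t global_z, uint32_t local_z) {
  return div_up(global_z, local_z);
}

// Example: for a folded batch*channel extent of global_z = 6,
// workgroups_z(6, 4) == 2 while workgroups_z(6, 1) == 6, so forcing
// local_wg_size[2] = 1 yields one workgroup per Z slice.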
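Looking back at the texture shader's quantize_per_channel() above: it has to undo two packings before it can index t_scale, namely width-packing (four W elements per texel) and, for 4D tensors, batch-channel folding along the texture Z axis. A standalone sketch of that mapping, under the assumption of the standard axis map and a folded Z extent of N * C (names are illustrative):

#include <algorithm>

struct TexelPos {
  int x;
  int y;
  int z;
};

// Maps a texel position (plus component, for the width axis) to the index used
// to read t_scale / t_zero_point, per WHCN axis.
inline int per_channel_param_index(TexelPos pos, int comp, int axis, int num_channels) {
  switch (axis) {
    case 0: // W is packed four-wide: each texel component is its own channel
      return std::min(pos.x * 4 + comp, num_channels - 1);
    case 1: // H: one channel per texel row
      return std::min(pos.y, num_channels - 1);
    case 2: // C: Z folds batch and channel, so unfold with a modulo
      return pos.z % num_channels;
    default: // N: the batch index is the quotient of the folded Z coordinate
      return pos.z / num_channels;
  }
}

For axis 3 the division relies on num_channels holding the channel count rather than the batch count, which is exactly what add_quantize_per_channel_node() below passes for that case.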

 void add_quantize_per_tensor_node(
     ComputeGraph& graph,
     const ValueRef& input,
@@ -171,6 +215,99 @@ void add_quantize_per_token_node(
       resize_quantize_output));
 }

+void add_quantize_per_channel_node(
+    ComputeGraph& graph,
+    const ValueRef& input,
+    const ValueRef& scale,
+    const ValueRef& zero_point,
+    const ValueRef& axis,
+    const ValueRef& quant_min,
+    const ValueRef& quant_max,
+    const ValueRef& output) {
+  std::string kernel_name("quantize_per_channel");
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(input));
+  add_dtype_suffix(kernel_name, graph.dtype_of(input));
+  add_dtype_suffix(kernel_name, graph.dtype_of(output));
+
+  int axis_val = static_cast<int>(graph.get_int(axis));
+  int quant_min_val = static_cast<int>(graph.get_int(quant_min));
+  int quant_max_val = static_cast<int>(graph.get_int(quant_max));
+
+  // Normalize axis and convert from NCHW to WHCN using utility functions
+  const auto input_sizes = graph.sizes_of(input);
+  const int64_t ndim = graph.dim_of(input);
+
+  // Normalize axis to handle negative indices
+  axis_val = normalize(axis_val, ndim);
+
+  // Convert from NCHW axis to WHCN axis for the shader (Vulkan representation)
+  int axis_whcn = nchw_dim_to_whcn_dim(axis_val, ndim);
+
+  int num_channels;
+  if (axis_val == 0 && ndim == 4 && !graph.is_buffer_storage(input)) {
+    // For batch dimension quantization in 4D tensors, pass the actual number
+    // of channels so the shader can correctly unfold the batch-channel folding
+    num_channels = static_cast<int>(input_sizes[1]); // Channel dimension
+  } else {
+    num_channels = static_cast<int>(input_sizes[axis_val]);
+  }
+
+  vkapi::ParamsBindList param_ubos;
+  std::vector<PushConstantDataInfo> push_constants;
+
+  if (graph.is_buffer_storage(input)) {
+    param_ubos = {
+        graph.numel_ubo(input),
+        graph.sizes_ubo(input),
+        graph.strides_ubo(input),
+        graph.sizes_ubo(output),
+        graph.strides_ubo(output),
+    };
+    push_constants = {
+        PushConstantDataInfo(&axis_whcn, sizeof(int)),
+        PushConstantDataInfo(&num_channels, sizeof(int)),
+        PushConstantDataInfo(&quant_min_val, sizeof(int)),
+        PushConstantDataInfo(&quant_max_val, sizeof(int)),
+    };
+  } else {
+    param_ubos = {
+        graph.logical_limits_ubo(input),
+        graph.logical_limits_ubo(output),
+    };
+    push_constants = {
+        PushConstantDataInfo(&axis_whcn, sizeof(int)),
+        PushConstantDataInfo(&num_channels, sizeof(int)),
+        PushConstantDataInfo(&quant_min_val, sizeof(int)),
+        PushConstantDataInfo(&quant_max_val, sizeof(int)),
+    };
+  }
+
+  vkapi::SpecVarList spec_vars = {
+      graph.hashed_layout_of(output),
+      graph.hashed_layout_of(input),
+  };
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      quantize_per_channel_global_wg_size,
+      quantize_per_channel_local_wg_size,
+      // Inputs and Outputs
+      {{output, vkapi::kWrite},
+       {input, vkapi::kRead},
+       {{scale, zero_point}, vkapi::kRead}},
+      // Shader param buffers
+      param_ubos,
+      // Push Constants
+      push_constants,
+      // Specialization Constants
+      spec_vars,
+      // Resize Args
+      {},
+      // Resizing Logic
+      resize_quantize_output));
+}
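To make the axis bookkeeping above concrete, here is a hedged sketch of the normalization and NCHW-to-WHCN conversion, assuming the usual convention that WHCN dimension i corresponds to NCHW dimension ndim - 1 - i (the real normalize() and nchw_dim_to_whcn_dim() helpers remain the source of truth):

#include <cstdint>

inline int64_t normalize_axis(int64_t axis, int64_t ndim) {
  return axis < 0 ? axis + ndim : axis;
}

inline int64_t nchw_axis_to_whcn(int64_t nchw_axis, int64_t ndim) {
  return ndim - 1 - nchw_axis;
}

// Example: a {2, 3, 4, 5} NCHW input quantized along axis = -3:
//   normalize_axis(-3, 4) == 1, the C dimension, so 3 scale/zero-point pairs;
//   nchw_axis_to_whcn(1, 4) == 2, the value the shaders' axis == 2 branch expects.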
+
 void quantize_per_tensor_impl(
     ComputeGraph& graph,
     const std::vector<ValueRef>& args) {
@@ -272,12 +409,93 @@ void quantize_per_token_impl(
       graph, input, scale, zero_point, quant_min, quant_max, output);
 }

+void quantize_per_channel_impl(
+    ComputeGraph& graph,
+    const std::vector<ValueRef>& args) {
+  int arg_idx = 0;
+  const ValueRef input = args[arg_idx++];
+  const ValueRef scale = args[arg_idx++];
+  const ValueRef zero_point = args[arg_idx++];
+  const ValueRef axis = args[arg_idx++];
+  const ValueRef quant_min = args[arg_idx++];
+  const ValueRef quant_max = args[arg_idx++];
+  const ValueRef dtype = args[arg_idx++]; // Added dtype parameter
+  const ValueRef output = args[arg_idx++];
+
+  // Suppress unused variable warning - dtype is inferred from the output
+  (void)dtype;
+
+  // Check tensor types
+  VK_CHECK_COND(graph.val_is_tensor(input));
+  VK_CHECK_COND(graph.val_is_tensor(scale));
+  VK_CHECK_COND(graph.val_is_tensor(zero_point));
+  VK_CHECK_COND(graph.val_is_tensor(output));
+
+  // Verify input is a floating point type
+  VK_CHECK_COND(
+      graph.dtype_of(input) == vkapi::kDouble ||
+      graph.dtype_of(input) == vkapi::kFloat ||
+      graph.dtype_of(input) == vkapi::kHalf);
+
+  // Check that scale and zero_point have buffer storage and width packing
+  VK_CHECK_COND(graph.is_buffer_storage(scale));
+  VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim);
+  VK_CHECK_COND(graph.is_buffer_storage(zero_point));
+  VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim);
+
+  // Check that tensors with texture storage have a standard axis map
+  if (!graph.is_buffer_storage(input)) {
+    VK_CHECK_COND(graph.has_standard_axis_map(input));
+  }
+  if (!graph.is_buffer_storage(output)) {
+    VK_CHECK_COND(graph.has_standard_axis_map(output));
+  }
+
+  // Normalize axis
+  int axis_val = static_cast<int>(graph.get_int(axis));
+  const auto input_sizes = graph.sizes_of(input);
+  int64_t ndim = graph.dim_of(input);
+  if (axis_val < 0) {
+    axis_val += ndim;
+  }
+
+  // Verify axis is valid
+  VK_CHECK_COND(axis_val >= 0 && axis_val < ndim);
+
+  // Get number of channels along the specified axis
+  int64_t num_channels = input_sizes[axis_val];
+
+  const auto scale_sizes = graph.sizes_of(scale);
+  const auto zero_point_sizes = graph.sizes_of(zero_point);
+
+  // Calculate the total number of elements in the scale and zero_point tensors
+  int64_t scale_numel = 1;
+  for (size_t i = 0; i < scale_sizes.size(); i++) {
+    scale_numel *= scale_sizes[i];
+  }
+
+  int64_t zero_point_numel = 1;
+  for (size_t i = 0; i < zero_point_sizes.size(); i++) {
+    zero_point_numel *= zero_point_sizes[i];
+  }
+
+  // Check that the total number of elements matches num_channels
+  VK_CHECK_COND(scale_numel == num_channels);
+  VK_CHECK_COND(zero_point_numel == num_channels);
+
+  add_quantize_per_channel_node(
+      graph, input, scale, zero_point, axis, quant_min, quant_max, output);
+}
+
 REGISTER_OPERATORS {
   VK_REGISTER_OP(
       quantized_decomposed.quantize_per_tensor.default,
       quantize_per_tensor_impl);
   VK_REGISTER_OP(
       quantized_decomposed.quantize_per_token.default, quantize_per_token_impl);
+  VK_REGISTER_OP(
+      quantized_decomposed.quantize_per_channel.default,
+      quantize_per_channel_impl);
 }

 } // namespace vkcompute
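The scale_numel == num_channels check above encodes the shape contract of per-channel quantization: one (scale, zero_point) pair per slice along the chosen axis. A standalone CPU sketch of that contract for a contiguous NCHW tensor, for orientation only (the tests below compare against quantize_per_channel_aten, not against this):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<int32_t> quantize_per_channel_reference(
    const std::vector<float>& input,
    const std::vector<int64_t>& sizes, // contiguous NCHW sizes
    int64_t axis,                      // already normalized to [0, ndim)
    const std::vector<float>& scales,  // sizes[axis] entries
    const std::vector<int32_t>& zero_points,
    int32_t quant_min,
    int32_t quant_max) {
  // Elements that share an index along `axis` share one (scale, zero_point) pair.
  int64_t inner = 1;
  for (int64_t d = axis + 1; d < static_cast<int64_t>(sizes.size()); ++d) {
    inner *= sizes[d];
  }
  const int64_t axis_size = sizes[axis];

  std::vector<int32_t> out(input.size());
  for (int64_t i = 0; i < static_cast<int64_t>(input.size()); ++i) {
    const int64_t channel = (i / inner) % axis_size;
    const int32_t q =
        static_cast<int32_t>(std::round(input[i] / scales[channel])) +
        zero_points[channel];
    out[i] = std::min(std::max(q, quant_min), quant_max);
  }
  return out;
}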
diff --git a/backends/vulkan/test/op_tests/quantize_test.cpp b/backends/vulkan/test/op_tests/quantize_test.cpp
index 8c5246f6c0c..ebb12bc1b3a 100644
--- a/backends/vulkan/test/op_tests/quantize_test.cpp
+++ b/backends/vulkan/test/op_tests/quantize_test.cpp
@@ -473,6 +473,18 @@ void test_vulkan_quantize_per_token_impl(
     const vkcompute::utils::StorageType in_storage,
     const vkcompute::utils::StorageType out_storage);

+void test_vulkan_quantize_per_channel_impl(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t axis,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype,
+    at::ScalarType dtype,
+    const vkcompute::utils::StorageType in_storage,
+    const vkcompute::utils::StorageType out_storage);
+
 // Wrapper function to test both buffer and texture storage types
 void test_vulkan_quantize_per_tensor(
     const std::vector<int>& input_sizes,
@@ -553,6 +565,48 @@ void test_vulkan_quantize_per_token(
       vkcompute::utils::kTexture3D);
 }

+// Wrapper function to test both buffer and texture storage types
+void test_vulkan_quantize_per_channel(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t axis,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype = at::kFloat,
+    at::ScalarType dtype = at::kInt) {
+  // Test with buffer storage
+  test_vulkan_quantize_per_channel_impl(
+      input_sizes,
+      scales,
+      zero_points,
+      axis,
+      quant_min,
+      quant_max,
+      in_dtype,
+      dtype,
+      vkcompute::utils::kBuffer,
+      vkcompute::utils::kBuffer);
+
+  // If in_dtype is double, convert to float for the texture implementation,
+  // since texture storage does not support 64-bit inputs
+  if (in_dtype == at::kDouble) {
+    in_dtype = at::kFloat;
+  }
+
+  test_vulkan_quantize_per_channel_impl(
+      input_sizes,
+      scales,
+      zero_points,
+      axis,
+      quant_min,
+      quant_max,
+      in_dtype,
+      dtype,
+      vkcompute::utils::kTexture3D,
+      vkcompute::utils::kTexture3D);
+}
+
 void test_reference_quantize_per_tensor(
     const std::vector<int>& input_sizes,
     float scale,
@@ -1436,6 +1490,167 @@ void test_reference_quantize_per_channel(
   ASSERT_TRUE(output_correct);
 }

+void test_vulkan_quantize_per_channel_impl(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& pre_scales,
+    const std::vector<int>& zero_points,
+    int64_t axis,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype = at::kFloat,
+    at::ScalarType dtype = at::kInt,
+    const vkcompute::utils::StorageType in_storage =
+        vkcompute::utils::kTexture3D,
+    const vkcompute::utils::StorageType out_storage =
+        vkcompute::utils::kTexture3D) {
+  check_quantize_args(quant_min, quant_max, dtype);
+  check_quantize_per_channel_args(input_sizes, pre_scales, zero_points, axis);
+
+  std::vector<float> scales = pre_scales;
+  for (auto& s : scales) {
+    s = s < eps ? eps : s;
+  }
+
+  // Create input tensor with random values
+  std::vector<int64_t> input_sizes_int64(
+      input_sizes.begin(), input_sizes.end());
+  at::Tensor input =
+      at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype));
+  at::Tensor scale_tensor =
+      at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble));
+  at::Tensor zero_point_tensor =
+      at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong));
+
+  // Get reference output
+  at::Tensor reference_out = torch::executor::native::quantize_per_channel_aten(
+      input,
+      scale_tensor,
+      zero_point_tensor,
+      axis,
+      quant_min,
+      quant_max,
+      dtype);
+
+  using namespace vkcompute;
+
+  GraphConfig config;
+  config.set_storage_type_override(in_storage);
+  ComputeGraph graph(config);
+
+  IOValueRef r_input = graph.add_input_tensor(
+      input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage);
+  IOValueRef r_scale = graph.add_input_tensor(
+      scale_tensor.sizes().vec(),
+      vkapi::kFloat,
+      utils::kBuffer,
+      utils::kWidthPacked);
+  IOValueRef r_zero_point = graph.add_input_tensor(
+      zero_point_tensor.sizes().vec(),
+      vkapi::kInt,
+      utils::kBuffer,
+      utils::kWidthPacked);
+
+  const ValueRef r_axis = graph.add_scalar<int64_t>(axis);
+  const ValueRef r_quant_min = graph.add_scalar<int64_t>(quant_min);
+  const ValueRef r_quant_max = graph.add_scalar<int64_t>(quant_max);
+
+  const ValueRef r_out = graph.add_tensor(
+      input.sizes().vec(), from_at_scalartype(dtype), out_storage);
+
+  const ValueRef r_dtype =
+      graph.add_scalar<int64_t>(static_cast<int64_t>(dtype));
+
+  VK_GET_OP_FN("quantized_decomposed.quantize_per_channel.default")
+  (graph,
+   {
+       r_input.value,
+       r_scale.value,
+       r_zero_point.value,
+       r_axis,
+       r_quant_min,
+       r_quant_max,
+       r_dtype,
+       r_out,
+   });
+
+  ValueRef staging_out = graph.set_output_tensor(r_out);
+
+  graph.prepare();
+  graph.encode_prepack();
+  graph.prepack();
+  graph.encode_execute();
+
+  // Copy input data to GPU
+  graph.copy_into_staging(
+      r_input.staging, input.const_data_ptr(), input.numel());
+
+  // Convert scale tensor to float and copy to GPU
+  at::Tensor scale_float = scale_tensor.to(at::kFloat);
+  graph.copy_into_staging(
+      r_scale.staging, scale_float.const_data_ptr(), scale_float.numel());
+
+  // Convert zero_point tensor to int and copy to GPU
+  at::Tensor zero_point_int = zero_point_tensor.to(at::kInt);
+  graph.copy_into_staging(
+      r_zero_point.staging,
+      zero_point_int.const_data_ptr(),
+      zero_point_int.numel());
+
+  // Execute the graph
+  graph.execute();
+
+  // Copy output data back to CPU
+  at::Tensor vk_out = at::empty_like(reference_out).contiguous();
+  graph.copy_from_staging(
+      staging_out, vk_out.mutable_data_ptr(), vk_out.numel());
+
+  // Compare outputs
+  at::Tensor reference_int = reference_out.to(at::kInt);
+  at::Tensor vk_int = vk_out.to(at::kInt);
+
+  // Tolerance is 1 to address rounding errors and fp math differences between
+  // CPU/GPU
+  const bool output_correct =
+      at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
+  if (!output_correct) {
+    at::Tensor diffs = at::abs(reference_int - vk_int);
+
+    std::cout << "\n"
+              << "Failed with parameters: " << std::endl;
+    std::cout << "  axis: " << axis << std::endl;
+    std::cout << "  input sizes:";
+    for (size_t i = 0; i < input_sizes.size(); i++) {
+      std::cout << " " << input_sizes[i] << " ";
+    }
+    std::cout << "" << std::endl;
+    std::cout << "  scale(s):";
+    for (size_t i = 0; i < scales.size(); i++) {
+      std::cout << " " << scales[i] << " ";
+    }
+    std::cout << "" << std::endl;
+    std::cout << "  zero_point(s):";
+    for (size_t i = 0; i < zero_points.size(); i++) {
+      std::cout << " " << zero_points[i] << " ";
+    }
+    std::cout << "" << std::endl;
+    std::cout << "  quant_min: " << quant_min << std::endl;
+    std::cout << "  quant_max: " << quant_max << std::endl;
+    std::cout << "  storage type: "
+              << (in_storage == vkcompute::utils::kBuffer ? "buffer"
+                                                          : "texture")
+              << std::endl;
+
+    std::cout << "input:" << std::endl;
+    std::cout << input << std::endl;
+    std::cout << "reference:" << std::endl;
+    std::cout << reference_int << std::endl;
+    std::cout << "vulkan:" << std::endl;
+    std::cout << vk_int << std::endl;
+  }
+
+  ASSERT_TRUE(output_correct);
+}
+
 TEST(
     VulkanQuantizePerChannelTest,
     test_reference_quantize_per_channel_float_to_int8_3D_axis0) {
@@ -1503,3 +1718,413 @@ TEST(
     at::kFloat,
     at::kChar);
 }
+
+// END OF REFERENCE TESTS
+
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_float_to_int8_axis0) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  std::vector<float> scales(9, 0.1f);
+  std::vector<int> zero_points(9, 2);
+
+  // 1D Tensor
+  test_vulkan_quantize_per_channel(
+      {9}, // input sizes
+      scales,
+      zero_points,
+      0, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+
+  // 2D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14}, // input sizes
+      scales,
+      zero_points,
+      0, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+
+  // 3D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 7, 11}, // input sizes
+      scales,
+      zero_points,
+      0, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 17, 5, 5}, // input sizes
+      scales,
+      zero_points,
+      0, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+
+  // 4D Tensor (negative axis)
+  test_vulkan_quantize_per_channel(
+      {5, 17, 5, 9}, // input sizes
+      scales,
+      zero_points,
+      -1, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+}
+
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_float_to_int8_axis1) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  std::vector<float> scales(14, 0.001f);
+  std::vector<int> zero_points(14, -5);
+
+  // 2D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14}, // input sizes
+      scales,
+      zero_points,
+      1, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+
+  // 3D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14, 11}, // input sizes
+      scales,
+      zero_points,
+      1, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14, 5, 5}, // input sizes
+      scales,
+      zero_points,
+      1, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+
+  // 4D Tensor (negative axis)
+  test_vulkan_quantize_per_channel(
+      {9, 7, 14, 5}, // input sizes
+      scales,
+      zero_points,
+      -2, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+}
+
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_float_to_int8_axis2) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  std::vector<float> scales(11, 0.5f);
+  std::vector<int> zero_points(11, 12);
+
+  // 3D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14, 11}, // input sizes
+      scales,
+      zero_points,
+      2, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14, 11, 5}, // input sizes
+      scales,
+      zero_points,
+      2, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+
+  // 4D Tensor (negative axis)
+  test_vulkan_quantize_per_channel(
+      {9, 11, 14, 5}, // input sizes
+      scales,
+      zero_points,
+      -3, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+}
+
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_float_to_int8_axis3) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  std::vector<float> scales(7, 0.5f);
+  std::vector<int> zero_points(7, 12);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14, 11, 7}, // input sizes
+      scales,
+      zero_points,
+      3, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+
+  // 4D Tensor (negative axis)
+  test_vulkan_quantize_per_channel(
+      {7, 14, 11, 7}, // input sizes
+      scales,
+      zero_points,
+      -4, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+}
+
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_float_to_uint8_comprehensive) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  std::vector<float> scales = {0.1, 0.2, 0.0001, 0.5, 0.02};
+  std::vector<int> zero_points = {0, 5, -5, 1, 12};
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {5, 14, 11, 7}, // input sizes
+      scales,
+      zero_points,
+      0, // axis
+      0, // quant_min
+      255, // quant_max
+      at::kFloat,
+      at::kByte);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 5, 11, 7}, // input sizes
+      scales,
+      zero_points,
+      1, // axis
+      0, // quant_min
+      255, // quant_max
+      at::kFloat,
+      at::kByte);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14, 5, 7}, // input sizes
+      scales,
+      zero_points,
+      2, // axis
+      0, // quant_min
+      255, // quant_max
+      at::kFloat,
+      at::kByte);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14, 11, 5}, // input sizes
+      scales,
+      zero_points,
+      3, // axis
+      0, // quant_min
+      255, // quant_max
+      at::kFloat,
+      at::kByte);
+
+  // 4D Tensor (negative axis)
+  test_vulkan_quantize_per_channel(
+      {5, 14, 11, 7}, // input sizes
+      scales,
+      zero_points,
+      -4, // axis
+      0, // quant_min
+      255, // quant_max
+      at::kFloat,
+      at::kByte);
+}
+
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_half_to_8bit) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_float16_buffers_support()) {
+    GTEST_SKIP();
+  }
+  std::vector<float> scales = {0.1, 0.2, 0.01, 0.5, 0.02};
+  std::vector<int> zero_points = {0, 5, 5, 1, 12};
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {5, 14, 11, 7}, // input sizes
+      scales,
+      zero_points,
+      0, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kHalf,
+      at::kChar);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 5, 11, 7}, // input sizes
+      scales,
+      zero_points,
+      1, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kHalf,
+      at::kChar);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14, 5, 7}, // input sizes
+      scales,
+      zero_points,
+      2, // axis
+      0, // quant_min
+      255, // quant_max
+      at::kHalf,
+      at::kByte);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14, 11, 5}, // input sizes
+      scales,
+      zero_points,
+      3, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kHalf,
+      at::kChar);
+
+  // 4D Tensor (negative axis)
+  test_vulkan_quantize_per_channel(
+      {5, 14, 11, 7}, // input sizes
+      scales,
+      zero_points,
+      -4, // axis
+      0, // quant_min
+      255, // quant_max
+      at::kHalf,
+      at::kByte);
+}
+
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_double_to_8bit) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  std::vector<float> scales = {0.1, 0.2, 0.01, 0.5, 0.02};
+  std::vector<int> zero_points = {0, 5, 5, 1, 12};
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {5, 14, 11, 7}, // input sizes
+      scales,
+      zero_points,
+      0, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kDouble,
+      at::kChar);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 5, 11, 7}, // input sizes
+      scales,
+      zero_points,
+      1, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kDouble,
+      at::kChar);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14, 5, 7}, // input sizes
+      scales,
+      zero_points,
+      2, // axis
+      0, // quant_min
+      255, // quant_max
+      at::kDouble,
+      at::kByte);
+
+  // 4D Tensor
+  test_vulkan_quantize_per_channel(
+      {9, 14, 11, 5}, // input sizes
+      scales,
+      zero_points,
+      3, // axis
+      -128, // quant_min
+      127, // quant_max
+      at::kDouble,
+      at::kChar);
+
+  // 4D Tensor (negative axis)
+  test_vulkan_quantize_per_channel(
+      {5, 14, 11, 7}, // input sizes
+      scales,
+      zero_points,
+      -4, // axis
+      0, // quant_min
+      255, // quant_max
+      at::kDouble,
+      at::kByte);
+}