Skip to content

Commit c908bc2

Browse files
author
morelos
committed
[ET-VK][Ops] quantize_per_tensor.default test setup
Creating a quantize_per_tensor testing framework along with a reference implementation for testing. Differential Revision: [D75959065](https://our.internmc.facebook.com/intern/diff/D75959065/) [ghstack-poisoned]
1 parent e9a861a commit c908bc2

File tree

1 file changed

+252
-0
lines changed

1 file changed

+252
-0
lines changed

backends/vulkan/test/op_tests/quantize_test.cpp

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,56 @@ void check_quantize_args(
275275
" actual quant_max: ",
276276
quant_max);
277277
}
278+
279+
//
280+
// Reference Implementation
281+
//
282+
283+
/*
284+
* Reference implementation of quantize_per_tensor
285+
*/
286+
at::Tensor quantize_per_tensor_reference_impl(
287+
const at::Tensor& input,
288+
double scale,
289+
int64_t zero_point,
290+
int64_t quant_min,
291+
int64_t quant_max,
292+
at::ScalarType dtype) {
293+
// Create output tensor with the target dtype
294+
at::Tensor out = at::empty_like(input, dtype);
295+
296+
// Quantize the input tensor
297+
float inv_scale = 1.0 / scale;
298+
299+
// Iterate through the tensor and quantize each element
300+
at::Tensor float_input = input.to(at::kFloat);
301+
at::Tensor float_values = float_input.flatten();
302+
303+
auto out_flat = out.flatten();
304+
305+
for (int i = 0; i < float_values.numel(); i++) {
306+
float value = float_values[i].item<float>();
307+
int64_t qvalue = zero_point + std::nearbyint(inv_scale * value);
308+
309+
qvalue = std::max<int64_t>(qvalue, quant_min);
310+
qvalue = std::min<int64_t>(qvalue, quant_max);
311+
312+
if (dtype == at::kByte) {
313+
out_flat[i] = static_cast<uint8_t>(qvalue);
314+
} else if (dtype == at::kChar) {
315+
out_flat[i] = static_cast<int8_t>(qvalue);
316+
} else if (dtype == at::kShort) {
317+
out_flat[i] = static_cast<int16_t>(qvalue);
318+
} else if (dtype == at::kInt) {
319+
out_flat[i] = static_cast<int32_t>(qvalue);
320+
} else if (dtype == at::kLong) {
321+
out_flat[i] = static_cast<int64_t>(qvalue);
322+
}
323+
}
324+
325+
return out.reshape(input.sizes());
326+
}
327+
278328
/*
279329
* Reference implementation of quantize_per_token
280330
*/
@@ -337,6 +387,17 @@ at::Tensor quantize_per_token_reference_impl(
337387
return out;
338388
}
339389

390+
// Forward declaration so the storage-type wrapper below can dispatch to the
// implementation, which is defined later in this file.
void test_vulkan_quantize_per_tensor_impl(
    const std::vector<int>& input_sizes,
    float scale,
    int zero_point,
    int64_t quant_min,
    int64_t quant_max,
    at::ScalarType dtype,
    const vkcompute::utils::StorageType in_storage,
    const vkcompute::utils::StorageType out_storage);
400+
340401
void test_vulkan_quantize_per_token_impl(
341402
const std::vector<int>& input_sizes,
342403
const std::vector<float>& scales,
@@ -347,6 +408,37 @@ void test_vulkan_quantize_per_token_impl(
347408
const vkcompute::utils::StorageType in_storage,
348409
const vkcompute::utils::StorageType out_storage);
349410

411+
// Wrapper function to test both buffer and texture storage types
412+
void test_vulkan_quantize_per_tensor(
413+
const std::vector<int>& input_sizes,
414+
float scale,
415+
int zero_point,
416+
int64_t quant_min,
417+
int64_t quant_max,
418+
at::ScalarType dtype) {
419+
// Test with buffer storage
420+
test_vulkan_quantize_per_tensor_impl(
421+
input_sizes,
422+
scale,
423+
zero_point,
424+
quant_min,
425+
quant_max,
426+
dtype,
427+
vkcompute::utils::kBuffer,
428+
vkcompute::utils::kBuffer);
429+
430+
// Test with texture storage
431+
test_vulkan_quantize_per_tensor_impl(
432+
input_sizes,
433+
scale,
434+
zero_point,
435+
quant_min,
436+
quant_max,
437+
dtype,
438+
vkcompute::utils::kTexture3D,
439+
vkcompute::utils::kTexture3D);
440+
}
441+
350442
// Wrapper function to test both buffer and texture storage types
351443
void test_vulkan_quantize_per_token(
352444
const std::vector<int>& input_sizes,
@@ -378,6 +470,166 @@ void test_vulkan_quantize_per_token(
378470
vkcompute::utils::kTexture3D);
379471
}
380472

473+
void test_reference_quantize_per_tensor(
474+
const std::vector<int>& input_sizes,
475+
float scale,
476+
int zero_point,
477+
int64_t quant_min,
478+
int64_t quant_max,
479+
at::ScalarType dtype) {
480+
check_quantize_args(quant_min, quant_max, dtype);
481+
std::vector<int64_t> input_sizes_int64(
482+
input_sizes.begin(), input_sizes.end());
483+
at::Tensor input =
484+
at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat));
485+
486+
// Fill with a simple pattern: values from 0 to 1 in steps
487+
float step = 1.0f / (input.numel() - 1);
488+
auto flat_input = input.flatten();
489+
for (int i = 0; i < flat_input.numel(); i++) {
490+
flat_input[i] = i * step;
491+
}
492+
493+
// Reshape back to original dimensions
494+
input = flat_input.reshape(input_sizes_int64);
495+
496+
// Get reference output
497+
at::Tensor reference_out = quantize_per_tensor_reference_impl(
498+
input, scale, zero_point, quant_min, quant_max, dtype);
499+
500+
// Get implementation output
501+
at::Tensor impl_out = torch::executor::native::quantize_per_tensor_aten(
502+
input, scale, zero_point, quant_min, quant_max, dtype);
503+
504+
// Convert to int for consistent display regardless of underlying type
505+
at::Tensor reference_int = reference_out.to(at::kInt);
506+
at::Tensor impl_int = impl_out.to(at::kInt);
507+
508+
const bool output_correct = at::equal(reference_int, impl_int);
509+
if (!output_correct) {
510+
at::Tensor diffs = at::abs(reference_int - impl_int);
511+
512+
std::cout << "\n"
513+
<< "Failed with parameters: " << std::endl;
514+
std::cout << " scale: " << scale << std::endl;
515+
std::cout << " zero_point: " << zero_point << std::endl;
516+
std::cout << " quant_min: " << quant_min << std::endl;
517+
std::cout << " quant_max: " << quant_max << std::endl;
518+
519+
std::cout << "input:" << std::endl;
520+
std::cout << input << std::endl;
521+
std::cout << "reference:" << std::endl;
522+
std::cout << reference_int << std::endl;
523+
std::cout << "my_reference:" << std::endl;
524+
std::cout << impl_int << std::endl;
525+
}
526+
527+
ASSERT_TRUE(output_correct);
528+
}
529+
530+
/*
 * Runs quantize_per_tensor through the Vulkan compute graph with the given
 * storage types and compares the result against the ATen-backed reference
 * operator on random float input. Fails via ASSERT_TRUE on mismatch.
 *
 * NOTE: default storage arguments appear here rather than on the forward
 * declaration earlier in the file; C++ permits adding defaults on a
 * redeclaration in the same scope.
 */
void test_vulkan_quantize_per_tensor_impl(
    const std::vector<int>& input_sizes,
    float scale,
    int zero_point,
    int64_t quant_min,
    int64_t quant_max,
    at::ScalarType dtype,
    const vkcompute::utils::StorageType in_storage =
        vkcompute::utils::kTexture3D,
    const vkcompute::utils::StorageType out_storage =
        vkcompute::utils::kTexture3D) {
  check_quantize_args(quant_min, quant_max, dtype);
  std::vector<int64_t> input_sizes_int64(
      input_sizes.begin(), input_sizes.end());
  at::Tensor input =
      at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat));

  // Get reference output
  at::Tensor reference_out = torch::executor::native::quantize_per_tensor_aten(
      input, scale, zero_point, quant_min, quant_max, dtype);

  // Build Vulkan quantize_per_tensor graph
  using namespace vkcompute;

  GraphConfig config;
  config.set_storage_type_override(in_storage);
  ComputeGraph graph(config);

  // Input tensor plus its host-visible staging buffer
  IOValueRef r_input = graph.add_input_tensor(
      input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage);

  // Scalar operator arguments; scale is widened to double to match the
  // quantize_per_tensor.default op schema
  const ValueRef r_scale = graph.add_scalar<double>(scale);
  const ValueRef r_zero_point = graph.add_scalar<int64_t>(zero_point);
  const ValueRef r_quant_min = graph.add_scalar<int64_t>(quant_min);
  const ValueRef r_quant_max = graph.add_scalar<int64_t>(quant_max);

  // Output tensor carries the quantized dtype, possibly a different storage
  // type than the input
  const ValueRef r_out = graph.add_tensor(
      input.sizes().vec(), from_at_scalartype(dtype), out_storage);

  // Look up and record the op; argument order must match the op schema
  VK_GET_OP_FN("quantize_per_tensor.default")
  (graph,
   {
       r_input.value,
       r_scale,
       r_zero_point,
       r_quant_min,
       r_quant_max,
       r_out,
   });

  ValueRef staging_out = graph.set_output_tensor(r_out);

  // Graph lifecycle: this sequence (prepare -> encode_prepack -> prepack ->
  // encode_execute) must run in order before execute() is valid
  graph.prepare();
  graph.encode_prepack();
  graph.prepack();
  graph.encode_execute();

  // Run Vulkan quantize_per_tensor
  graph.copy_into_staging(
      r_input.staging, input.const_data_ptr(), input.numel());

  graph.execute();

  // Read the result back into a host tensor shaped/typed like the reference
  at::Tensor vk_out = at::empty_like(reference_out).contiguous();
  graph.copy_from_staging(
      staging_out, vk_out.mutable_data_ptr(), vk_out.numel());

  // Compare outputs
  // For quantized types, we need to compare the actual integer values
  at::Tensor reference_int = reference_out.to(at::kInt);
  at::Tensor vk_int = vk_out.to(at::kInt);

  const bool output_correct = at::equal(reference_int, vk_int);
  if (!output_correct) {
    at::Tensor diffs = at::abs(reference_int - vk_int);

    std::cout << "\n"
              << "Failed with parameters: " << std::endl;
    std::cout << "  scale: " << scale << std::endl;
    std::cout << "  zero_point: " << zero_point << std::endl;
    std::cout << "  quant_min: " << quant_min << std::endl;
    std::cout << "  quant_max: " << quant_max << std::endl;

    std::cout << "input:" << std::endl;
    std::cout << input << std::endl;
    std::cout << "reference:" << std::endl;
    std::cout << reference_int << std::endl;
    std::cout << "vulkan:" << std::endl;
    std::cout << vk_int << std::endl;
  }

  ASSERT_TRUE(output_correct);
}
623+
624+
// Sanity-check the local reference implementation against the ATen-backed
// operator for a small int8 tensor over the full [-128, 127] range.
TEST(VulkanQuantizePerTensorTest, test_reference_quantize_per_tensor_int8) {
  test_reference_quantize_per_tensor(
      {2, 3, 4}, // input sizes
      0.1, // scale
      0, // zero_point
      -128, // quant_min
      127, // quant_max
      at::kChar);
}
381633
void test_reference_quantize_per_token(
382634
const std::vector<int>& input_sizes,
383635
const std::vector<float>& scales,

0 commit comments

Comments
 (0)