[ET-VK][ez] Address regressed conv2d perf numbers on main (#16084)

pytorchbot · web-flow · commit 18c1c5b515c4 · 2025-12-04T15:01:56.000-05:00
Address the benchmark binaries reporting worse performance than one month ago. This was not a "real" regression; it was caused by changes to the benchmark binaries that were made during debugging and were not reverted before landing: (1) only 1 benchmark iteration was being run, without any warmup iterations; (2) the quantize/dequantize shaders would normally be excluded from the overall execution time / FLOPS calculation, but these shaders were recently renamed and the logic that filters them out when reporting time was not updated accordingly. Also includes a small fix to the input data loading logic: in the conv2d input tile load shader, tile entries whose x index falls outside the input width are now set to zero instead of being left unwritten. Differential Revision: [D88381899](https://our.internmc.facebook.com/intern/diff/D88381899/)
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh
@@ -39,6 +39,8 @@ void load_fp_input_tile(
   [[unroll]] for (int w = 0; w < TILE_M; w++) {
     if (load_tidx.data.x < input_sizes.x) {
       tile.data[w][0] = load_fp_input_texel(load_tidx);
+    } else {
+      tile.data[w][0] = VEC4_T(0);
     }
     load_tidx.data.x++;
   }
diff --git a/backends/vulkan/test/custom_ops/q4gsw_linear.cpp b/backends/vulkan/test/custom_ops/q4gsw_linear.cpp
@@ -552,7 +552,7 @@ int main(int argc, char* argv[]) {
       generate_quantized_linear_test_cases,
       quantized_linear_flop_calculator,
       "QuantizedLinearQ4GSW",
-      10,
+      3,
       10,
       ref_fn);
 
diff --git a/backends/vulkan/test/custom_ops/q8csw_linear.cpp b/backends/vulkan/test/custom_ops/q8csw_linear.cpp
@@ -471,7 +471,7 @@ int main(int argc, char* argv[]) {
       generate_quantized_linear_test_cases,
       quantized_linear_flop_calculator,
       "QuantizedLinear",
-      0,
+      3,
       10,
       ref_fn);
 
diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp
@@ -626,8 +626,8 @@ int main(int argc, char* argv[]) {
       generate_quantized_conv2d_test_cases,
       quantized_conv2d_flop_calculator,
       "QuantizedConv2dQ8ToQ8To",
-      0,
-      1,
+      3,
+      10,
       ref_fn);
 
   return 0;
diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp
@@ -584,8 +584,8 @@ int main(int argc, char* argv[]) {
       generate_quantized_conv2d_dw_test_cases,
       quantized_conv2d_dw_flop_calculator,
       "QuantizedDepthwiseInt8Conv2d",
-      0,
-      1,
+      3,
+      10,
       ref_fn);
 
   return 0;
diff --git a/backends/vulkan/test/custom_ops/utils.cpp b/backends/vulkan/test/custom_ops/utils.cpp
@@ -662,10 +662,9 @@ float collect_gpu_timing_us(ComputeGraph& graph) {
     for (const auto& shader_result : results) {
       if (shader_result.kernel_name.find("nchw_to") == std::string::npos &&
           shader_result.kernel_name.find("to_nchw") == std::string::npos &&
-          shader_result.kernel_name.find(
-              "quantize_and_pack_q8ta_conv2d_input") == std::string::npos &&
-          shader_result.kernel_name.find(
-              "unpack_and_dequantize_q8ta_conv2d_output") ==
+          shader_result.kernel_name.find("quantize_and_pack_4w4c") ==
+              std::string::npos &&
+          shader_result.kernel_name.find("unpack_4w4c_and_dequantize") ==
               std::string::npos) {
         // Calculate duration from start and end times, convert from ns to μs
         uint64_t duration_ns =

Original file line number	Diff line number	Diff line change
`@@ -39,6 +39,8 @@ void load_fp_input_tile(`
`39`	`39`	`[[unroll]] for (int w = 0; w < TILE_M; w++) {`
`40`	`40`	`if (load_tidx.data.x < input_sizes.x) {`
`41`	`41`	`tile.data[w][0] = load_fp_input_texel(load_tidx);`
	`42`	`+ } else {`
	`43`	`+ tile.data[w][0] = VEC4_T(0);`
`42`	`44`	`}`
`43`	`45`	`load_tidx.data.x++;`
`44`	`46`	`}`