Update on "[ET-VK] Add custom VkInt4WeightOnlyQuantizer for vulkan"

SS-JIA · SS-JIA · commit fcd9a211eee0 · 2024-10-15T10:47:15.000-07:00
## Context

This diff adds the `VkInt4WeightOnlyQuantizer` class, which enables 4-bit quantization of linear layers via source transformation. This quantizer class is copied from `torchao.quantization.GPTQ.WeightOnlyInt4Linear`, with some minor changes as annotated in the implementation.

Note that the pt2e quantization flow does not yet support groupwise quantization, so source transformation is the only way to perform groupwise quantization at the moment.

Differential Revision: [D64406457](https://our.internmc.facebook.com/intern/diff/D64406457/)

[ghstack-poisoned]
diff --git a/backends/vulkan/_passes/int4_weight_only_quantizer.py b/backends/vulkan/_passes/int4_weight_only_quantizer.py
@@ -4,6 +4,10 @@
 import torch
 import torch.nn.functional as F
 
+from executorch.backends.vulkan._passes.custom_ops_defs import (  # noqa
+    linear_weight_int4_op,
+)
+
 from torchao.quantization.GPTQ import _check_linear_int4_k
 from torchao.quantization.unified import Quantizer
 from torchao.quantization.utils import groupwise_affine_quantize_tensor
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
@@ -209,11 +209,7 @@ void add_q_4w_linear_node(
   ubos.append(graph.strides_ubo(mat2));
   ubos.append(graph.strides_ubo(scales_and_zeros));
 
-  auto out_sizes = graph.sizes_of(out);
-  uint32_t N = utils::val_at(-1, out_sizes);
-  uint32_t M = utils::val_at(-2, out_sizes);
-
-  utils::uvec3 global_wg_size = {N, M, 1};
+  utils::uvec3 global_wg_size = graph.logical_limits_of(out);
   utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size);
 
   graph.execute_nodes().emplace_back(new DispatchNode(