Update on "[ET-VK] Fix implementation of int4 quantized linear"
## Context
Fix the existing implementation of int4 weight quantized linear to conform with how the `_weight_int4pack_mm` op works in the ATen library.
For some additional context, the current op implementation does not actually match the behaviour of `_weight_int4pack_mm`. The ATen op expects that the weights have already been packed into a specific format, with `inner_k_tiles` as a packing parameter; this packing is performed by calling the `_convert_weight_to_int4pack` operator. The current Vulkan implementation is therefore equivalent to calling `_convert_weight_to_int4pack` followed by `_weight_int4pack_mm`. To address this discrepancy, as of this diff the operator implementation is registered under the `linear_weight_int4` custom op.
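For reference, a minimal sketch of the two-step ATen calling convention described above. The sizes, dtypes, and packing parameters here are illustrative assumptions, not values from this diff, and the exact input format `_convert_weight_to_int4pack` expects has changed across PyTorch versions:

```python
import torch

# Illustrative sizes (assumptions, not taken from the diff).
M, K, N = 4, 256, 256
group_size, inner_k_tiles = 32, 8

x = torch.rand(M, K, dtype=torch.bfloat16)
# One int4 value per element; note that recent PyTorch versions instead expect
# a uint8 tensor of shape (N, K // 2) with two nibbles packed per byte.
weight = torch.randint(0, 16, (N, K), dtype=torch.int32)

# Step 1: pack the weights into the ATen-specific int4 layout.
weight_packed = torch._convert_weight_to_int4pack(weight, inner_k_tiles)

# Step 2: run the packed matmul. Note the (K // group_size, N, 2) layout of
# the scales and zeros tensor, matching the size fix described below.
scales_and_zeros = torch.rand(K // group_size, N, 2, dtype=torch.bfloat16)
out = torch._weight_int4pack_mm(x, weight_packed, group_size, scales_and_zeros)
```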
The problems with the existing implementation were as follows:
* The expected size of the scales and zeros tensor was incorrect. Previously, the size was assumed to be `(2, N, num_groups)`, but the correct size is `(num_groups, N, 2)`
* Previously, when unpacking a `uint8_t` into two int4 values, it was assumed that the LSB held the first value and the MSB held the second. This ordering should be flipped: the MSB holds the first value and the LSB holds the second (see the sketch after this list)
* The original implementation expected the output tensor to be channels packed, but in practice the output tensor should be width packed
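To make the nibble-ordering fix concrete, here is a small sketch of the corrected unpacking logic; the helper name is hypothetical and not code from the diff:

```python
def unpack_int4_pair(byte_val: int) -> tuple[int, int]:
    """Unpack one byte into two int4 values, MSB nibble first."""
    first = (byte_val >> 4) & 0xF  # upper nibble holds the first value
    second = byte_val & 0xF        # lower nibble holds the second value
    return first, second
```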
This diff addresses the above issues and introduces a dedicated test binary that tests against an equivalent reference implementation expressed with ATen functions.
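The actual test code is not shown here, but a reference implementation of this shape can be sketched in a few lines of PyTorch, assuming the tinygemm-style convention where a quantized value `q` dequantizes as `(q - 8) * scale + zero`; the function and argument names below are hypothetical:

```python
import torch

def reference_int4_linear(x, weight_int4, scales_and_zeros, group_size):
    """Dequantize group-wise int4 weights, then perform a plain matmul.

    x:                (M, K) activation
    weight_int4:      (N, K) tensor with values in [0, 15]
    scales_and_zeros: (K // group_size, N, 2), last dim = (scale, zero)
    """
    # Expand per-group scales/zeros to per-element (N, K) tensors.
    scales = scales_and_zeros[:, :, 0].t().repeat_interleave(group_size, dim=1)
    zeros = scales_and_zeros[:, :, 1].t().repeat_interleave(group_size, dim=1)
    weight = (weight_int4.to(x.dtype) - 8) * scales + zeros
    return torch.mm(x, weight.t())
```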
Differential Revision: [D64354773](https://our.internmc.facebook.com/intern/diff/D64354773/)
[ghstack-poisoned]