From 8d603f5790ca3f15701e5e8d9817dfeff49b5589 Mon Sep 17 00:00:00 2001
From: Stephen Jia
Date: Thu, 14 Nov 2024 08:38:11 -0800
Subject: [PATCH 1/2] [ET-VK] Enforce GPU buffer limit when partitioning

Pull Request resolved: https://github.com/pytorch/executorch/pull/6829

## Context

In Vulkan, there is a limit on the number of elements a GPU buffer can have. If a GPU buffer exceeds this limit, the API will either produce an error or exhibit undefined behaviour.

## Changes

Along with `texture_limits`, introduce a configurable `buffer_limit` entry in the partitioner configuration.

ghstack-source-id: 253568943
Differential Revision: [D65899828](https://our.internmc.facebook.com/intern/diff/D65899828/)
---
 .../vulkan/partitioner/vulkan_partitioner.py | 20 +++++++++++++++++--
 backends/vulkan/utils.py                     | 17 ++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py
index 64e672fd695..cb14e96962d 100644
--- a/backends/vulkan/partitioner/vulkan_partitioner.py
+++ b/backends/vulkan/partitioner/vulkan_partitioner.py
@@ -51,11 +51,15 @@ class VulkanSupportedOperators(OperatorSupportBase):
 
     def __init__(
-        self, texture_limits: utils.ImageExtents, require_dynamic_shape: bool = False
+        self,
+        texture_limits: utils.ImageExtents,
+        buffer_limit: int,
+        require_dynamic_shape: bool = False,
     ) -> None:
         super().__init__()
-        self.require_dynamic_shapes = require_dynamic_shape
         self.texture_limits: utils.ImageExtents = texture_limits
+        self.buffer_limit = buffer_limit
+        self.require_dynamic_shapes = require_dynamic_shape
 
     def op_node_is_compatible(
         self, node: torch.fx.Node, features: Optional[OpFeatures] = None
@@ -83,6 +87,7 @@ def op_node_is_compatible(
             node, self.texture_limits
         )
 
+        can_use_buffers = utils.within_buffer_limit(node, self.buffer_limit)
         for i, arg in enumerate(node.args):
             if (
                 isinstance(arg, torch.fx.Node)
@@ -95,10 +100,19 @@ def op_node_is_compatible(
                 valid_texture_layouts = valid_texture_layouts.intersection(
                     arg_texture_layouts
                 )
+                can_use_buffers = can_use_buffers and utils.within_buffer_limit(
+                    arg, self.buffer_limit
+                )
 
         # If there are no valid texture memory layouts, then buffer storage must be
         # supported by the operator implementation.
         if len(valid_texture_layouts) == 0:
+            if not can_use_buffers:
+                return (
+                    False,
+                    f"op requires buffers that exceed the buffer limit ({self.buffer_limit})",
+                )
+
             compatible = VkStorageType.BUFFER in features.supported_storage_types()
             reason = "op is compatible"
             if not compatible:
@@ -309,10 +323,12 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         texture_limits: utils.ImageExtents = self.options.get(
             "texture_limits", utils.DEFAULT_TEXTURE_LIMITS
         )
+        buffer_limit: int = self.options.get("buffer_limit", utils.DEFAULT_BUFFER_LIMIT)
         capability_partitioner = CapabilityBasedPartitioner(
             exported_program.graph_module,
             VulkanSupportedOperators(
                 texture_limits,
+                buffer_limit,
                 require_dynamic_shape=self.options.get("require_dynamic_shapes", False),
             ),
             allows_single_node_partition=True,
diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py
index 2e9fbba01c7..a6db780309d 100644
--- a/backends/vulkan/utils.py
+++ b/backends/vulkan/utils.py
@@ -87,6 +87,7 @@ def is_tensor_node(node: torch.fx.Node) -> bool:
 ImageExtents = Tuple[int, int, int]
 
 DEFAULT_TEXTURE_LIMITS = (16384, 16384, 2048)
+DEFAULT_BUFFER_LIMIT = 128 * (1024 * 1024)
 
 
 class PackedDim(IntEnum):
@@ -113,6 +114,22 @@ class PackedDim(IntEnum):
 }
 
 
+def within_buffer_limit(node: torch.fx.Node, buffer_limit: int) -> bool:
+    """
+    Checks whether the tensors produced by the given node can fit within the device's
+    GPU buffer limit, which represents the maximum number of elements that can be stored
+    in a GPU buffer.
+    """
+    assert is_tensor_node(node)
+
+    if isinstance(node.meta["val"], FakeTensor):
+        return node.meta["val"].numel() < buffer_limit
+    elif isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple):
+        return all(x.numel() < buffer_limit for x in node.meta["val"])
+    else:
+        raise RuntimeError(f"Cannot get numel for val of type {type(node.meta['val'])}")
+
+
 def required_image_extents(sizes: torch.Size, layout: VkMemoryLayout) -> ImageExtents:
     """
     Calculate the image extents that will be used to represent a tensor with the given sizes

From 5d6e50864976e14afb59a237600af32fc9d0ac9b Mon Sep 17 00:00:00 2001
From: Stephen Jia
Date: Thu, 14 Nov 2024 08:38:12 -0800
Subject: [PATCH 2/2] [ET-VK][Llama] Apply XNNPACK partitioner as well when lowering to Vulkan

Pull Request resolved: https://github.com/pytorch/executorch/pull/6830

## Context

The final logit linear layer in the Transformer architecture involves extremely large tensors, since both the weight and output tensors have a dimension equal to the vocabulary size, which can be very large. Because of this, image textures cannot be used to execute the op when running with the Vulkan delegate, so an implementation using buffer-based tensors must be used. Unfortunately, Vulkan does not currently have a performant implementation of linear with buffer-based tensors. As a result, if this final linear layer is executed in Vulkan, model inference becomes extremely slow.

## Changes

The previous diff in this stack enforces a GPU buffer limit, which prevents the final logit linear layer from being delegated to Vulkan. This diff modifies the export_llama script to apply the XNNPACK partitioner after the Vulkan partitioner when lowering to Vulkan, so that the remaining undelegated ops are still accelerated with XNNPACK. For 4-bit quantization, an additional quantizer is applied after the Vulkan quantizer (which skips the final logit linear layer) so that the final logit linear layer can be quantized as well.
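To make the intended ordering concrete, here is a rough sketch of the resulting flow. This is simplified, not the literal code from `export_llama_lib.py` and `quantize.py`; the `get_vulkan_partitioner` argument list and the group size value of 256 are assumptions based on the surrounding context, and `args`, `model`, and `torch_dtype` refer to names already in scope in those files.

```python
from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer

# 1) Partitioning: Vulkan goes first and claims every op it supports; with the
#    GPU buffer limit enforced it now rejects the oversized logit linear.
#    XNNPACK is appended afterwards so the undelegated ops still get accelerated.
partitioners = []
partitioners.append(
    get_vulkan_partitioner(args.dtype_override, args.enable_dynamic_shape)
)
partitioners.append(
    get_xnnpack_partitioner(dynamic_quant_only_partitioner=False)
)

# 2) Quantization (qmode == "vulkan_4w"): the Vulkan quantizer skips the final
#    logit linear layer, so a second quantizer is applied to cover that layer too.
model = VkInt4WeightOnlyQuantizer(groupsize=256).quantize(model)
model = Int8DynActInt4WeightQuantizer(precision=torch_dtype, groupsize=256).quantize(model)
```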
## Long Term

This is a temporary measure while an optimized buffer-based linear implementation is developed. Once the Vulkan implementation achieves parity with XNNPACK, the final logit linear layer will be delegated to Vulkan once more.

ghstack-source-id: 253568942
Differential Revision: [D65899827](https://our.internmc.facebook.com/intern/diff/D65899827/)
---
 examples/models/llama/export_llama_lib.py           |  4 ++++
 .../models/llama/source_transformation/quantize.py  | 12 +++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 0e015418d42..c4334443f23 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -682,6 +682,10 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
                 args.enable_dynamic_shape,
             )
         )
+        # Apply XNNPACK after Vulkan so that undelegated ops can be accelerated by XNNPACK
+        partitioners.append(
+            get_xnnpack_partitioner(dynamic_quant_only_partitioner=False)
+        )
         modelname = f"vulkan_{modelname}"
 
     if args.mps:
diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py
index d168b7efcdc..f8952ad0e53 100644
--- a/examples/models/llama/source_transformation/quantize.py
+++ b/examples/models/llama/source_transformation/quantize.py
@@ -157,7 +157,17 @@ def quantize( # noqa C901
         model = gptq_quantizer.quantize(model, inputs)
         return model
     elif qmode == "vulkan_4w":
-        model = VkInt4WeightOnlyQuantizer().quantize(model)
+        q_group_size = 256 if group_size is None else group_size
+        model = VkInt4WeightOnlyQuantizer(groupsize=q_group_size).quantize(model)
+
+        # Apply additional quantizer for linear layers that aren't lowered to Vulkan
+        # at the moment
+        from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
+
+        model = Int8DynActInt4WeightQuantizer(
+            precision=torch_dtype, groupsize=q_group_size
+        ).quantize(model)
+
         return model
     else:
         raise Exception(f"Unrecognized quantize mode: {qmode}")
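For reference, a minimal usage sketch of the `buffer_limit` option introduced in the first patch, assuming `VulkanPartitioner` accepts its options as a plain dict (the option keys are the ones read via `self.options.get(...)` in that patch; the constructor signature itself is not shown here, so treat it as an assumption):

```python
from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner

# Option keys read by VulkanPartitioner.partition(); the values shown are the defaults.
compile_options = {
    "texture_limits": (16384, 16384, 2048),  # utils.DEFAULT_TEXTURE_LIMITS
    "buffer_limit": 128 * 1024 * 1024,       # utils.DEFAULT_BUFFER_LIMIT (element count)
    "require_dynamic_shapes": False,
}

partitioner = VulkanPartitioner(compile_options)
# Any node whose tensors exceed buffer_limit elements and that has no valid
# texture layout is rejected, so it stays out of the Vulkan partition.
```

Note that the limit is expressed as a number of elements, matching `within_buffer_limit` above, rather than a size in bytes.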