
Commit 37c471d

qio + sha + cpu quantized embedding +r1r2 cuda
1 parent 46ea1a4

10 files changed: +348 -29 lines

backends/qualcomm/_passes/i64_to_i32.py

Lines changed: 6 additions & 1 deletion
@@ -61,7 +61,12 @@ def _cast_to_int32(self, graph_module: torch.fx.GraphModule):
             to_dst_node.meta["val"] = node_val.to(torch.int32)
 
             # Replace usage of the src dtype result with the dst dtype result.
-            n.replace_all_uses_with(to_dst_node)
+            if n.name != "tokens":
+                n.replace_all_uses_with(to_dst_node)
+            else:
+                for user in n.users.copy():
+                    if user.name != "quantized_decomposed_embedding_4bit_dtype":
+                        user.replace_input_with(n, to_dst_node)
             to_dst_node.args = (n,)
 
     def call(self, graph_module: torch.fx.GraphModule):
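
The new branch keeps the `tokens` placeholder feeding `quantized_decomposed_embedding_4bit_dtype` in its original dtype while every other consumer is rerouted through the int32 cast. A minimal, self-contained torch.fx sketch of that "rewire all users except one" pattern (node and module names here are illustrative, not taken from the commit):

import torch

def selective_rewire(gm: torch.fx.GraphModule, src_name: str, keep_user_name: str):
    # Insert an int32 cast after the node called `src_name` and reroute every
    # consumer except `keep_user_name` through it, mirroring the pass logic above.
    for n in gm.graph.nodes:
        if n.name != src_name:
            continue
        with gm.graph.inserting_after(n):
            cast = gm.graph.call_method("to", (n, torch.int32))
        for user in list(n.users):
            if user is not cast and user.name != keep_user_name:
                user.replace_input_with(n, cast)
    gm.recompile()
    return gm

class Toy(torch.nn.Module):
    def forward(self, tokens):
        a = tokens + 1  # rerouted through the cast
        b = tokens * 2  # kept on the original `tokens`, like the 4-bit embedding user
        return a, b

gm = selective_rewire(torch.fx.symbolic_trace(Toy()), "tokens", "mul")
print(gm.code)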

backends/qualcomm/partition/common_defs.py

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@
     exir_ops.edge.aten.full.default,
     exir_ops.edge.aten.slice_scatter.default,
     exir_ops.edge.aten.copy.default,
+    exir_ops.edge.quantized_decomposed.embedding_4bit.dtype,
 ]
 
 to_be_implemented_operator = [
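
Presumably this list is the set of ops QNN will not delegate (the `to_be_implemented_operator` list begins right after it); keeping `quantized_decomposed.embedding_4bit.dtype` out of the QNN partition matches the "cpu quantized embedding" part of the commit message and explains the `tokens` special case in the i64-to-i32 pass above.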

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 33 additions & 0 deletions
@@ -12,6 +12,7 @@
     QuantizationConfig,
 )
 from executorch.backends.qualcomm.quantizer.utils import QUANT_ANNOTATION_KEY
+from executorch.exir.dialects._ops import ops as exir_ops
 from torch.ao.quantization.quantizer import (
     QuantizationAnnotation,
     SharedQuantizationSpec,

@@ -144,3 +145,35 @@ def annotate_matmul(node: Node, quantization_config: QuantizationConfig):
     for node in gm.graph.nodes:
         if node.op == "call_function" and node.target == torch.ops.aten.matmul.default:
             annotate_matmul(node, quantization_config_16a8w)
+
+
+def get_custom_quant_ios_dtype(
+    cache_shape: torch.Size,
+    node: torch.fx.Node,
+    kv_dtype=torch.uint8,
+    sharding_dtype=torch.uint16,
+):
+    """
+    This function is specific for llama inputs and outputs
+    """
+    if node.op == "placeholder" and "attention_sdpa_kv_cache_past_" in node.name:
+        return kv_dtype
+
+    # Tag index put node before copy node, because copy is a skipped node in qnn
+    if (
+        exir_ops.edge.aten.index_put.default == node.target
+        and node.meta["val"].shape == cache_shape
+    ):
+        return kv_dtype
+
+    # Tag sharding io
+    if exir_ops.edge.llama.fallback.default in [
+        u.target for u in list(node.users.keys())
+    ] + [node.target]:
+        return sharding_dtype
+
+    # Tag index op as quantized tensors. It is caused by sharding
+    if exir_ops.edge.aten.index.Tensor in [
+        u.target for u in list(node.users.keys())
+    ] + [node.target]:
+        return sharding_dtype

backends/qualcomm/utils/utils.py

Lines changed: 10 additions & 0 deletions
@@ -71,6 +71,7 @@
     QCOM_PASS_EXPAND_BROADCAST_SHAPE,
     QCOM_PASS_SKIP_ADVANCED_REQUANT,
     QCOM_QNN_COMPILE_SPEC,
+    QCOM_QUANTIZED_IO,
 )
 
 from executorch.exir import ExirExportedProgram

@@ -876,3 +877,12 @@ def get_soc_to_chipset_map():
         "SM8475": QcomChipset.SM8475,
         "SM8450": QcomChipset.SM8450,
     }
+
+
+def tag_quant_io(gm: torch.fx.GraphModule, get_quant_io_dtype_fn: Callable):
+    """
+    Tag io nodes which get/output quantized tensor. No need to insert q/dq in qnn_preprocess
+    """
+    for node in gm.graph.nodes:
+        if dtype := get_quant_io_dtype_fn(node):
+            node.meta[QCOM_QUANTIZED_IO] = dtype
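
`get_custom_quant_ios_dtype` returns `None` for nodes it does not recognize, and the walrus check in `tag_quant_io` skips those, so only matching io nodes receive the `QCOM_QUANTIZED_IO` tag; the actual wiring, a `functools.partial` over the KV-cache shape, appears in the export_llama_lib.py hunk below. A runnable toy sketch of the filtering (the callback, module, and meta key are stand-ins, not the real constants):

import torch

def toy_quant_io_dtype(node: torch.fx.Node):
    # Stand-in for get_custom_quant_ios_dtype: tag only the `tokens` placeholder.
    if node.op == "placeholder" and "tokens" in node.name:
        return torch.uint8
    return None  # every other node falls through untagged

class Toy(torch.nn.Module):
    def forward(self, tokens, mask):
        return tokens + mask

gm = torch.fx.symbolic_trace(Toy())
for node in gm.graph.nodes:
    if dtype := toy_quant_io_dtype(node):  # None is falsy, so non-io nodes are skipped
        node.meta["quantized_io"] = dtype  # the real pass uses the QCOM_QUANTIZED_IO key

print([(n.name, n.meta.get("quantized_io")) for n in gm.graph.nodes])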

examples/models/llama/export_llama.py

Lines changed: 3 additions & 0 deletions
@@ -7,11 +7,14 @@
 # Example script for exporting Llama2 to flatbuffer
 
 import logging
+import sys
 
 import torch
 
 from .export_llama_lib import build_args_parser, export_llama
 
+sys.setrecursionlimit(4096)
+
 
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
 logging.basicConfig(level=logging.INFO, format=FORMAT)
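
For context: CPython's default recursion limit is 1000 frames, and the bump to 4096 presumably guards against deep recursion while tracing or serializing the large exported llama graph. The effect is simply:

import sys

print(sys.getrecursionlimit())  # 1000 on a stock CPython interpreter
sys.setrecursionlimit(4096)     # the value the export script now sets at import time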

examples/models/llama/export_llama_lib.py

Lines changed: 55 additions & 10 deletions
@@ -50,6 +50,8 @@
     fuse_layer_norms,
     get_model_with_r1_r2,
 )
+
+from .source_transformation.attention import replace_attention_to_attention_sha
 from .source_transformation.quantize import (
     get_quant_embedding_transform,
     get_quant_weight_transform,

@@ -174,6 +176,12 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="checkpoint directory. Use with a sharded checkpoint, not for the standard llama2 model. Note, checkpoint_dir takes precedence over checkpoint if both are set.",
     )
 
+    parser.add_argument(
+        "--use_qnn_sha",
+        action="store_true",
+        help="Change multi head attention to multiple single head attention for qnn backend (Qualcomm)",
+    )
+
     parser.add_argument(
         "--calibration_tasks",
         nargs="+",

@@ -642,7 +650,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901
                 )
             )
         # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils`
-        from executorch.backends.qualcomm.utils.utils import _transform
+        from executorch.backends.qualcomm.utils.utils import _transform, tag_quant_io
 
         # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`, Optional type has no attribute `exported_program`
         _transform(builder_exported_to_edge.edge_manager.exported_program())

@@ -654,7 +662,32 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901
                 builder_exported_to_edge.metadata["get_n_layers"],
                 shares=args.num_sharding,
             )
+        from functools import partial
 
+        from executorch.backends.qualcomm.quantizer.custom_annotation import (
+            get_custom_quant_ios_dtype,
+        )
+        atten = builder_exported_to_edge.model.layers[0].attention
+        if args.use_qnn_sha:
+            cache_shape = torch.Size(
+                (atten.max_batch_size, atten.max_seq_len, atten.head_dim)
+            )
+        else:
+            cache_shape = torch.Size(
+                (
+                    atten.max_batch_size,
+                    atten.max_seq_len,
+                    atten.n_kv_heads,
+                    atten.head_dim,
+                )
+            )
+        tag_quant_io(
+            builder_exported_to_edge.edge_manager.exported_program().graph_module,
+            partial(
+                get_custom_quant_ios_dtype,
+                cache_shape,
+            ),
+        )
         logging.info("Lowering model using following partitioner(s): ")
         for partitioner in partitioners:
             logging.info(f"--> {partitioner.__class__.__name__}")

@@ -919,15 +952,27 @@ def _get_source_transforms( # noqa
             convert_linear_to_conv2d,
         )
 
-        transforms.append(replace_kv_cache_with_simple_kv_cache)
-        transforms.append(replace_sdpa_with_flex_sdpa)
-        transforms.append(replace_causal_mask)
-        transforms.append(replace_rms_norm_with_native_rms_norm)
-        if args.optimized_rotation_path:
-            transforms.append(fuse_layer_norms)
-            transforms.append(get_model_with_r1_r2(args.optimized_rotation_path))
-        # pyre-fixme[16]: Module `backends` has no attribute `qualcomm`.
-        transforms.append(convert_linear_to_conv2d)
+        if args.use_qnn_sha:
+            if args.optimized_rotation_path:
+                transforms.append(fuse_layer_norms)
+                transforms.append(
+                    get_model_with_r1_r2(args.optimized_rotation_path)
+                )
+            transforms.append(replace_attention_to_attention_sha)
+            transforms.append(replace_causal_mask)
+            transforms.append(replace_rms_norm_with_native_rms_norm)
+            transforms.append(convert_linear_to_conv2d)
+        else:
+            transforms.append(replace_kv_cache_with_simple_kv_cache)
+            transforms.append(replace_sdpa_with_flex_sdpa)
+            transforms.append(replace_causal_mask)
+            transforms.append(replace_rms_norm_with_native_rms_norm)
+            if args.optimized_rotation_path:
+                transforms.append(fuse_layer_norms)
+                transforms.append(
+                    get_model_with_r1_r2(args.optimized_rotation_path)
+                )
+            transforms.append(convert_linear_to_conv2d)
 
     elif args.mps:
         # Currently mps doesn't support sdpa op, use the simpler decomposition
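
The two `cache_shape` branches encode what `--use_qnn_sha` changes about the KV cache: with single-head attention each cache placeholder carries one head, so the `n_kv_heads` dimension drops out. A standalone sketch with illustrative dimensions (not taken from the commit):

import torch

# Illustrative llama-style geometry
max_batch_size, max_seq_len, n_kv_heads, head_dim = 1, 512, 8, 128

# Default multi-head path: one cache tensor spans all kv heads.
mha_cache_shape = torch.Size((max_batch_size, max_seq_len, n_kv_heads, head_dim))

# --use_qnn_sha path: attention is split into per-head modules, so each
# cache placeholder holds a single head.
sha_cache_shape = torch.Size((max_batch_size, max_seq_len, head_dim))

# Together the per-head caches hold the same number of elements.
assert mha_cache_shape.numel() == n_kv_heads * sha_cache_shape.numel()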

examples/models/llama/llama_transformer.py

Lines changed: 10 additions & 9 deletions
@@ -263,21 +263,22 @@ class Attention(nn.Module):
     def __init__(self, args: ModelArgs, layer_id: int):
         super().__init__()
         self.use_kv_cache = args.use_kv_cache
-        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
-        assert args.n_heads % self.n_kv_heads == 0
+        self.n_heads = args.n_heads
+        self.n_kv_heads = self.n_heads if args.n_kv_heads is None else args.n_kv_heads
+        assert self.n_heads % self.n_kv_heads == 0
         model_parallel_size = 1
-        self.n_local_heads = args.n_heads // model_parallel_size
+        self.n_local_heads = self.n_heads // model_parallel_size
         self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
         self.n_rep = self.n_local_heads // self.n_local_kv_heads
-        self.head_dim = args.dim // args.n_heads
+        self.head_dim = args.dim // self.n_heads
         self.max_batch_size = args.max_batch_size
         self.max_seq_len = args.max_seq_len
         self.dim = args.dim
-        # args.dim = 4096, args.n_heads = 32, self.head_dim = 4096 / 32 = 125
-        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
-        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)
+        # args.dim = 4096, self.n_heads = 32, self.head_dim = 4096 / 32 = 125
+        self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False)
+        self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False)
 
         self.layer_id = layer_id
 
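Promoting `n_heads` (and the values derived from it) to instance attributes is what lets the new `cache_shape` logic in export_llama_lib.py read the geometry straight off `model.layers[0].attention`. The "= 125" carried over in the comment appears to be a pre-existing typo: 4096 / 32 is 128.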
examples/models/llama/source_transformation/apply_spin_quant_r1_r2.py

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ def get_model_with_r1_r2(optimized_rotation_path: str):
 
 
 def apply_spin_quant_r1_r2(model: torch.nn.Module, optimized_rotation_path: str):
-    optimized_rotation = torch.load(optimized_rotation_path, weights_only=True)
+    optimized_rotation = torch.load(optimized_rotation_path, weights_only=True, map_location=torch.device('cpu'))
     R1 = optimized_rotation["R1"].to(torch.float32)
     config = model.params
     num_heads = config.n_heads
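
The added `map_location` lines up with the "r1r2 cuda" part of the commit message: a rotation checkpoint saved from CUDA tensors would otherwise fail to deserialize on a CPU-only host. A minimal sketch (the file name is hypothetical):

import torch

# Without map_location, loading CUDA-saved tensors on a machine where
# torch.cuda.is_available() is False raises a deserialization error.
optimized_rotation = torch.load(
    "optimized_rotation.pt",           # hypothetical path
    weights_only=True,
    map_location=torch.device("cpu"),  # remap every tensor onto CPU at load time
)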
