Update

SS-JIA · SS-JIA · commit bb55ae35f923 · 2024-10-04T13:56:34.000-04:00
[ghstack-poisoned]
diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py
@@ -118,3 +118,29 @@ def annotate_matmul_input1(node: Node, quantization_config: QuantizationConfig):
                 if "SDPA" in full_qualified_name:
                     annotate_matmul(node, quantization_config_16a8w)
                     annotate_matmul_input1(node.args[1], quantization_config_8a8w)
+
+
+def custom_annotate_matmul_16a8w(gm: torch.fx.GraphModule):
+    """
+    Annotate each aten.matmul.default node in the graph with the 16a8w config.
+    """
+
+    def annotate_matmul(node: Node, quantization_config: QuantizationConfig):
+        input_qspec_map = {}
+        input_act = node.args[0]  # first matmul operand -> activation spec
+        input_spec = quantization_config.input_activation
+        input_qspec_map[input_act] = input_spec
+        input_act1 = node.args[1]  # second matmul operand -> weight spec
+        input_spec1 = quantization_config.weight
+        input_qspec_map[input_act1] = input_spec1
+        node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=quantization_config.output_activation,
+            _annotated=True,
+        )
+
+    # Annotate every aten.matmul.default node with 16a8w to get better performance
+    quantization_config_16a8w = get_16a8w_qnn_ptq_config()
+    for node in gm.graph.nodes:
+        if node.op == "call_function" and node.target == torch.ops.aten.matmul.default:
+            annotate_matmul(node, quantization_config_16a8w)
diff --git a/backends/qualcomm/serialization/qnn_compile_spec_schema.py b/backends/qualcomm/serialization/qnn_compile_spec_schema.py
@@ -33,6 +33,7 @@ class QcomChipset(IntEnum):
     SM8450 = 36  # v69
     SM8475 = 42  # v69
     SM8550 = 43  # v73
+    SSG2115P = 46  # v73
     SM8650 = 57  # v75
 
 
@@ -47,6 +48,7 @@ class SocInfo:
     QcomChipset.SM8475: SocInfo(QcomChipset.SM8475, HtpInfo(HtpArch.V69, 8)),
     QcomChipset.SM8550: SocInfo(QcomChipset.SM8550, HtpInfo(HtpArch.V73, 8)),
     QcomChipset.SM8650: SocInfo(QcomChipset.SM8650, HtpInfo(HtpArch.V75, 8)),
+    QcomChipset.SSG2115P: SocInfo(QcomChipset.SSG2115P, HtpInfo(HtpArch.V73, 2)),
 }
 
 
diff --git a/backends/qualcomm/serialization/schema.fbs b/backends/qualcomm/serialization/schema.fbs
@@ -32,6 +32,7 @@ enum QcomChipset: int {
   SM8450 = 36,
   SM8475 = 42,
   SM8550 = 43,
+  SSG2115P = 46,
   SM8650 = 57,
 }
 
@@ -170,7 +171,7 @@ table QnnExecuTorchOptions {
 
   /// Profiling level of the delegate and the backend. Default is off.
   profile_level:QnnExecuTorchProfileLevel;
-  
+
   /// Enables usage of shared buffer between application and backend for graph I/O.
   shared_buffer:bool;
 
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
@@ -118,6 +118,7 @@ class TestQNN(unittest.TestCase):
     model: QcomChipset = None
     compiler_specs: List[CompileSpec] = None
     arch_table = {
+        "SSG2115P": QcomChipset.SSG2115P,
         "SM8650": QcomChipset.SM8650,
         "SM8550": QcomChipset.SM8550,
         "SM8475": QcomChipset.SM8475,
diff --git a/examples/qualcomm/oss_scripts/llama2/llama.py b/examples/qualcomm/oss_scripts/llama2/llama.py
@@ -48,6 +48,7 @@
 
 
 soc_to_chipset_map = {
+    "SSG2115P": QcomChipset.SSG2115P,
     "SM8650": QcomChipset.SM8650,
     "SM8550": QcomChipset.SM8550,
     "SM8475": QcomChipset.SM8475,
diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py
@@ -83,6 +83,7 @@ def __init__(
         self.debug_output_path = f"{self.workspace}/debug_output.bin"
         self.output_folder = f"{self.workspace}/outputs"
         self.arch_table = {
+            "SSG2115P": "73",
             "SM8650": "75",
             "SM8550": "73",
             "SM8475": "69",