 )
 
 
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(
+        batch, num_key_value_heads, n_rep, slen, head_dim
+    )
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
 def apply_rotary_emb_single(
     x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor
 ) -> torch.Tensor:
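The added `repeat_kv` helper is the standard grouped-query-attention (GQA) utility: it tiles each key/value head `n_rep` times so a smaller KV cache can serve all attention heads. A minimal, self-contained shape check; the function body is copied from the hunk above, and the tensor sizes are illustrative rather than taken from the model config:

```python
import torch

# Sketch: same body as the repeat_kv added in this commit, reproduced to show
# the expected shape change from n_kv_heads to n_attention_heads.
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

kv = torch.randn(1, 2, 8, 64)       # (batch, n_kv_heads=2, seqlen, head_dim)
out = repeat_kv(kv, n_rep=4)        # 2 kv heads expanded to serve 8 query heads
assert out.shape == (1, 8, 8, 64)   # (batch, n_attention_heads, seqlen, head_dim)
```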
@@ -59,13 +73,13 @@ def prepare_sha(self):
         self.wk_sha = nn.ModuleList(
             [
                 nn.Linear(self.dim, self.head_dim, bias=False)
-                for _ in range(self.n_heads)
+                for _ in range(self.n_kv_heads)
             ]
         )
         self.wv_sha = nn.ModuleList(
             [
                 nn.Linear(self.dim, self.head_dim, bias=False)
-                for _ in range(self.n_heads)
+                for _ in range(self.n_kv_heads)
             ]
         )
 
@@ -76,6 +90,7 @@ def prepare_sha(self):
             self.wq_sha[i].weight.data.copy_(
                 self.wq.weight[i * self.head_dim : (i + 1) * self.head_dim]
             )
+        for i in range(self.n_kv_heads):
             self.wk_sha[i].weight.data.copy_(
                 self.wk.weight[i * self.head_dim : (i + 1) * self.head_dim]
             )
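With GQA, the fused `wk`/`wv` projections produce only `n_kv_heads * head_dim` output features, so the per-head single-head linears and the weight-copy loop above now iterate over `n_kv_heads` rather than `n_heads`. A rough sketch of that slicing under this assumption; the sizes and the standalone `wk` below are illustrative stand-ins, not the model's actual config:

```python
import torch
from torch import nn

dim, head_dim, n_kv_heads = 256, 64, 2                     # illustrative sizes only
wk = nn.Linear(dim, n_kv_heads * head_dim, bias=False)     # assumed fused GQA key projection

# One single-head linear per kv head, each taking a head_dim-sized slice of rows.
wk_sha = nn.ModuleList(
    [nn.Linear(dim, head_dim, bias=False) for _ in range(n_kv_heads)]
)
for i in range(n_kv_heads):
    wk_sha[i].weight.data.copy_(wk.weight[i * head_dim : (i + 1) * head_dim])

x = torch.randn(1, 5, dim)
fused = wk(x).reshape(1, 5, n_kv_heads, head_dim)
per_head = torch.stack([w(x) for w in wk_sha], dim=2)
assert torch.allclose(fused, per_head)                     # per-head split matches fused projection
```

Each single-head linear takes the same `head_dim` rows of the fused weight its head would have consumed, so the stacked per-head outputs reproduce the fused projection.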
@@ -97,30 +112,27 @@ def forward_sha(
         v = [wv_sha(hidden_states) for wv_sha in self.wv_sha]
         for i in range(len(q)):
             q[i] = apply_rotary_emb_single(q[i], freqs_cos, freqs_sin)
+        for i in range(len(k)):
             k[i] = apply_rotary_emb_single(k[i], freqs_cos, freqs_sin).permute(0, 2, 1)
 
-        output_kh, output_vh, output_y = [], [], []
+        output_y = []
+        kh, vh = [], []
         for i, _ in enumerate(k_caches):
-            # cat at the seq dim
-            kh = torch.cat([k_caches[i], k[i]], dim=-1)
-            vh = torch.cat([v_caches[i], v[i]], dim=1)
+            kh.append(torch.cat([k_caches[i], k[i]], dim=-1))
+            vh.append(torch.cat([v_caches[i], v[i]], dim=1))
 
-            attn = q[i] @ kh
+        for i, _ in enumerate(q):
+            cache_idx = i // self.num_key_value_groups
+            attn = q[i] @ kh[cache_idx]
             attn = attn / self.scale + atten_mask
             attn = self.attn_softmax(attn)
-            y = attn @ vh
+            y = attn @ vh[cache_idx]
 
-            if self.output_new_cache_only:
-                output_kh.append(k[i])
-                output_vh.append(v[i])
-            else:
-                output_kh.append(kh)
-                output_vh.append(vh)
             output_y.append(y)
 
         y = torch.concat(output_y, dim=-1)
         y = self.wo(y)
-        return y, output_kh, output_vh
+        return y, k, v
 
     def forward(
         self,
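The core of the GQA change in `forward_sha` is how query heads are routed to shared caches: `cache_idx = i // self.num_key_value_groups` maps each of the `n_heads` query heads onto one of the `n_kv_heads` cache entries. A tiny numeric illustration, assuming the usual convention `num_key_value_groups = n_heads // n_kv_heads` (as in the Hugging Face Llama implementation):

```python
n_heads, n_kv_heads = 8, 2
num_key_value_groups = n_heads // n_kv_heads   # 4 query heads share each kv head

for i in range(n_heads):
    cache_idx = i // num_key_value_groups
    print(f"query head {i} -> kv cache {cache_idx}")
# query heads 0-3 attend against kv cache 0, heads 4-7 against kv cache 1
```

Note that `forward_sha` now always returns just the new per-head `k`/`v` slices rather than the concatenated caches; the dense `forward` path keeps the explicit `output_new_cache_only` branch.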
@@ -142,24 +154,28 @@ def forward(
         k = apply_rotary_emb_single(k, freqs_cos, freqs_sin).permute(0, 2, 3, 1)
 
         output_kh, output_vh, output_y = [], [], []
-
+        kh, vh = [], []
         for i, _ in enumerate(k_caches):
-            # cat at the seq dim
-            kh = torch.cat([k_caches[i], k[:, i, :, :]], dim=-1)
-            vh = torch.cat([v_caches[i], v[:, :, i, :]], dim=1)
+            kh.append(torch.cat([k_caches[i], k[:, i, :, :]], dim=-1))
+            vh.append(torch.cat([v_caches[i], v[:, :, i, :]], dim=1))
+
+        for i in range(self.n_heads):
+            cache_idx = i // self.num_key_value_groups
 
-            attn = q[:, :, i, :] @ kh
+            attn = q[:, :, i, :] @ kh[cache_idx]
             attn = attn / self.scale + atten_mask
             attn = self.attn_softmax(attn)
-            y = attn @ vh
+            y = attn @ vh[cache_idx]
 
+            output_y.append(y)
+
+        for i in range(len(k_caches)):
             if self.output_new_cache_only:
                 output_kh.append(k[:, i, :, :])
                 output_vh.append(v[:, :, i, :])
             else:
-                output_kh.append(kh)
-                output_vh.append(vh)
-            output_y.append(y)
+                output_kh.append(kh[i])
+                output_vh.append(vh[i])
 
         y = torch.concat(output_y, dim=-1)
         y = self.wo(y)
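In both attention paths the per-head K cache is stored pre-transposed as `(batch, head_dim, seq)` and the V cache as `(batch, seq, head_dim)` (see `get_example_inputs` below), so the current token's key is appended along `dim=-1` and its value along `dim=1`. A small shape sketch of that cache update, with illustrative sizes:

```python
import torch

batch, head_dim, max_seq = 1, 64, 128

k_cache = torch.zeros(batch, head_dim, max_seq - 1)   # pre-transposed key cache
v_cache = torch.zeros(batch, max_seq - 1, head_dim)   # value cache

k_new = torch.randn(batch, head_dim, 1)               # key of the current token
v_new = torch.randn(batch, 1, head_dim)               # value of the current token

kh = torch.cat([k_cache, k_new], dim=-1)              # append along the seq dim of K
vh = torch.cat([v_cache, v_new], dim=1)               # append along the seq dim of V

assert kh.shape == (batch, head_dim, max_seq)
assert vh.shape == (batch, max_seq, head_dim)
```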
@@ -246,10 +262,10 @@ def forward(
 
         hidden_states = self.tok_embeddings(tokens)
         for ind, decoder_layer in enumerate(self.layers):
-            offset_k = ind * self.n_heads
-            offset_v = self.n_layers * self.n_heads + offset_k
-            k_caches = args[offset_k : offset_k + self.n_heads]
-            v_caches = args[offset_v : offset_v + self.n_heads]
+            offset_k = ind * self.n_kv_heads
+            offset_v = self.n_layers * self.n_kv_heads + offset_k
+            k_caches = args[offset_k : offset_k + self.n_kv_heads]
+            v_caches = args[offset_v : offset_v + self.n_kv_heads]
             hidden_states, k, v = decoder_layer(
                 hidden_states,
                 freqs_cos=freqs_cos,
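The model-level `forward` receives the caches as one flat tuple: the `n_layers * n_kv_heads` key caches first, then the value caches, which is why the offsets above now scale with `n_kv_heads`. A quick sketch of that indexing under the same layout assumption (the layer/head labels are placeholders):

```python
n_layers, n_kv_heads = 4, 2
# Assumed flat cache layout: all k caches for every layer first, then all v caches.
args = [f"k_L{layer}_H{h}" for layer in range(n_layers) for h in range(n_kv_heads)]
args += [f"v_L{layer}_H{h}" for layer in range(n_layers) for h in range(n_kv_heads)]

for ind in range(n_layers):
    offset_k = ind * n_kv_heads
    offset_v = n_layers * n_kv_heads + offset_k
    k_caches = args[offset_k : offset_k + n_kv_heads]
    v_caches = args[offset_v : offset_v + n_kv_heads]
    print(ind, k_caches, v_caches)
# 0 ['k_L0_H0', 'k_L0_H1'] ['v_L0_H0', 'v_L0_H1']  ... and so on per layer
```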
@@ -275,7 +291,7 @@ def get_example_inputs(self):
         atten_mask = torch.full((self.max_batch_size, self.max_seq_len), -255.0)
         atten_mask[:, -1] = 0
         for _ in range(self.n_layers):
-            for _ in range(self.n_heads):
+            for _ in range(self.n_kv_heads):
                 # transpose first to decrease the runtime efforts
                 k_cache.append(
                     torch.zeros(
@@ -299,40 +315,6 @@ def get_example_inputs(self):
             v_cache,
         )
 
-    def get_export_inputs(self):
-        tokens = torch.randint(
-            self.vocab_size, (self.max_batch_size, 1), dtype=torch.int32
-        )
-        pos_ids = torch.zeros((self.max_batch_size, 1), dtype=torch.int32)
-        # this is important for torch.export not to take it as dummy input
-        k_cache, v_cache = [], []
-        atten_mask = torch.full((self.max_batch_size, self.max_seq_len), -255.0)
-        atten_mask[:, -1] = 0
-        for _ in range(self.n_layers):
-            for _ in range(self.n_heads):
-                # transpose first to decrease the runtime efforts
-                k_cache.append(
-                    torch.randn(
-                        self.max_batch_size,
-                        self.head_dim,
-                        self.max_seq_len - 1,
-                    )
-                )
-                v_cache.append(
-                    torch.randn(
-                        self.max_batch_size,
-                        self.max_seq_len - 1,
-                        self.head_dim,
-                    )
-                )
-        return (
-            tokens,
-            pos_ids,
-            atten_mask,
-            k_cache,
-            v_cache,
-        )
-
     def get_metadata(self):
         # TODO: modify this when enabling LLAMA 7B
         return {
@@ -344,7 +326,7 @@ def get_metadata(self):
344326 "get_max_seq_len" : self .max_seq_len ,
345327 "get_n_bos" : 1 ,
346328 "get_n_eos" : 1 ,
347- "get_n_kv_heads" : self .n_heads ,
329+ "get_n_kv_heads" : self .n_kv_heads ,
348330 "get_n_layers" : self .n_layers ,
349331 "get_vocab_size" : self .vocab_size ,
350332 }