Commit c8e5ceb

Enable FLLM on static llama
1 parent c2adfa9 commit c8e5ceb

8 files changed: +117 additions, -42 deletions

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

Lines changed: 8 additions & 1 deletion
@@ -11,7 +11,8 @@
 #include <executorch/backends/qualcomm/runtime/QnnExecuTorchBackend.h>
 #include <executorch/backends/qualcomm/runtime/QnnManager.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnCustomProtocol.h>
-
+#include <chrono>
+#include <iostream>
 namespace executorch {
 namespace backends {
 namespace qnn {
@@ -33,6 +34,7 @@ Result<DelegateHandle*> QnnExecuTorchBackend::init(
     BackendInitContext& context,
     FreeableBuffer* processed,
     ArrayRef<CompileSpec> compile_specs) const {
+  auto start = std::chrono::high_resolution_clock::now();
   // covert SizedBuffer to qnn ExecuTorch option
   QnnExecuTorchContextBinary qnn_context_blob;
   const qnn_delegate::QnnExecuTorchOptions* qnn_executorch_options = nullptr;
@@ -108,6 +110,11 @@ Result<DelegateHandle*> QnnExecuTorchBackend::init(
   add_cached_delegate(signature, qnn_manager);
   // This backend does not need its processed data after Init.
   processed->Free();
+  auto end = std::chrono::high_resolution_clock::now();
+  auto int_s = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+
+  std::cout << "[Time consuming during init in QnnBackend] Init Time: " << int_s.count() << " milliseconds"
+            << std::endl;
   return qnn_manager;
 }

backends/qualcomm/runtime/backends/QnnBackendCache.cpp

Lines changed: 9 additions & 0 deletions
@@ -51,6 +51,11 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary(
   } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) {
     num_graphs = binaryinfo->contextBinaryInfoV2.numGraphs;
     graphs = binaryinfo->contextBinaryInfoV2.graphs;
+#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
+  } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) {
+    num_graphs = binaryinfo->contextBinaryInfoV3.numGraphs;
+    graphs = binaryinfo->contextBinaryInfoV3.graphs;
+#endif
   } else {
     QNN_EXECUTORCH_LOG_WARN(
         "Unknown QNN BinaryInfo version %d.", binaryinfo->version);
@@ -62,6 +67,10 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary(
       RetrieveGraphInfo<QnnSystemContext_GraphInfoV1_t>(graphs[i].graphInfoV1);
     } else if (graphs->version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_2) {
       RetrieveGraphInfo<QnnSystemContext_GraphInfoV2_t>(graphs[i].graphInfoV2);
+#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
+    } else if (graphs->version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_3) {
+      RetrieveGraphInfo<QnnSystemContext_GraphInfoV3_t>(graphs[i].graphInfoV3);
+#endif
     } else {
       QNN_EXECUTORCH_LOG_WARN(
           "Unknown QNN GraphInfo version %d.", binaryinfo->version);

backends/qualcomm/runtime/backends/QnnContextCommon.h

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 #include <executorch/backends/qualcomm/runtime/backends/QnnBackendCommon.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnCustomProtocol.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnDeviceCommon.h>
-
+#include <executorch/backends/qualcomm/runtime/backends/QnnProfiler.h>
 #include <memory>
 namespace executorch {
 namespace backends {

backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp

Lines changed: 31 additions & 14 deletions
@@ -17,35 +17,52 @@ using executorch::runtime::Error;
 Error HtpBackendCache::RetrieveBackendBinaryInfo(
     const QnnSystemContext_BinaryInfo_t* binaryinfo) {
   QnnHtpSystemContext_HwBlobInfo_t* htp_hwblobinfo = nullptr;
-
+#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
+  QnnHtpSystemContext_GraphBlobInfo_t* htp_graphblobinfo = nullptr;
+#endif
   if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) {
     htp_hwblobinfo = static_cast<QnnHtpSystemContext_HwBlobInfo_t*>(
         binaryinfo->contextBinaryInfoV1.hwInfoBlob);
   } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) {
     htp_hwblobinfo = static_cast<QnnHtpSystemContext_HwBlobInfo_t*>(
         binaryinfo->contextBinaryInfoV2.hwInfoBlob);
+#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
+  } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) {
+    htp_graphblobinfo = static_cast<QnnHtpSystemContext_GraphBlobInfo_t*>(
+        binaryinfo->contextBinaryInfoV3.graphs->graphInfoV3.graphBlobInfo);
+#endif
   } else {
     QNN_EXECUTORCH_LOG_WARN(
         "Unknown QNN BinaryInfo version %d.", binaryinfo->version);
     return Error::Internal;
   }
 
-  if (htp_hwblobinfo == nullptr) {
-    QNN_EXECUTORCH_LOG_WARN(
-        "Htp hardware blob information is not found in binary information.");
-    return Error::Ok;
+  if (htp_hwblobinfo) {
+    if (htp_hwblobinfo->version ==
+        QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1) {
+      spill_fill_buf_ =
+          (*htp_hwblobinfo).contextBinaryHwInfoBlobV1_t.spillFillBufferSize;
+    } else {
+      QNN_EXECUTORCH_LOG_WARN(
+          "Unknown QNN Htp hw blob info version %d.", htp_hwblobinfo->version);
+      return Error::Internal;
+    }
   }
 
-  if (htp_hwblobinfo->version ==
-      QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1) {
-    spill_fill_buf_ =
-        (*htp_hwblobinfo).contextBinaryHwInfoBlobV1_t.spillFillBufferSize;
-  } else {
-    QNN_EXECUTORCH_LOG_WARN(
-        "Unknown QNN Htp hw blob info version %d.", htp_hwblobinfo->version);
-    return Error::Internal;
+#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
+  if (htp_graphblobinfo) {
+    if (htp_graphblobinfo->version ==
+        QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_VERSION_V1) {
+      spill_fill_buf_ =
+          (*htp_graphblobinfo).contextBinaryGraphBlobInfoV1.spillFillBufferSize;
+    } else {
+      QNN_EXECUTORCH_LOG_WARN(
+          "Unknown QNN Htp graph blob info version %d.",
+          htp_graphblobinfo->version);
+      return Error::Internal;
+    }
   }
-
+#endif
   return Error::Ok;
 }

examples/models/llama/llama_transformer.py

Lines changed: 3 additions & 0 deletions
@@ -123,6 +123,9 @@ class ModelArgs:
     quantization_args: Optional[dict] = None
     lora_args: Optional[dict] = None
 
+    use_layer_norm_op: bool = False
+    use_rms_norm_op: bool = False
+
     def __post_init__(self):
         if self.n_kv_heads is None:
             self.n_kv_heads = self.n_heads
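
Note: both new ModelArgs fields default to False, so configurations that never set them keep the existing RMSNorm path. A minimal sketch of opting in (the trimmed dataclass and the fllm_args name are illustrative, not part of the commit):

from dataclasses import dataclass

@dataclass
class ModelArgs:
    # trimmed to the fields relevant to this change
    dim: int = 2048
    norm_eps: float = 1e-5
    use_layer_norm_op: bool = False  # request torch.nn.LayerNorm in the static llama blocks
    use_rms_norm_op: bool = False    # request torch.nn.RMSNorm explicitly

phone_args = ModelArgs()                       # defaults: unchanged RMSNorm behavior
fllm_args = ModelArgs(use_layer_norm_op=True)  # opt into LayerNorm-based blocks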

examples/qualcomm/oss_scripts/llama2/model/static_llama.py

Lines changed: 16 additions & 3 deletions
@@ -211,8 +211,15 @@ def __init__(self, config: ModelArgs, output_new_cache_only=False):
             config=config, output_new_cache_only=output_new_cache_only
         )
         self.feed_forward = FeedForward(config)
-        self.attention_norm = torch.nn.RMSNorm(config.dim, eps=config.norm_eps)
-        self.ffn_norm = torch.nn.RMSNorm(config.dim, eps=config.norm_eps)
+        if config.use_layer_norm_op:
+            self.attention_norm = torch.nn.LayerNorm(self.dim, eps=config.norm_eps)
+            self.ffn_norm = torch.nn.LayerNorm(self.dim, eps=config.norm_eps)
+        elif config.use_rms_norm_op:
+            self.attention_norm = torch.nn.RMSNorm(self.dim, eps=config.norm_eps)
+            self.ffn_norm = torch.nn.RMSNorm(self.dim, eps=config.norm_eps)
+        else:
+            self.attention_norm = torch.nn.RMSNorm(config.dim, eps=config.norm_eps)
+            self.ffn_norm = torch.nn.RMSNorm(config.dim, eps=config.norm_eps)
 
     def forward(
         self,
@@ -257,7 +264,13 @@ def __init__(self, config: ModelArgs, output_new_cache_only=True):
                 for _ in range(config.n_layers)
             ]
         )
-        self.norm = torch.nn.RMSNorm(config.dim, eps=config.norm_eps)
+        if config.use_layer_norm_op:
+            self.norm = torch.nn.LayerNorm(config.dim, eps=config.norm_eps)
+        elif config.use_rms_norm_op:
+            self.norm = torch.nn.RMSNorm(config.dim, eps=config.norm_eps)
+        else:
+            self.norm = torch.nn.RMSNorm(config.dim, eps=config.norm_eps)
+
        self.output = nn.Linear(config.dim, config.vocab_size, bias=False)
        self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim)
        freqs_cos, freqs_sin = precompute_freqs_cis(
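
For reference, the selection logic that both __init__ changes above implement, written as a standalone helper (build_norm is an illustrative name, not a function in this commit): use_layer_norm_op takes precedence and switches the block to torch.nn.LayerNorm, while use_rms_norm_op, or neither flag, keeps torch.nn.RMSNorm.

import torch

def build_norm(config) -> torch.nn.Module:
    # Mirrors the branch order above: LayerNorm only when explicitly requested,
    # RMSNorm otherwise (the default path is unchanged).
    if config.use_layer_norm_op:
        return torch.nn.LayerNorm(config.dim, eps=config.norm_eps)
    return torch.nn.RMSNorm(config.dim, eps=config.norm_eps)

One detail worth noting from the diff: the two new branches read self.dim while the fallback branch keeps config.dim, so they assume the block stores the model dimension on self.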

examples/qualcomm/oss_scripts/llama3_2/llama.py

Lines changed: 34 additions & 16 deletions
@@ -120,6 +120,7 @@ def _prefill_calibrate(
     # TODO: change criteria & support batch inputs if necessary
     token_list = sp_model.encode(user_prompts, bos=True, eos=False)
     token_list = torch.tensor(token_list)[:max_cache_len].reshape(1, -1)
+    token_list = torch.where(token_list > 30000, torch.tensor(30000), token_list)
     last_prompt_pos = token_list.numel()
     if last_prompt_pos < max_cache_len:
         token_list = torch.cat(
@@ -168,6 +169,10 @@ def calibrate(
     else:
         raise RuntimeError("Get wrong inputs")
 
+def get_first_node(node):
+    if isinstance(node, tuple):
+        return get_first_node(node[0])
+    return node
 
 class SingleLlama:
     def __init__(self, llama_model, pte_filename) -> None:
@@ -199,9 +204,10 @@ def _tag_kv_ios(self, gm: torch.fx.GraphModule, kv_type, sharding_type):
             if (
                 n.op == "placeholder"
                 and len(users := list(n.users)) == 1
-                and users[0].meta["val"].size()[-2:] in input_cache_shape
-            ):
-                n.meta[QCOM_QUANTIZED_IO] = kv_type
+                # and users[0].meta["val"].size()[-2:] in input_cache_shape
+            ):
+                if get_first_node(users[0].meta["val"]).size()[-2:] in input_cache_shape:
+                    n.meta[QCOM_QUANTIZED_IO] = kv_type
             elif n.op == "output":
                 for a in n.args[0]:
                     # single head, kv mode
@@ -330,13 +336,15 @@ def compile(args, pte_filename):
     prefill_config = copy.copy(kv_config)
     prefill_config.max_seq_len = args.prefill_seq_len
     prefill_config.use_kv_cache = False
-
-    state_dict = torch.load(
-        args.checkpoint, weights_only=True, map_location="cpu", mmap=True
-    )
+
+    # TODO: Currently, we do not load the checkpoint for FLLM
+    if args.model_arch_device == "meta":
+        state_dict = torch.load(
+            args.checkpoint, weights_only=True, map_location="cpu", mmap=True
+        )
 
     llama_instance_list = []
-    with torch.device("meta"):
+    with torch.device(args.model_arch_device):
         if args.model_mode == "kv":
             llama_instance_list.append(
                 LlamaModel(kv_config, output_new_cache_only=True)
@@ -355,15 +363,17 @@
         else:
             raise RuntimeError(f"No such model_mode {args.model_mode}.")
 
-    if "model" in state_dict:
-        state_dict = state_dict["model"]
+    # TODO: Currently, we do not load the checkpoint for FLLM
+    if args.model_arch_device == "meta":
+        if "model" in state_dict:
+            state_dict = state_dict["model"]
 
-    for llama_instance in llama_instance_list:
-        llama_instance.load_state_dict(
-            state_dict,
-            strict=False,
-            assign=True,
-        )
+        for llama_instance in llama_instance_list:
+            llama_instance.load_state_dict(
+                state_dict,
+                strict=False,
+                assign=True,
+            )
     end_load_ts = time.time()
     logging.info(f"Time for loading checkpoint: {end_load_ts - start_ts}")
@@ -689,6 +699,14 @@ def main():
         type=int,
     )
 
+    parser.add_argument(
+        "--model_arch_device",
+        help="Specify the device for the model architecture. Use 'meta' for phone LLM (default) and 'cpu' for frane LLM.",
+        default="meta",
+        choices=["meta", "cpu"],
+        type=str,
+    )
+
     args = parser.parse_args()
     if args.compile_only and args.pre_gen_pte:
         exit("Cannot set both compile_only and pre_gen_pte as true")

examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp

Lines changed: 15 additions & 7 deletions
@@ -263,14 +263,15 @@ Error Runner::generate(
 
   ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null");
 
-  if (!system_prompt.empty()) {
-    prompt_.append("<|start_header_id|>system<|end_header_id|>\n\n");
-    prompt_.append(system_prompt);
-    prompt_.append("<|eot_id|>\n");
-  }
-  prompt_.append("<|start_header_id|>user<|end_header_id|>\n\n");
+  // Only use prompt provided by user
+  // if (!system_prompt.empty()) {
+  //   prompt_.append("<|start_header_id|>system<|end_header_id|>\n\n");
+  //   prompt_.append(system_prompt);
+  //   prompt_.append("<|eot_id|>\n");
+  // }
+  // prompt_.append("<|start_header_id|>user<|end_header_id|>\n\n");
   prompt_.append(prompt);
-  prompt_.append("<|eot_id|><|start_header_id|>assistant<|end_header_id|>");
+  // prompt_.append("<|eot_id|><|start_header_id|>assistant<|end_header_id|>");
 
   if (token_callback) {
     token_callback("<|begin_of_text|>");
@@ -280,6 +281,13 @@ Error Runner::generate(
   seq_len = (seq_len > 0 && seq_len <= max_seq_len) ? seq_len : max_seq_len;
   Result<std::vector<uint64_t>> encode_res =
       tokenizer_->encode(prompt_, n_bos_, 0);
+  if (encode_res.ok()) {
+    for (auto& id : encode_res.get()) {
+      if (id > 30000) {
+        id = static_cast<uint64_t>(30000);
+      }
+    }
+  }
   ET_CHECK_OK_OR_RETURN_ERROR(
       encode_res.error(), "failed to encode prompt %s", prompt_.c_str());
