
Commit 73e2f7c

Qualcomm AI Engine Direct - Optimize QNN embedding op for llama
Summary: Change the dtype of the token from int64 to int32. Int32 is QNN HTP friendly and significantly speeds up QNN embedding operations because it matches the backend's optimizations.
1 parent 39e5b91 commit 73e2f7c
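
For reference, a minimal sketch of the export-side effect under a simplified setting: the token example input is cast to torch.int32 before export (mirroring the new branch in _load_llama_model in the diff below), and the flag is recorded in the exported metadata so the C++ Runner can pick the matching ScalarType at load time. The literal tensors and the standalone use_int32_token / metadata names here are illustrative only, not the actual export flow.

import torch

# Illustrative flag; in the diff below it is derived from args.qnn.
use_int32_token = True

# Illustrative example inputs: (prompt tokens, start position).
example_inputs = (
    torch.tensor([[1, 2, 3]], dtype=torch.int64),
    torch.tensor([0], dtype=torch.int64),
)

if use_int32_token:
    # Same cast as in _load_llama_model: only the token tensor changes dtype.
    token = example_inputs[0].to(torch.int32)
    example_inputs = (token,) + example_inputs[1:]

# Recorded alongside the other model metadata ("use_int32_token") so the
# runtime knows whether to wrap token buffers as Int or Long tensors.
metadata = {"use_int32_token": use_int32_token}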

File tree

5 files changed: +30 −7 lines changed

examples/models/llama/export_llama_lib.py

Lines changed: 9 additions & 0 deletions
@@ -519,6 +519,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
         checkpoint=checkpoint_path,
         checkpoint_dir=checkpoint_dir,
         params_path=params_path,
+        use_int32_token=True if args.qnn else False,
         use_kv_cache=args.use_kv_cache,
         use_sdpa_with_kv_cache=args.use_sdpa_with_kv_cache,
         generate_full_logits=args.generate_full_logits,
@@ -746,6 +747,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager:  # noqa: C901
 
 def _load_llama_model_metadata(
     weight_type: WeightType,
+    use_int32_token: bool,
     use_kv_cache: bool,
     use_sdpa_with_kv_cache: bool,
     enable_dynamic_shape: bool,
@@ -759,6 +761,7 @@ def _load_llama_model_metadata(
         "get_max_seq_len": model_args.max_seq_len,
         "get_n_layers": model_args.n_layers,
         "get_vocab_size": model_args.vocab_size,
+        "use_int32_token": use_int32_token,
         "use_kv_cache": use_kv_cache,
         "use_sdpa_with_kv_cache": use_sdpa_with_kv_cache,
         "enable_dynamic_shape": enable_dynamic_shape,
@@ -779,6 +782,7 @@ def _load_llama_model(
     checkpoint: Optional[str] = None,
     checkpoint_dir: Optional[str] = None,
     params_path: str,
+    use_int32_token: bool = False,
     use_kv_cache: bool = False,
     use_sdpa_with_kv_cache: bool = False,
     generate_full_logits: bool = False,
@@ -852,6 +856,10 @@ def _load_llama_model(
     else:
         raise ValueError(f"Unsupported dtype {dtype}")
 
+    if use_int32_token:
+        token = example_inputs[0].to(torch.int32)
+        example_inputs = (token,) + example_inputs[1:]
+
     return LLMEdgeManager(
         model=model,
         modelname=modelname,
@@ -870,6 +878,7 @@ def _load_llama_model(
         verbose=verbose,
         metadata=_load_llama_model_metadata(
             weight_type,
+            use_int32_token,
             use_kv_cache,
             use_sdpa_with_kv_cache,
             enable_dynamic_shape,

examples/models/llama/runner/runner.cpp

Lines changed: 4 additions & 0 deletions
@@ -34,6 +34,7 @@ static constexpr auto kMaxSeqLen = "get_max_seq_len";
 static constexpr auto kVocabSize = "get_vocab_size";
 static constexpr auto kUseKVCache = "use_kv_cache";
 static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
+static constexpr auto kUseInt32Token = "use_int32_token";
 } // namespace
 
 Runner::Runner(
@@ -51,6 +52,7 @@ Runner::Runner(
           {kMaxSeqLen, 128},
           {kUseKVCache, true},
           {kUseSDPAWithKVCache, false},
+          {kUseInt32Token, true},
       }) {
   ET_LOG(
       Info,
@@ -127,12 +129,14 @@ Error Runner::load() {
       temperature_);
   text_prefiller_ = std::make_unique<llm::TextPrefiller>(
       text_decoder_runner_.get(),
+      metadata_.at(kUseInt32Token),
       metadata_.at(kUseKVCache),
       metadata_.at(kEnableDynamicShape));
 
   text_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
       tokenizer_.get(),
       text_decoder_runner_.get(),
+      metadata_.at(kUseInt32Token),
       metadata_.at(kUseKVCache),
       std::move(eos_ids),
       &stats_);

extension/llm/runner/text_prefiller.cpp

Lines changed: 8 additions & 5 deletions
@@ -17,9 +17,11 @@ namespace llm {
 
 TextPrefiller::TextPrefiller(
     TextDecoderRunner* text_decoder_runner,
+    bool use_int32_token,
     bool use_kv_cache,
     bool enable_parallel_prefill)
     : text_decoder_runner_(text_decoder_runner),
+      use_int32_token_(use_int32_token),
       use_kv_cache_(use_kv_cache),
       enable_parallel_prefill_(enable_parallel_prefill) {}
 
@@ -36,12 +38,13 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
 
   // store the token
   uint64_t cur_token;
+  exec_aten::ScalarType token_type = use_int32_token_
+      ? exec_aten::ScalarType::Int
+      : exec_aten::ScalarType::Long;
   if (enable_parallel_prefill_ || !use_kv_cache_) {
     // initialize tensor wrappers
-    auto tokens = from_blob(
-        prompt_tokens.data(),
-        {1, num_prompt_tokens},
-        exec_aten::ScalarType::Long);
+    auto tokens =
+        from_blob(prompt_tokens.data(), {1, num_prompt_tokens}, token_type);
 
     auto start_pos_tensor =
         from_blob(&start_pos, {1}, exec_aten::ScalarType::Long);
@@ -60,7 +63,7 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
     cur_token = prompt_tokens[0];
 
     // initialize tensor wrappers
-    auto tokens = from_blob(&cur_token, {1, 1}, exec_aten::ScalarType::Long);
+    auto tokens = from_blob(&cur_token, {1, 1}, token_type);
 
     auto start_pos_tensor =
         from_blob(&start_pos, {1}, exec_aten::ScalarType::Long);

extension/llm/runner/text_prefiller.h

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@ class ET_EXPERIMENTAL TextPrefiller {
  public:
   TextPrefiller(
       TextDecoderRunner* text_decoder_runner,
+      bool use_int32_token,
       bool use_kv_cache_,
       bool enable_parallel_prefill);
   /**
@@ -40,6 +41,7 @@ class ET_EXPERIMENTAL TextPrefiller {
 
  private:
   TextDecoderRunner* text_decoder_runner_;
+  bool use_int32_token_;
   bool use_kv_cache_;
   bool enable_parallel_prefill_;
 };

extension/llm/runner/text_token_generator.h

Lines changed: 7 additions & 2 deletions
@@ -23,12 +23,14 @@ class ET_EXPERIMENTAL TextTokenGenerator {
   TextTokenGenerator(
       Tokenizer* tokenizer,
       TextDecoderRunner* text_decoder_runner,
+      bool use_int32_token,
       bool use_kv_cache,
       std::unique_ptr<std::unordered_set<uint64_t>>&& eos_ids,
       Stats* stats)
       : tokenizer_(tokenizer),
         text_decoder_runner_(text_decoder_runner),
         eos_ids_(std::move(eos_ids)),
+        use_int32_token_(use_int32_token),
         use_kv_cache_(use_kv_cache),
         stats_(stats) {}
 
@@ -54,6 +56,9 @@ class ET_EXPERIMENTAL TextTokenGenerator {
 
   std::vector<uint64_t> token_data; // allocate space for the tokens
   std::vector<executorch::aten::SizesType> token_shape;
+  exec_aten::ScalarType token_type = use_int32_token_
+      ? exec_aten::ScalarType::Int
+      : exec_aten::ScalarType::Long;
 
   // Token after prefill
   uint64_t cur_token = tokens.back();
@@ -70,8 +75,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
   }
 
   // initialize tensor wrappers
-  auto tokens_managed = from_blob(
-      token_data.data(), token_shape, executorch::aten::ScalarType::Long);
+  auto tokens_managed = from_blob(token_data.data(), token_shape, token_type);
   auto start_pos_managed =
       from_blob(&pos, {1}, executorch::aten::ScalarType::Long);
 
@@ -133,6 +137,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
   Tokenizer* tokenizer_;
   TextDecoderRunner* text_decoder_runner_;
   std::unique_ptr<std::unordered_set<uint64_t>> eos_ids_;
+  bool use_int32_token_;
   bool use_kv_cache_;
 
   // state machine
