Commit a603e48

support int32 token in runner and profiling
1 parent 3f188ff commit a603e48

File tree: 4 files changed (+39, −5 lines)

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

Lines changed: 24 additions & 1 deletion
@@ -11,6 +11,9 @@
 #include <executorch/backends/qualcomm/runtime/QnnExecuTorchBackend.h>
 #include <executorch/backends/qualcomm/runtime/QnnManager.h>
 #include <executorch/backends/qualcomm/schema_generated.h>
+#include <chrono>
+
+// #include <fstream>
 namespace executorch {
 namespace backends {
 namespace qnn {
@@ -26,6 +29,7 @@ using executorch::runtime::MemoryAllocator;
 using executorch::runtime::Result;
 // ========== Public method implementations =========================
 constexpr const char* QNN_COMPILE_SPEC = "qnn_compile_spec";
+// static int hi = 0;
 Result<DelegateHandle*> QnnExecuTorchBackend::init(
     BackendInitContext& context,
     FreeableBuffer* processed,
@@ -36,6 +40,11 @@ Result<DelegateHandle*> QnnExecuTorchBackend::init(

   qnn_context_blob.buffer = const_cast<void*>(processed->data());
   qnn_context_blob.nbytes = processed->size();
+  // std::string path_ = "model_" + std::to_string(hi) + ".bin";
+  // std::ofstream fout(path_, std::ios::binary);
+  // fout.write(static_cast<const char*>(processed->data()), static_cast<int64_t>(processed->size()));
+  // fout.flush();
+  // hi++;

   // convert CompileSpec to qnn ExecuTorch option
   for (auto& compile_spec : compile_specs) {
@@ -180,11 +189,12 @@ Result<DelegateHandle*> QnnExecuTorchBackend::init(
   }
   return qnn_manager;
 }
-
+// static int qq = 0;
 Error QnnExecuTorchBackend::execute(
     BackendExecutionContext& context,
     DelegateHandle* handle,
     EValue** args) const {
+  auto begin = std::chrono::high_resolution_clock::now();
   QnnManager* qnn_manager = static_cast<QnnManager*>(handle);

   std::vector<std::shared_ptr<TensorWrapper>> input_tensors =
@@ -202,6 +212,14 @@ Error QnnExecuTorchBackend::execute(
       // update data ptr only should be fine
       input_tensors[i]->FillDataBuffer(
           args[i]->toTensor().const_data_ptr(), false /* copy_data */);
+      // if (qq < input_tensors.size()) {
+      //   std::string path_ = "qinput_" + std::to_string(qq) + ".raw";
+      //   std::ofstream fout(path_, std::ios::binary);
+      //   fout.write(static_cast<const char*>(args[i]->toTensor().const_data_ptr()), input_tensors[i]->GetBytes());
+      //   fout.flush();
+      //   qq++;
+      // }
+
     }
     input_tensor_structs.push_back(input_tensors[i]->CloneTensorStruct());
   }
@@ -232,7 +250,12 @@ Error QnnExecuTorchBackend::execute(
       qnn_manager->ProfileExecuteData(context.event_tracer()) == Error::Ok,
       Internal,
       "Fail to profile graph");
+  auto end = std::chrono::high_resolution_clock::now();

+  auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end -
+      begin);
+  QNN_EXECUTORCH_LOG_INFO(
+      "QNN Graph Execute Time in QnnExecuTorchBackend: %ld us", elapsed.count());
   return Error::Ok;
 }
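
The profiling added here brackets the delegated graph execution with the usual std::chrono pattern. A minimal, self-contained sketch of that pattern, assuming only the C++ standard library (the measured work and the print call are placeholders, not ExecuTorch APIs):

    #include <chrono>
    #include <cstdio>

    long long time_block_us() {
      auto begin = std::chrono::high_resolution_clock::now();
      // ... work being measured, e.g. the delegated graph execution ...
      auto end = std::chrono::high_resolution_clock::now();
      // duration_cast truncates to whole microseconds.
      auto elapsed =
          std::chrono::duration_cast<std::chrono::microseconds>(end - begin);
      std::printf("execute time: %lld us\n",
                  static_cast<long long>(elapsed.count()));
      return static_cast<long long>(elapsed.count());
    }

The same begin/end pattern is applied to Method::execute_instruction in runtime/executor/method.cpp further down.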

extension/llm/runner/text_prefiller.cpp

Lines changed: 2 additions & 2 deletions
@@ -41,7 +41,7 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
   auto tokens = from_blob(
       prompt_tokens.data(),
       {1, num_prompt_tokens},
-      exec_aten::ScalarType::Long);
+      exec_aten::ScalarType::Int);

   auto start_pos_tensor =
       from_blob(&start_pos, {1}, exec_aten::ScalarType::Long);
@@ -60,7 +60,7 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
   cur_token = prompt_tokens[0];

   // initialize tensor wrappers
-  auto tokens = from_blob(&cur_token, {1, 1}, exec_aten::ScalarType::Long);
+  auto tokens = from_blob(&cur_token, {1, 1}, exec_aten::ScalarType::Int);

   auto start_pos_tensor =
       from_blob(&start_pos, {1}, exec_aten::ScalarType::Long);
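
Switching the token tensor dtype from exec_aten::ScalarType::Long to Int means from_blob now interprets the backing buffer as 4-byte elements instead of 8-byte ones, so the storage behind prompt_tokens and cur_token presumably has to hold 32-bit values for the shapes and byte counts to line up. A minimal sketch of that size relationship, standard C++ only (the vector names and token values are illustrative, not taken from this commit):

    #include <cstdint>
    #include <vector>

    int main() {
      // Buffers that would back an Int (int32) vs. a Long (int64) token tensor.
      std::vector<int32_t> tokens_int32 = {1, 15043, 3186};
      std::vector<int64_t> tokens_int64 = {1, 15043, 3186};

      // ScalarType::Int corresponds to 4-byte elements and ScalarType::Long to
      // 8-byte elements, so the same token count occupies half the bytes as int32.
      static_assert(sizeof(int32_t) == 4, "Int elements are 4 bytes");
      static_assert(sizeof(int64_t) == 8, "Long elements are 8 bytes");
      return tokens_int32.size() == tokens_int64.size() ? 0 : 1;
    }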

extension/llm/runner/text_token_generator.h

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {

   // initialize tensor wrappers
   auto tokens_managed = from_blob(
-      token_data.data(), token_shape, executorch::aten::ScalarType::Long);
+      token_data.data(), token_shape, executorch::aten::ScalarType::Int);
   auto start_pos_managed =
       from_blob(&pos, {1}, executorch::aten::ScalarType::Long);

runtime/executor/method.cpp

Lines changed: 12 additions & 1 deletion
@@ -26,7 +26,8 @@
 #include <executorch/runtime/platform/log.h>
 #include <executorch/runtime/platform/profiler.h>
 #include <executorch/schema/program_generated.h>
-
+#include <chrono>
+#include <iostream>
 namespace executorch {
 namespace runtime {

@@ -1004,6 +1005,7 @@ ET_NODISCARD Error Method::get_inputs(EValue* input_evalues, size_t length) {
 }

 Error Method::execute_instruction() {
+  auto begin = std::chrono::high_resolution_clock::now();
   auto& chain = chains_[step_state_.chain_idx];
   auto instructions = chain.s_chain_->instructions();

@@ -1030,6 +1032,9 @@ Error Method::execute_instruction() {
       chain.kernels_[step_state_.instr_idx](context, args.data());
       // We reset the temp_allocator after the switch statement
       err = context.failure_state();
+      auto op_index = instruction->instr_args_as_KernelCall()->op_index();
+      auto op = serialization_plan_->operators()->Get(op_index);
+      std::cout << "run op" << op->name()->c_str() << std::endl;
       if (err != Error::Ok) {
         // We know that instr_args_as_KernelCall is non-null because it was
         // checked at init time.
@@ -1151,6 +1156,12 @@ Error Method::execute_instruction() {
   if (err == Error::Ok) {
     step_state_.instr_idx = next_instr_idx;
   }
+  auto end = std::chrono::high_resolution_clock::now();
+
+  auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end -
+      begin);
+  std::cout << "instruction->instr_args_type()" << static_cast<int>(instruction->instr_args_type()) << std::endl;
+  std::cout << "delegates_[delegate_idx].Execute Time:" << elapsed.count() << std::endl;
   return err;
 }
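
The per-instruction timing above emits one std::cout line per executed instruction. If the goal were a per-operator summary instead, the same measurements could be accumulated by op name; a hedged sketch of that idea, standard C++ only (record_op_time, dump_op_times, and g_op_time_us are illustrative names, not part of ExecuTorch or this commit):

    #include <chrono>
    #include <cstdio>
    #include <map>
    #include <string>

    // Accumulated microseconds per operator name (illustrative global).
    static std::map<std::string, long long> g_op_time_us;

    static void record_op_time(
        const std::string& op_name,
        std::chrono::high_resolution_clock::time_point begin,
        std::chrono::high_resolution_clock::time_point end) {
      auto us = std::chrono::duration_cast<std::chrono::microseconds>(end - begin);
      g_op_time_us[op_name] += us.count();
    }

    static void dump_op_times() {
      for (const auto& entry : g_op_time_us) {
        std::printf("%s: %lld us\n", entry.first.c_str(), entry.second);
      }
    }

    int main() {
      auto t0 = std::chrono::high_resolution_clock::now();
      auto t1 = std::chrono::high_resolution_clock::now();
      record_op_time("aten::add.out", t0, t1);  // illustrative op name
      dump_op_times();
      return 0;
    }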
