Commit 660ef77

digantdesai authored and facebook-github-bot committed
Add warmup for Llama (#5756)
Summary:
Load the model, then run everything twice, resetting the stats between the two runs. Also decrease the logging level during the warmup run.

Notes: Tested on Android and Mac, with Llama2 and Llama3; with temperature=0 it produces the same output. This warmup option is disabled by default. It is inspired by the llama.cpp options [[1](https://github.com/ggerganov/llama.cpp/blob/ea9c32be71b91b42ecc538bd902e93cbb5fb36cb/common/common.cpp#L897-L929), [2](https://github.com/ggerganov/llama.cpp/blob/ea9c32be71b91b42ecc538bd902e93cbb5fb36cb/examples/llama-bench/llama-bench.cpp#L1595-L1602)]. Sample [runs](https://www.internalfb.com/phabricator/paste/view/P1613261035).

Pull Request resolved: pytorch/executorch#5756

Reviewed By: mcr229, metascroy

Differential Revision: D63642723

Pulled By: digantdesai

fbshipit-source-id: 39ff257eda182fff423f90582a9f32387cfdb253
1 parent 26dc9fd commit 660ef77
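
How the pieces below fit together: the sketch condenses the flow that main.cpp gains from this commit, using the names from the diffs that follow. The include path, the placeholder model/tokenizer paths, the prompt, and the literal values are illustrative assumptions; the real binary reads all of them from gflags, and its threadpool setup is omitted here.

```cpp
// Minimal sketch (not the full example binary) of the warmup-then-generate flow.
#include <cstdint>
#include <string>

#include <gflags/gflags.h>

// Assumed include path for the example runner shown in the diffs below.
#include <executorch/examples/models/llama2/runner/runner.h>

DEFINE_bool(warmup, false, "Whether to run a warmup run.");

int32_t main(int32_t argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  // Placeholder values; the real main.cpp takes these from flags as well.
  const std::string model_path = "llama2.pte";
  const std::string tokenizer_path = "tokenizer.bin";
  const std::string prompt = "Once";
  const float temperature = 0.0f;
  const int32_t seq_len = 10;

  example::Runner runner(model_path, tokenizer_path, temperature);

  // Optional warmup: generate once with demoted logging and suppressed token
  // printing, then reset the collected stats (see Runner::warmup below).
  if (FLAGS_warmup) {
    runner.warmup(prompt, seq_len);
  }

  // The measured run; only its stats end up in print_report().
  runner.generate(prompt, seq_len);
  return 0;
}
```

The extra pass exists so that one-time costs (model load side effects, allocator and threadpool spin-up) fall into the discarded warmup stats rather than into the numbers reported for the measured run.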

6 files changed: +76 / -16 lines changed

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 1 deletion
@@ -213,7 +213,7 @@ echo "Creating tokenizer.bin"
 $PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
 
 
-RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10"
+RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10 --warmup=1"
 # Check build tool.
 echo "Running ${EXPORTED_MODEL_NAME} in portable mode"
 if [[ "${BUILD_TOOL}" == "buck2" ]]; then

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 .hypothesis
 buck-out/
-cmake-out/
+cmake-out*
+.DS_Store
 cmake-android-out/
 cmake-out-android/
 cmake-ios-out/

examples/models/llama2/main.cpp

Lines changed: 7 additions & 0 deletions
@@ -39,6 +39,8 @@ DEFINE_int32(
     -1,
     "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
 
+DEFINE_bool(warmup, false, "Whether to run a warmup run.");
+
 int32_t main(int32_t argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
@@ -57,6 +59,8 @@ int32_t main(int32_t argc, char** argv) {
 
   int32_t cpu_threads = FLAGS_cpu_threads;
 
+  bool warmup = FLAGS_warmup;
+
 #if defined(ET_USE_THREADPOOL)
   uint32_t num_performant_cores = cpu_threads == -1
       ? torch::executorch::cpuinfo::get_num_performant_cores()
@@ -71,6 +75,9 @@ int32_t main(int32_t argc, char** argv) {
   // create llama runner
   example::Runner runner(model_path, tokenizer_path, temperature);
 
+  if (warmup) {
+    runner.warmup(prompt, seq_len);
+  }
   // generate
   runner.generate(prompt, seq_len);
 
examples/models/llama2/runner/runner.cpp

Lines changed: 48 additions & 13 deletions
@@ -146,12 +146,21 @@ Error Runner::load() {
   return Error::Ok;
 }
 
+// Don't print with the same priority during warmup
+#define RUNNER_ET_LOG(warmup, format, ...) \
+  if (warmup) {                            \
+    ET_LOG(Debug, format, __VA_ARGS__);    \
+  } else {                                 \
+    ET_LOG(Info, format, __VA_ARGS__);     \
+  }
+
 Error Runner::generate(
     const std::string& prompt,
     int32_t seq_len,
     std::function<void(const std::string&)> token_callback,
     std::function<void(const llm::Stats&)> stats_callback,
-    bool echo) {
+    bool echo,
+    bool warmup) {
   // Prepare the inputs.
   // Use ones-initialized inputs.
   ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
@@ -161,16 +170,22 @@ Error Runner::generate(
     stats_.model_load_end_ms = llm::time_in_ms();
   }
 
-  ET_LOG(
-      Info,
+  if (warmup) {
+    ET_LOG(Info, "Doing a warmup run...");
+  }
+
+  RUNNER_ET_LOG(
+      warmup,
       "RSS after loading model: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);
 
   // Wrap the token_callback with print function
   std::function<void(const std::string&)> wrapped_callback =
-      [token_callback](const std::string& piece) {
-        llm::safe_printf(piece.c_str());
-        fflush(stdout);
+      [token_callback, warmup](const std::string& piece) {
+        if (!warmup) {
+          llm::safe_printf(piece.c_str());
+          fflush(stdout);
+        }
         if (token_callback) {
           token_callback(piece);
         }
@@ -228,8 +243,8 @@ Error Runner::generate(
 
   // print the first token from prefill. No prev_token so use cur_token for it.
   wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token)));
-  ET_LOG(
-      Info,
+  RUNNER_ET_LOG(
+      warmup,
       "RSS after prompt prefill: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);
 
@@ -239,26 +254,46 @@ Error Runner::generate(
       prompt_tokens, num_prompt_tokens, seq_len, wrapped_callback));
 
   stats_.inference_end_ms = llm::time_in_ms();
-  printf("\n");
-  ET_LOG(
-      Info,
+  if (!warmup) {
+    printf("\n");
+  }
+  RUNNER_ET_LOG(
+      warmup,
       "RSS after finishing text generation: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);
 
   if (num_prompt_tokens + num_generated_tokens == seq_len) {
-    ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
+    RUNNER_ET_LOG(warmup, "Sequence length (%i tokens) reached!", seq_len);
   }
 
   stats_.num_prompt_tokens = num_prompt_tokens;
   stats_.num_generated_tokens = num_generated_tokens;
-  ::executorch::llm::print_report(stats_);
+
+  if (warmup) {
+    ET_LOG(Info, "Warmup run finished!");
+  } else {
+    // Do not print report during warmup
+    ::executorch::llm::print_report(stats_);
+  }
   if (stats_callback) {
     stats_callback(stats_);
   }
 
   return Error::Ok;
 }
 
+Error Runner::warmup(const std::string& prompt, int32_t seq_len) {
+  Error err = generate(
+      prompt,
+      seq_len,
+      /*token_callback=*/nullptr,
+      /*stats_callbak=*/nullptr,
+      /*echo=*/false,
+      /*warmup=*/true);
+  stats_.reset();
+  return err;
+}
+
 void Runner::stop() {
   if (is_loaded()) {
     text_token_generator_->stop();

examples/models/llama2/runner/runner.h

Lines changed: 5 additions & 1 deletion
@@ -41,7 +41,11 @@ class Runner {
       std::function<void(const std::string&)> token_callback = {},
       std::function<void(const ::executorch::extension::llm::Stats&)>
           stats_callback = {},
-      bool echo = true);
+      bool echo = true,
+      bool warming = false);
+  ::executorch::runtime::Error warmup(
+      const std::string& prompt,
+      int32_t seq_len = 128);
   void stop();
 
  private:

extension/llm/runner/stats.h

Lines changed: 13 additions & 0 deletions
@@ -52,6 +52,19 @@ struct Stats {
     aggregate_sampling_timer_start_timestamp = 0;
   }
 
+  void reset() {
+    model_load_start_ms = 0;
+    model_load_end_ms = 0;
+    inference_start_ms = 0;
+    prompt_eval_end_ms = 0;
+    first_token_ms = 0;
+    inference_end_ms = 0;
+    aggregate_sampling_time_ms = 0;
+    num_prompt_tokens = 0;
+    num_generated_tokens = 0;
+    aggregate_sampling_timer_start_timestamp = 0;
+  }
+
  private:
   long aggregate_sampling_timer_start_timestamp = 0;
 };
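
As a side note on the reset() added above: the toy program below is a simplified stand-in (not the ExecuTorch Stats type) that illustrates the pattern described in the commit summary, namely that the warmup pass fills the counters, reset() clears them, and only the second pass contributes to what gets reported.

```cpp
#include <chrono>
#include <cstdio>

// Simplified stand-in for executorch::extension::llm::Stats (illustration only).
struct ToyStats {
  long inference_start_ms = 0;
  long inference_end_ms = 0;
  long num_generated_tokens = 0;

  void reset() {
    inference_start_ms = 0;
    inference_end_ms = 0;
    num_generated_tokens = 0;
  }
};

static long now_ms() {
  using namespace std::chrono;
  return static_cast<long>(
      duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count());
}

// Stand-in for a generation pass that populates the stats, as Runner::generate() does.
static void fake_generate(ToyStats& stats, long tokens) {
  stats.inference_start_ms = now_ms();
  stats.num_generated_tokens += tokens;
  stats.inference_end_ms = now_ms();
}

int main() {
  ToyStats stats;

  fake_generate(stats, 10);  // warmup pass; its numbers are intentionally discarded
  stats.reset();             // what Runner::warmup() does after its generate() call

  fake_generate(stats, 10);  // measured pass
  std::printf("generated %ld tokens in %ld ms\n",
              stats.num_generated_tokens,
              stats.inference_end_ms - stats.inference_start_ms);
  return 0;
}
```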
