26 changes: 21 additions & 5 deletions examples/models/llama/runner/runner.cpp
@@ -11,6 +11,7 @@

 #include <executorch/examples/models/llama/runner/runner.h>

+#include <algorithm>
 #include <ctime>

 #include <executorch/extension/llm/runner/util.h>
@@ -221,11 +222,11 @@ Error Runner::generate(

   ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token");
   ET_CHECK_MSG(
-      num_prompt_tokens < metadata_.at(kMaxSeqLen),
+      num_prompt_tokens < metadata_.at(kMaxContextLen),
       "num_prompt_tokens %d >= max_seq_len_ %" PRId64
       ", Max seq length exceeded - please increase max seq len value in .../llama2/model.py",
       num_prompt_tokens,
-      metadata_.at(kMaxSeqLen));
+      metadata_.at(kMaxContextLen));
   ET_CHECK_MSG(
       num_prompt_tokens < seq_len,
       "num_prompt_tokens %d >= seq_len %d, Sequence length exceeded - please increase the seq_len value passed to generate()",
@@ -241,11 +242,26 @@ Error Runner::generate(

     wrapped_callback(prompt);
   }
   int64_t pos = 0;
-  auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos);
+  uint64_t cur_token;
+  int max_seq_len = metadata_.at(kMaxSeqLen) -
+      1; // -1 because for some reason tracing results in this upper bound
+  int num_tokens_to_process = 0;
+  while (num_tokens_to_process < num_prompt_tokens) {
+    auto num_tokens_to_prefill_with =
+        std::min(num_prompt_tokens - num_tokens_to_process, max_seq_len);
+    std::vector<uint64_t> prompt_tokens_to_process(num_tokens_to_prefill_with);
+    std::copy(
+        prompt_tokens.begin() + num_tokens_to_process,
+        prompt_tokens.begin() + num_tokens_to_process + num_tokens_to_prefill_with,
+        prompt_tokens_to_process.begin());
+    auto prefill_res =
+        text_prefiller_->prefill(prompt_tokens_to_process, pos);
+    ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
+    cur_token = prefill_res.get();
+    num_tokens_to_process += num_tokens_to_prefill_with;
+  }
   stats_.first_token_ms = llm::time_in_ms();
   stats_.prompt_eval_end_ms = llm::time_in_ms();
-  ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
-  uint64_t cur_token = prefill_res.get();

   // print the first token from prefill. No prev_token so use cur_token for it.
   wrapped_callback(
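The substance of the patch is chunked prefill: instead of handing the entire prompt to `TextPrefiller::prefill()` in one call, the runner now walks the prompt in slices of at most `max_seq_len` tokens, carrying `pos` and `cur_token` across iterations, so prompts longer than the model's maximum sequence length (but still under `kMaxContextLen`) can be prefilled. Below is a minimal standalone sketch of the same loop; `prefill_chunk()` is a hypothetical stand-in for `TextPrefiller::prefill()`, and the prompt size, chunk bound, and dummy return value are illustrative only:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for TextPrefiller::prefill(): consumes `tokens`
// starting at position `pos`, advances `pos` past them, and returns a
// placeholder "next token" prediction.
static uint64_t prefill_chunk(const std::vector<uint64_t>& tokens, int64_t& pos) {
  pos += static_cast<int64_t>(tokens.size());
  return tokens.back() + 1; // placeholder prediction
}

int main() {
  // A prompt longer than the model's maximum sequence length.
  std::vector<uint64_t> prompt_tokens(300);
  for (size_t i = 0; i < prompt_tokens.size(); ++i) {
    prompt_tokens[i] = i;
  }
  const int num_prompt_tokens = static_cast<int>(prompt_tokens.size());
  const int max_seq_len = 128 - 1; // mirrors the patch's "-1" upper bound

  int64_t pos = 0;
  uint64_t cur_token = 0;
  int num_tokens_to_process = 0;
  while (num_tokens_to_process < num_prompt_tokens) {
    // Never hand the prefiller more than max_seq_len tokens at once.
    const int chunk_len =
        std::min(num_prompt_tokens - num_tokens_to_process, max_seq_len);
    std::vector<uint64_t> chunk(
        prompt_tokens.begin() + num_tokens_to_process,
        prompt_tokens.begin() + num_tokens_to_process + chunk_len);
    cur_token = prefill_chunk(chunk, pos);
    num_tokens_to_process += chunk_len;
  }
  // 300 tokens with a bound of 127 -> chunks of 127, 127, and 46 tokens.
  std::printf("prefilled %d tokens, pos=%lld, next token=%llu\n",
              num_tokens_to_process, static_cast<long long>(pos),
              static_cast<unsigned long long>(cur_token));
  return 0;
}
```

With a 300-token prompt and a 127-token bound, the loop issues chunks of 127, 127, and 46 tokens; only the last chunk's returned token seeds decoding, which matches how the patch overwrites `cur_token` on every iteration and reads it after the loop.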