@@ -24,7 +24,8 @@ TextPrefiller::TextPrefiller(
2424 : text_decoder_runner_(text_decoder_runner),
2525 use_kv_cache_ (use_kv_cache),
2626 enable_parallel_prefill_(enable_parallel_prefill),
27- max_seq_len_(max_seq_len > 0 ? max_seq_len - 1 : 127 ) {} // -1 because for some reason tracing results in this upperbound
27+ max_seq_len_(max_seq_len > 0 ? max_seq_len - 1 : 127 ) {
28+ } // -1 because for some reason tracing results in this upperbound
2829
2930::executorch::runtime::Result<uint64_t > TextPrefiller::prefill (
3031 std::vector<uint64_t >& prompt_tokens,
@@ -33,33 +34,35 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
3334 if (!text_decoder_runner_->is_method_loaded ()) {
3435 ET_CHECK_OK_OR_RETURN_ERROR (text_decoder_runner_->load ());
3536 }
36-
37+
3738 // Check if we need to chunk the prompt tokens
3839 int32_t num_prompt_tokens = prompt_tokens.size ();
39-
40+
4041 // If prompt tokens exceed max_seq_len_, we need to chunk them
4142 if (num_prompt_tokens > max_seq_len_) {
4243 uint64_t cur_token = 0 ;
4344 int num_tokens_to_process = 0 ;
44-
45+
4546 while (num_tokens_to_process < num_prompt_tokens) {
46- auto num_tokens_to_prefill_with =
47- std::min<int >(num_prompt_tokens - num_tokens_to_process, max_seq_len_);
48-
49- std::vector<uint64_t > prompt_tokens_to_process (num_tokens_to_prefill_with);
47+ auto num_tokens_to_prefill_with = std::min<int >(
48+ num_prompt_tokens - num_tokens_to_process, max_seq_len_);
49+
50+ std::vector<uint64_t > prompt_tokens_to_process (
51+ num_tokens_to_prefill_with);
5052 std::copy (
5153 prompt_tokens.begin () + num_tokens_to_process,
52- prompt_tokens.begin () + num_tokens_to_process + num_tokens_to_prefill_with,
54+ prompt_tokens.begin () + num_tokens_to_process +
55+ num_tokens_to_prefill_with,
5356 prompt_tokens_to_process.begin ());
54-
57+
5558 // Process this chunk
5659 auto chunk_result = prefillChunk (prompt_tokens_to_process, start_pos);
5760 ET_CHECK_OK_OR_RETURN_ERROR (chunk_result.error ());
5861 cur_token = chunk_result.get ();
59-
62+
6063 num_tokens_to_process += num_tokens_to_prefill_with;
6164 }
62-
65+
6366 return cur_token;
6467 } else {
6568 // If prompt tokens don't exceed max_seq_len_, process them directly
0 commit comments