diff --git a/examples/models/llama/runner/static_attention_io_manager.h b/examples/models/llama/runner/static_attention_io_manager.h
index e2d2bc40c60..06fbffbef83 100644
--- a/examples/models/llama/runner/static_attention_io_manager.h
+++ b/examples/models/llama/runner/static_attention_io_manager.h
@@ -586,12 +586,12 @@ class StaticAttentionIOManager {
    * of the prompt and method's input length. Returns the position in the output
    * that corresponds to the end of the prompt during the last inference.
    */
-  template <typename TokenT>
+  template <typename TokenT, typename LogitT = float>
   size_t prefill(
       executorch::runtime::Span<TokenT> tokens,
       executorch::runtime::Span<TokenT> input_buffer,
       executorch::runtime::Method& method,
-      std::function<void(executorch::runtime::Span<const float>)>
+      std::function<void(executorch::runtime::Span<const LogitT>)>
           logits_callback = nullptr) {
     ET_LOG(Info, "Prefilling at position %zu", input_pos_);
     size_t input_len = input_buffer.size();
@@ -619,7 +619,7 @@ class StaticAttentionIOManager {
         batch_len);
     if (logits_callback) {
       auto logits_tensor = method.get_output(0).toTensor();
-      auto* logits = logits_tensor.const_data_ptr<float>();
+      auto* logits = logits_tensor.const_data_ptr<LogitT>();
       logits_callback(executorch::runtime::Span(
           logits,
           logits + batch_len * logits_tensor.size(logits_tensor.dim() - 1)));
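
Illustrative usage (not part of the patch): with the logit element type templated as reconstructed above, a caller can receive non-float logits through the `prefill` callback. Note the template parameter name `LogitT` and its `float` default are inferred from the surrounding context; `io_mgr`, `method`, `prompt`, `input_buf`, and the `uint16_t` logit type (e.g. bf16 logits stored as raw 16-bit words) are hypothetical.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical caller sketch; assumes an initialized StaticAttentionIOManager
// `io_mgr`, a loaded Method `method`, and int32 token buffers as used
// elsewhere in the llama runner example.
std::vector<uint16_t> all_logits; // logits accumulated across inference batches
size_t end_pos = io_mgr.prefill<int32_t, uint16_t>(
    executorch::runtime::Span<int32_t>(prompt.data(), prompt.size()),
    executorch::runtime::Span<int32_t>(input_buf.data(), input_buf.size()),
    method,
    [&](executorch::runtime::Span<const uint16_t> logits) {
      // Invoked once per inference batch with batch_len * vocab_size values.
      all_logits.insert(all_logits.end(), logits.begin(), logits.end());
    });
```

If the reconstructed `= float` default is correct, existing callers that omit the second template argument keep the previous float-logits behavior unchanged.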